## Roberta Pretrained Model

In [None]:
# Download necessary libraries
# !pip install transformers

# Importing necessary libraries 
import os
import re
import pandas as pd
import numpy as np
import string
import pandas as pd
import pickle
import gc
from tqdm import tqdm
from os import name
gc.collect()

# Model Creation and testing
import torch
import torch.nn as nn
from torch.utils import data
from  torch.utils.data  import Dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from torch.nn.modules import padding
from scipy.sparse import data
from transformers import AutoModelForSequenceClassification
from transformers import create_optimizer
from transformers import AdamW

# Sklearn 
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

# Xgboost Classifier
from xgboost import XGBClassifier

# Natural Language Processing (NLP)
import nltk
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus so that stopwords.words() below can read it
nltk.download('stopwords')
# De-duplicated English stop words, kept as a list; the text cleaning step filters tokens against it
stop_word =list(set(stopwords.words("english")))

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Map each class name to the integer label used for classification
labels = {"fake":0, "true":1}

# Read csv files
def read_data(file1):
    """
    Load a CSV file and return only its "text" column.

    ``file1`` is forwarded unchanged to ``pd.read_csv``. The result is the
    single "text" column (a pandas Series), not the whole dataframe.
    """
    frame = pd.read_csv(file1)
    text_column = frame["text"]
    return text_column

# Construct dataframe
def construct_dataframe(df, type):
    """
    Attach a constant "Labels" column to ``df``.

    Parameters
    ----------
    df : pandas Series or DataFrame holding the rows to label.
    type : key into the module-level ``labels`` mapping ("fake" or "true").
        The name shadows the builtin but is kept so existing keyword
        callers continue to work.

    Returns
    -------
    DataFrame with the original column(s) followed by a "Labels" column
    holding ``labels[type]`` on every row.

    Raises
    ------
    KeyError if ``type`` is not a key of ``labels``.
    """
    # Build the label column on df's own index. A fresh 0..n-1 index would
    # make pd.concat(axis=1) align by label and pad with NaN whenever df is
    # filtered, shuffled or otherwise not range-indexed.
    label_column = pd.Series(labels[type], index=df.index, name="Labels")
    return pd.concat([df, label_column], axis=1)

# Preprocess the dataset
def preprocessing(df):
    """Clean the raw "text" column and return it as "new_sentence".

    Techniques
    ----------
        Lower-case the text and split it on whitespace.
        Remove English stop words (module-level ``stop_word``).
        Remove tokens that are http(s) links.
        Remove tokens that contain a ".com" or ".de" domain.
        Remove punctuation.
        Remove words of length 2 or less.

    Parameters
    ----------
    df : DataFrame with a "text" column. It is not modified.

    Returns
    -------
    A new DataFrame without "text" and with a "new_sentence" column that
    holds the cleaned, space-joined tokens of every row. Missing text
    becomes an empty string.
    """
    # Set membership is O(1); the module-level stop_word is a list.
    stop_words = set(stop_word)
    strip_punctuation = str.maketrans('', '', string.punctuation)
    link_pattern = re.compile(r'^https?://.*')
    # Match ".com" / ".de" only as a real domain suffix. The previous check
    # (`not "com" in last or "de" in last`) parsed as `(not A) or B` and ran
    # a substring test on dot-less words too, so ordinary words such as
    # "company" were dropped while "*.de" tokens were kept.
    domain_pattern = re.compile(r'\.(?:com|de)\b')

    cleaned_sentences = []
    for text in tqdm(df["text"], total=len(df)):
        raw = "" if pd.isna(text) else str(text)
        kept_words = []
        for word in raw.lower().split():
            if word in stop_words:
                continue
            word = link_pattern.sub('', word)
            if domain_pattern.search(word):
                continue
            word = word.translate(strip_punctuation)
            if len(word) > 2:
                kept_words.append(word)
        cleaned_sentences.append(" ".join(kept_words))

    # Assign the whole column at once. The previous chained
    # `df["new_sentence"].iloc[idx] = ...` raised SettingWithCopyWarning,
    # does nothing under pandas Copy-on-Write, and used the index label as
    # a position.
    result = df.drop("text", axis=1)
    result["new_sentence"] = cleaned_sentences
    return result

# Load the "text" column of each raw CSV and attach its class label
df_fake = construct_dataframe(read_data("data/Fake.csv"), "fake")
df_real = construct_dataframe(read_data("data/True.csv"), "true")

# Clean the text of each labelled dataframe
df_prep_fake = preprocessing(df_fake)
df_prep_true = preprocessing(df_real)

# Stack the cleaned fake and true rows into one dataframe
new_df = pd.concat([df_prep_fake, df_prep_true], axis=0)

# Shuffle all rows: the concatenated frame is ordered by class, so a later
# split of unshuffled data could leave one partition (e.g. validation) with a single class.
# NOTE(review): no random_state is passed, so the row order differs between runs.
new_df = new_df.sample(frac=1)
# Export the shuffled dataset to a new CSV for further analysis
new_df.to_csv("data/new_dataset.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
23481it [01:05, 355.84it/s]
21417it [00:56, 380.86it/s]


In [None]:
# Creating tokenize dataset
class dataset(Dataset):
    """
    Map-style dataset that tokenizes one dataframe row per lookup.

    The tokenizer is loaded from the "roberta-base" checkpoint. Every
    "new_sentence" value is truncated / padded to exactly 256 tokens and
    returned as PyTorch tensors together with the row's "Labels" value.

    returns
    -------
    dict with "input_ids", "attention_mask" and "labels" for each index
    """

    def __init__(self, df: pd.DataFrame) -> None:
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("roberta-base")
        self.df = df

    def __len__(self):
        # One sample per dataframe row.
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        label, text = row["Labels"], row["new_sentence"]

        encoded = self.tokenizer(
            text,
            max_length=256,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        sample = {key: encoded[key] for key in ("input_ids", "attention_mask")}
        sample["labels"] = label
        return sample


In [None]:
# Split the dataset into stratified train / test CSV files (defaults: 70% train, 30% test)
def split_dataset(filename, max_rows=20000, test_size=0.3,
                  train_path="data/train.csv", test_path="data/test.csv",
                  random_state=None):
    """
    Read the cleaned dataset, split it by label and save both parts.

    Parameters
    ----------
    filename : CSV with "new_sentence" and "Labels" columns.
    max_rows : keep only the first ``max_rows`` rows for a faster run;
        ``None`` keeps every row.
    test_size : fraction (or count) of rows for the test split, forwarded
        to ``train_test_split``.
    train_path, test_path : where the two splits are written.
    random_state : optional seed forwarded to ``train_test_split`` for a
        reproducible split; ``None`` keeps the split random.

    Returns
    -------
    None. The splits are written to ``train_path`` and ``test_path``
    without the index column.
    """
    df = pd.read_csv(filename)
    if max_rows is not None:
        df = df.iloc[:max_rows]

    # Drop incomplete rows before splitting (e.g. an empty "new_sentence"
    # cell is read back from CSV as NaN) so that the stratified split keeps
    # the requested ratio on the rows that are actually saved.
    df = df[["new_sentence", "Labels"]].dropna()

    train_dataset, test_dataset = train_test_split(
        df,
        test_size=test_size,
        stratify=df["Labels"],
        random_state=random_state,
    )
    train_dataset.to_csv(train_path, index=False)  # save train split locally
    test_dataset.to_csv(test_path, index=False)  # save test split locally

# Examine F1_score and accuracy of the model
def f1_score_task(logits, ground_truth):
    """
    Return the F1 score (a float) of one batch of predictions.

    Parameters
    ----------
    logits : tensor of per-class scores; the class is taken along the last
        axis with argmax.
    ground_truth : tensor of true class ids, one per row of ``logits``.
    """
    predicted = logits.argmax(dim=-1).detach().cpu().numpy()
    expected = ground_truth.detach().cpu().numpy()
    # scikit-learn's contract is (y_true, y_pred); the arguments used to be
    # passed the other way round.
    return f1_score(expected, predicted)

def accuracy(logits, ground_truth):
    """
    Return the accuracy (a float) of one batch of predictions.

    Parameters
    ----------
    logits : tensor of per-class scores; the class is taken along the last
        axis with argmax.
    ground_truth : tensor of true class ids, one per row of ``logits``.
    """
    predicted = logits.argmax(dim=-1).detach().cpu().numpy()
    expected = ground_truth.detach().cpu().numpy()
    # Follow scikit-learn's (y_true, y_pred) argument order.
    return accuracy_score(expected, predicted)

# Model Training Loop
def main_loop(epochs=100, eval_every=20):
    """
    Fine-tune the pretrained "roberta-base" classifier and save its weights.

    Reads "data/train.csv" and "data/test.csv", trains with the AdamW
    optimizer (lr=5e-5) on ``device`` and writes the final state dict to
    "huggingfacemodel.pth".

    Parameters
    ----------
    epochs : number of passes over the training data (default 100).
    eval_every : evaluate on the test split and print loss / F1 / accuracy
        whenever ``epoch % eval_every == 0`` (default 20). A falsy value
        disables evaluation.
    """
    train = pd.read_csv("data/train.csv")
    test = pd.read_csv("data/test.csv")
    # dataset
    train_dataset = dataset(train)
    test_dataset = dataset(test)
    # dataloader
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)
    # Evaluation does not need shuffling; a fixed order keeps the evaluated
    # (non-dropped) batches identical from one evaluation to the next.
    test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False, drop_last=True)
    model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels = 2) # 768 * 2
    model.to(device)
    optim = AdamW(model.parameters(), lr=5e-5)

    for ep in tqdm(range(epochs)):
        # Switch back to training mode every epoch: the evaluation below puts
        # the model in eval mode, which would otherwise keep dropout disabled
        # for all later training epochs.
        model.train()
        total_loss = 0.0
        train_f1 = []
        train_acc = []
        for batch in tqdm(train_dataloader):
            optim.zero_grad()
            # The dataset yields (1, seq_len) tensors, so a batch is
            # (batch, 1, seq_len). Squeeze only that axis so a batch of
            # size 1 keeps its batch dimension.
            input_ids = batch["input_ids"].to(device).squeeze(1)
            attention_mask = batch["attention_mask"].to(device).squeeze(1)
            batch_labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask = attention_mask, labels = batch_labels)
            loss = outputs[0]
            loss.backward()
            optim.step()
            total_loss += loss.item()
            train_f1.append(f1_score_task(outputs.logits, batch_labels))
            train_acc.append(accuracy(outputs.logits, batch_labels))

        if eval_every and ep % eval_every == 0:
            model.eval()
            with torch.no_grad():
                total_test_loss = 0.0
                test_f1 = []
                test_acc = []

                for batch in tqdm(test_dataloader):
                    input_ids = batch["input_ids"].to(device).squeeze(1)
                    attention_mask = batch["attention_mask"].to(device).squeeze(1)
                    batch_labels = batch["labels"].to(device)
                    outputs = model(input_ids, attention_mask = attention_mask, labels = batch_labels)
                    total_test_loss += outputs[0].item()
                    test_f1.append(f1_score_task(outputs.logits, batch_labels))
                    test_acc.append(accuracy(outputs.logits, batch_labels))

                print(f'Train Loss {total_loss/len(train_dataloader)} and Test Loss {total_test_loss/ len(test_dataloader)}')
                print(f'Train F1 {np.array(train_f1).mean()} and Test F1 {np.array(test_f1).mean()}')
                print(f'Train Accuracy {np.array(train_acc).mean()} and Test Accuracy {np.array(test_acc).mean()}' )

    # save the model
    torch.save({
      "model_state": model.state_dict()
    }, "huggingfacemodel.pth")

# Write the train / test CSV files from the cleaned dataset, then fine-tune and save the model
split_dataset("data/new_dataset.csv")
main_loop()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Train Loss 0.05807167809093804 and Test Loss 0.0038858782430432493
Train F1 0.9772485124922706 and Test F1 0.9985093763866143
Train Accuracy 0.9760174418604651 and Test Accuracy 0.998641304347826



0it [00:00, ?it/s][A
1it [00:00,  1.27it/s][A
2it [00:01,  1.26it/s][A
3it [00:02,  1.27it/s][A
4it [00:03,  1.28it/s][A
5it [00:03,  1.27it/s][A
6it [00:04,  1.27it/s][A
7it [00:05,  1.27it/s][A
8it [00:06,  1.27it/s][A
9it [00:07,  1.27it/s][A
10it [00:07,  1.27it/s][A
11it [00:08,  1.27it/s][A
12it [00:09,  1.26it/s][A
13it [00:10,  1.26it/s][A
14it [00:11,  1.26it/s][A
15it [00:11,  1.27it/s][A
16it [00:12,  1.27it/s][A
17it [00:13,  1.27it/s][A
18it [00:14,  1.27it/s][A
19it [00:14,  1.26it/s][A
20it [00:15,  1.27it/s][A
21it [00:16,  1.27it/s][A
22it [00:17,  1.27it/s][A
23it [00:18,  1.27it/s][A
24it [00:18,  1.27it/s][A
25it [00:19,  1.27it/s][A
26it [00:20,  1.27it/s][A
27it [00:21,  1.27it/s][A
28it [00:22,  1.27it/s][A
29it [00:22,  1.27it/s][A
30it [00:23,  1.27it/s][A
31it [00:24,  1.27it/s][A
32it [00:25,  1.27it/s][A
33it [00:25,  1.27it/s][A
34it [00:26,  1.27it/s][A
35it [00:27,  1.27it/s][A
36it [00:28,  1.27it/s][A
37it [00:29,  

Train Loss 2.438363772789883e-06 and Test Loss 0.003356436834542859
Train F1 1.0 and Test F1 0.9990878686530861
Train Accuracy 1.0 and Test Accuracy 0.9993206521739131



0it [00:00, ?it/s][A
1it [00:00,  1.25it/s][A
2it [00:01,  1.26it/s][A
3it [00:02,  1.26it/s][A
4it [00:03,  1.26it/s][A
5it [00:03,  1.27it/s][A
6it [00:04,  1.27it/s][A
7it [00:05,  1.27it/s][A
8it [00:06,  1.27it/s][A
9it [00:07,  1.27it/s][A
10it [00:07,  1.27it/s][A
11it [00:08,  1.27it/s][A
12it [00:09,  1.27it/s][A
13it [00:10,  1.26it/s][A
14it [00:11,  1.27it/s][A
15it [00:11,  1.27it/s][A
16it [00:12,  1.27it/s][A
17it [00:13,  1.27it/s][A
18it [00:14,  1.26it/s][A
19it [00:14,  1.26it/s][A
20it [00:15,  1.27it/s][A
21it [00:16,  1.27it/s][A
22it [00:17,  1.27it/s][A
23it [00:18,  1.27it/s][A
24it [00:18,  1.26it/s][A
25it [00:19,  1.26it/s][A
26it [00:20,  1.26it/s][A
27it [00:21,  1.26it/s][A
28it [00:22,  1.27it/s][A
29it [00:22,  1.27it/s][A
30it [00:23,  1.27it/s][A
31it [00:24,  1.27it/s][A
32it [00:25,  1.27it/s][A
33it [00:26,  1.27it/s][A
34it [00:26,  1.27it/s][A
35it [00:27,  1.27it/s][A
36it [00:28,  1.27it/s][A
37it [00:29,  

Train Loss 3.548728521542047e-07 and Test Loss 0.0038044104945956538
Train F1 1.0 and Test F1 0.9991862357591258
Train Accuracy 1.0 and Test Accuracy 0.9993206521739131



0it [00:00, ?it/s][A
1it [00:00,  1.27it/s][A
2it [00:01,  1.26it/s][A
3it [00:02,  1.26it/s][A
4it [00:03,  1.27it/s][A
5it [00:03,  1.26it/s][A
6it [00:04,  1.26it/s][A
7it [00:05,  1.26it/s][A
8it [00:06,  1.26it/s][A
9it [00:07,  1.27it/s][A
10it [00:07,  1.27it/s][A
11it [00:08,  1.27it/s][A
12it [00:09,  1.27it/s][A
13it [00:10,  1.27it/s][A
14it [00:11,  1.27it/s][A
15it [00:11,  1.26it/s][A
16it [00:12,  1.27it/s][A
17it [00:13,  1.27it/s][A
18it [00:14,  1.27it/s][A
19it [00:15,  1.27it/s][A
20it [00:15,  1.26it/s][A
21it [00:16,  1.26it/s][A
22it [00:17,  1.26it/s][A
23it [00:18,  1.27it/s][A
24it [00:18,  1.27it/s][A
25it [00:19,  1.27it/s][A
26it [00:20,  1.27it/s][A
27it [00:21,  1.26it/s][A
28it [00:22,  1.26it/s][A
29it [00:22,  1.27it/s][A
30it [00:23,  1.27it/s][A
31it [00:24,  1.27it/s][A
32it [00:25,  1.27it/s][A
33it [00:26,  1.27it/s][A
34it [00:26,  1.27it/s][A
35it [00:27,  1.26it/s][A
36it [00:28,  1.26it/s][A
37it [00:29,  

Train Loss 7.658503163942861e-09 and Test Loss 0.004238959449423714
Train F1 1.0 and Test F1 0.9993606138107417
Train Accuracy 1.0 and Test Accuracy 0.9993206521739131



0it [00:00, ?it/s][A
1it [00:00,  1.26it/s][A
2it [00:01,  1.27it/s][A
3it [00:02,  1.27it/s][A
4it [00:03,  1.27it/s][A
5it [00:03,  1.27it/s][A
6it [00:04,  1.27it/s][A
7it [00:05,  1.27it/s][A
8it [00:06,  1.27it/s][A
9it [00:07,  1.27it/s][A
10it [00:07,  1.26it/s][A
11it [00:08,  1.26it/s][A
12it [00:09,  1.26it/s][A
13it [00:10,  1.26it/s][A
14it [00:11,  1.27it/s][A
15it [00:11,  1.27it/s][A
16it [00:12,  1.27it/s][A
17it [00:13,  1.27it/s][A
18it [00:14,  1.27it/s][A
19it [00:15,  1.27it/s][A
20it [00:15,  1.27it/s][A
21it [00:16,  1.27it/s][A
22it [00:17,  1.26it/s][A
23it [00:18,  1.27it/s][A
24it [00:18,  1.27it/s][A
25it [00:19,  1.27it/s][A
26it [00:20,  1.27it/s][A
27it [00:21,  1.27it/s][A
28it [00:22,  1.27it/s][A
29it [00:22,  1.27it/s][A
30it [00:23,  1.27it/s][A
31it [00:24,  1.27it/s][A
32it [00:25,  1.27it/s][A
33it [00:26,  1.27it/s][A
34it [00:26,  1.27it/s][A
35it [00:27,  1.27it/s][A
36it [00:28,  1.27it/s][A
37it [00:29,  

Train Loss 2.772308821784405e-10 and Test Loss 0.004527536551253486
Train F1 1.0 and Test F1 0.9992622467047019
Train Accuracy 1.0 and Test Accuracy 0.9993206521739131



0it [00:00, ?it/s][A
1it [00:00,  1.27it/s][A
2it [00:01,  1.27it/s][A
3it [00:02,  1.26it/s][A
4it [00:03,  1.26it/s][A
5it [00:03,  1.26it/s][A
6it [00:04,  1.26it/s][A
7it [00:05,  1.26it/s][A
8it [00:06,  1.27it/s][A
9it [00:07,  1.27it/s][A
10it [00:07,  1.26it/s][A
11it [00:08,  1.26it/s][A
12it [00:09,  1.26it/s][A
13it [00:10,  1.26it/s][A
14it [00:11,  1.27it/s][A
15it [00:11,  1.27it/s][A
16it [00:12,  1.27it/s][A
17it [00:13,  1.27it/s][A
18it [00:14,  1.26it/s][A
19it [00:15,  1.27it/s][A
20it [00:15,  1.27it/s][A
21it [00:16,  1.27it/s][A
22it [00:17,  1.27it/s][A
23it [00:18,  1.27it/s][A
24it [00:18,  1.26it/s][A
25it [00:19,  1.26it/s][A
26it [00:20,  1.26it/s][A
27it [00:21,  1.27it/s][A
28it [00:22,  1.27it/s][A
29it [00:22,  1.26it/s][A
30it [00:23,  1.26it/s][A
31it [00:24,  1.27it/s][A
32it [00:25,  1.27it/s][A
33it [00:26,  1.27it/s][A
34it [00:26,  1.27it/s][A
35it [00:27,  1.27it/s][A
36it [00:28,  1.27it/s][A
37it [00:29,  

In [None]:
# ### Result
# # Roberta-base model iteration result/ summary with model train test loss, f1_score and accuracy 
# """
# 1%|          | 1/100 [03:18<5:27:39, 198.58s/it]Train Loss 0.05807167809093804 and Test Loss 0.0038858782430432493
# Train F1 0.9772485124922706 and Test F1 0.9985093763866143
# Train Accuracy 0.9760174418604651 and Test Accuracy 0.998641304347826

# 21%|██        | 21/100 [1:00:22<3:55:32, 178.89s/it]Train Loss 2.438363772789883e-06 and Test Loss 0.003356436834542859
# Train F1 1.0 and Test F1 0.9990878686530861
# Train Accuracy 1.0 and Test Accuracy 0.9993206521739131

# 41%|████      | 41/100 [1:57:30<2:55:59, 178.97s/it]Train Loss 3.548728521542047e-07 and Test Loss 0.0038044104945956538
# Train F1 1.0 and Test F1 0.9991862357591258
# Train Accuracy 1.0 and Test Accuracy 0.9993206521739131

# 61%|██████    | 61/100 [2:54:39<1:56:17, 178.91s/it]Train Loss 7.658503163942861e-09 and Test Loss 0.004238959449423714
# Train F1 1.0 and Test F1 0.9993606138107417
# Train Accuracy 1.0 and Test Accuracy 0.9993206521739131

# 81%|████████  | 81/100 [3:51:51<56:41, 179.04s/it]Train Loss 2.772308821784405e-10 and Test Loss 0.004527536551253486
# Train F1 1.0 and Test F1 0.9992622467047019
# Train Accuracy 1.0 and Test Accuracy 0.9993206521739131

# 100%|██████████| 100/100 [4:45:44<00:00, 171.45s/it]

# """