### RoBERTa Pre-processing & Fine-Tuning

In [1]:
#importing libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from sklearn.model_selection import train_test_split


In [2]:
#checking for GPU
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'


In [3]:
#defining library for encoding the label
encoded_label_dict = {"CG" : 0, "OR" : 1}
def encode_label(x):
    return encoded_label_dict.get(x,-1)

In [4]:
#reading the dummy data
df = pd.read_csv("TrainingDataSet.csv")

In [5]:
#creating column "target" for encoded label result
df["target"] = df["label"].apply(lambda x: encode_label(x))

In [6]:
#configuring roberta base model
model_name = "roberta-base"
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 1
LEARNING_RATE = 1e-05

In [7]:
#initializing tokenizer
tokenizer = RobertaTokenizer.from_pretrained(model_name)



In [8]:
#preparing data for training by tokenizing text and creating input tensors
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.text_[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.target[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [9]:
# Creating the dataset and dataloader
train_dataset, valid_dataset = train_test_split(df, test_size=0.2, shuffle=True, stratify=None, random_state=2021)
train_dataset = train_dataset.reset_index(drop=True)
valid_dataset = valid_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALID Dataset: {}".format(valid_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(valid_dataset, tokenizer, MAX_LEN)

FULL Dataset: (40432, 5)
TRAIN Dataset: (32345, 5)
VALID Dataset: (8087, 5)


In [10]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **valid_params)

In [11]:
model = RobertaForSequenceClassification.from_pretrained(model_name)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [12]:
# Creating the optimizer
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [13]:
# Function to calcuate the accuracy of the model
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [14]:
# Defining the training function on the 80% of the dataset for tuning the roberta model
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        
        optimizer.zero_grad()
        outputs = model(ids, attention_mask=mask, labels=targets)
        loss = outputs.loss
        logits = outputs.logits
        tr_loss += loss
        big_val, big_idx = torch.max(logits, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _!=0 and _%100==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 100 steps: {loss_step}")
            print(f"Training Accuracy per 100 steps: {accu_step}")

        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [15]:
# Defining the validation function on the 20% of the dataset for testing the roberta model
def valid(model, testing_loader):
    model.eval()
    n_correct = 0
    n_wrong = 0
    total = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, attention_mask=mask, labels=targets)
            loss = outputs.loss
            logits = outputs.logits
            tr_loss += loss
            big_val, big_idx = torch.max(logits, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _!=0 and _%100==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu


In [16]:
tokenizer.pad_token_id

1

In [17]:
for epoch in range(EPOCHS):
    train(epoch)



Training Loss per 100 steps: 0.6204878687858582
Training Accuracy per 100 steps: 64.85148514851485
Training Loss per 100 steps: 0.5184564590454102
Training Accuracy per 100 steps: 72.76119402985074
Training Loss per 100 steps: 0.41927778720855713
Training Accuracy per 100 steps: 79.11129568106313
Training Loss per 100 steps: 0.36729440093040466
Training Accuracy per 100 steps: 82.07605985037407
Training Loss per 100 steps: 0.3251354694366455
Training Accuracy per 100 steps: 84.3063872255489
Training Loss per 100 steps: 0.3007788062095642
Training Accuracy per 100 steps: 85.64891846921797
Training Loss per 100 steps: 0.2724084258079529
Training Accuracy per 100 steps: 87.14336661911555
Training Loss per 100 steps: 0.25458914041519165
Training Accuracy per 100 steps: 88.04619225967541
Training Loss per 100 steps: 0.2413390725851059
Training Accuracy per 100 steps: 88.73473917869035
Training Loss per 100 steps: 0.2317320704460144
Training Accuracy per 100 steps: 89.31068931068931
Training

In [18]:
acc = valid(model, testing_loader)
print("Accuracy on validation data = %0.2f%%" % acc)

Validation Loss per 100 steps: 0.10914777219295502
Validation Accuracy per 100 steps: 96.41089108910892
Validation Loss per 100 steps: 0.1075654849410057
Validation Accuracy per 100 steps: 96.20646766169155
Validation Loss per 100 steps: 0.09874396026134491
Validation Accuracy per 100 steps: 96.30398671096346
Validation Loss per 100 steps: 0.1022157371044159
Validation Accuracy per 100 steps: 96.25935162094763
Validation Loss per 100 steps: 0.09976135194301605
Validation Accuracy per 100 steps: 96.20758483033931
Validation Loss per 100 steps: 0.09803064167499542
Validation Accuracy per 100 steps: 96.21464226289517
Validation Loss per 100 steps: 0.09710319340229034
Validation Accuracy per 100 steps: 96.27318116975749
Validation Loss per 100 steps: 0.09768930822610855
Validation Accuracy per 100 steps: 96.28589263420724
Validation Loss per 100 steps: 0.10152488946914673
Validation Accuracy per 100 steps: 96.1431742508324
Validation Loss per 100 steps: 0.10288810729980469
Validation Accur

In [20]:
# Save the model weights
output_model_file = 'C:/Users/Afzal Sufiya/Documents/MasterThesis/Review-Classification/FakeRoberta/roberta-pretrained1.pt'

model_to_save=model
torch.save(model_to_save, output_model_file)

print('All files saved')

All files saved


: 

### Inference

In [1]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import torch

In [4]:
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)



In [5]:
#loading saved weights
model = torch.load('C:/Users/Afzal Sufiya/Documents/MasterThesis/Review-Classification/FakeRoberta/roberta-pretrained.pt')

In [6]:
device="cuda"
query = """I work in the wedding industry and have to work long days, on my feet, outside in the heat, and have to look professional. I've spent a ridiculous amount of money on high end dress shoes like Merrels and just have not been able to find a pair that are comfortable to wear all day. Both for my feet and my back. Enter the Sanuk yoga sling!!! These shoes are amazingly comfortable. Though, I will admit it took a few wears to get used to the feel of the yoga matte bottom. At first, it felt a little "sticky" to me, and the fabric part that goes through the toe area was a little thick and took some getting used to. I wore them for a few days before taking them out on a job and I can't get over how comfortable they are. Ii have been wearing these shoes now for 3 months, every work day and I am THRILLED. No more back pain, no more sore feet. I also wear these sometimes during my off time,mans every time I wear them, I get compliments on how cute and comfortable they look. The great thing about these shoes is the yoga matte bottom. It helps your feet grip to the shoe a bit, so your foot can just walk normally, without having to grip the shoe. You may not realize it, but with a lot of Sandals, your foot is having to work to keep the shoe on, changing the way you walk and stand and ultimately causing foot and back pain. Not with these! Also, the soft linen sits comfortably on your skin and breathes nicely in the heat. The only downside is the funky tan lines, which is why I am sure to alternate shoes on my days off, especially if I plan to be outside for most of the day. If it were not for that, I think these might be the only shoes I'd wear all summer. If you are looking for a reasonable priced, comfortable shoe that you can wear and walk in all day."""
tokens = tokenizer.encode(query,return_tensors="pt")
all_tokens = len(tokens)
mask = torch.ones_like(tokens)

with torch.no_grad():
    logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
    probs = logits.softmax(dim=-1)

fake, real = probs.detach().cpu().flatten().numpy().tolist()

print(f"Real Probability: {real}\nFake Probability: {fake}")

Real Probability: 0.9975130558013916
Fake Probability: 0.0024869043845683336


In [7]:
device="cuda"
query = """My old bet was wearing this to the Macy's in January.  This is the first one I've ever had.  I am a 32D, and the first pair I bought were just a little tight.  I'm a bit disappointed.  This is my second pair.  I'm looking forward to wearing them to the Macy's in the fall.  I like the way they look.Love these!These are my favorite.  I have a hard time finding jeans that fit me comfortably, but I have a hard time finding jeans that don't fit.  These jeans are super comfortable and have a great price point.  I have some great jeans to wear for work, but these are the only jeans that I wear for work or for my family.  I will be buying more!  I have a lot of compliments on them.I love these shoes. I love the color and the fit. They fit my body well and are comfortable. I have a wide foot and these fit me well.

I'm 5'4", 130lbs and these fit well. I would recommend them.I wear a size 11.5 in jeans and this fits perfect. I have a narrow foot and this fits perfect. It is very comfortable and fits great. I bought a small and it fit perfectly. I will order another size up.I bought these for my husband, he loves them and he loves them!This is the best pair of sunglasses for the price!  They are so comfortable and easy to use.  I wear them all the time and they don't hurt my feet.  I wear them everyday and my feet are so happy with them!"""
tokens = tokenizer.encode(query,return_tensors="pt")
all_tokens = len(tokens[0])
mask = torch.ones_like(tokens)

with torch.no_grad():
    logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
    probs = logits.softmax(dim=-1)

fake, real = probs.detach().cpu().flatten().numpy().tolist()

print(f"Real Probability: {real}\nFake Probability: {fake}")

Real Probability: 0.0002426303835818544
Fake Probability: 0.9997573494911194


In [8]:
def predict(query, model, tokenizer, device="cuda"):
    tokens = tokenizer.encode(query)
    all_tokens = len(tokens)
    tokens = tokens[:tokenizer.model_max_length - 2]
    used_tokens = len(tokens)
    tokens = torch.tensor([tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]).unsqueeze(0)
    mask = torch.ones_like(tokens)

    with torch.no_grad():
        logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
        probs = logits.softmax(dim=-1)

    fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real

In [9]:
query = """I work in the wedding industry and have to work long days, on my feet, outside in the heat, and have to look professional. I've spent a ridiculous amount of money on high end dress shoes like Merrels and just have not been able to find a pair that are comfortable to wear all day. Both for my feet and my back. Enter the Sanuk yoga sling!!! These shoes are amazingly comfortable. Though, I will admit it took a few wears to get used to the feel of the yoga matte bottom. At first, it felt a little "sticky" to me, and the fabric part that goes through the toe area was a little thick and took some getting used to. I wore them for a few days before taking them out on a job and I can't get over how comfortable they are. Ii have been wearing these shoes now for 3 months, every work day and I am THRILLED. No more back pain, no more sore feet. I also wear these sometimes during my off time,mans every time I wear them, I get compliments on how cute and comfortable they look. The great thing about these shoes is the yoga matte bottom. It helps your feet grip to the shoe a bit, so your foot can just walk normally, without having to grip the shoe. You may not realize it, but with a lot of Sandals, your foot is having to work to keep the shoe on, changing the way you walk and stand and ultimately causing foot and back pain. Not with these! Also, the soft linen sits comfortably on your skin and breathes nicely in the heat. The only downside is the funky tan lines, which is why I am sure to alternate shoes on my days off, especially if I plan to be outside for most of the day. If it were not for that, I think these might be the only shoes I'd wear all summer. If you are looking for a reasonable priced, comfortable shoe that you can wear and walk in all day."""
predict(query,model,tokenizer)

0.9975046515464783

### Model Evalution

In [10]:
preds, preds_probas = [],[]
for i, row in valid_dataset.iterrows():
    query = row["text_"]
    pred = predict(query,model,tokenizer)
    preds_probas.append(pred)
    if pred >= 0.5:
        preds.append(1)
    else:
        preds.append(0)

NameError: name 'valid_dataset' is not defined

In [31]:
from sklearn.metrics import confusion_matrix
y_true = valid_dataset.target.values
y_pred = preds
confusion_matrix(y_true,y_pred)

array([[3935,   75],
       [ 181, 3896]], dtype=int64)

In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
acc = accuracy_score(y_true,y_pred)
precision = precision_score(y_true,y_pred)
recall = recall_score(y_true,y_pred)

In [33]:
print(f"Accuracy: {acc*100}; Precision:{precision*100}; Recall:{recall*100}")

Accuracy: 96.83442562136763; Precision:98.1113069755729; Recall:95.56046112337503


In [1]:
print(classification_report(y_true, y_pred, target_names=["CG","OR"]))

NameError: name 'classification_report' is not defined

### Predictions on Actual Dataset with trained model

In [None]:
import sqlite3

In [None]:
# Load the trained model weights
model_path = 'C:/Users/Afzal Sufiya/Documents/MasterThesis/Review-Classification/FakeRoberta/roberta-pretrained1.pt'
model = torch.load(model_path)
model.to(device)
model.eval()

In [None]:
# Function for loading data from database
def load_data_from_db(db_path, table_name):
    conn = sqlite3.connect(db_path)
    query = f"SELECT * FROM {table_name}"
    df = pd.read_sql(query, conn)
    conn.close()
    return df

# Function for saving data back to database
def save_predictions_to_db(db_path, table_name, df, result_column="result"):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    
    # Add a new column if it doesn't exist
    cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {result_column} REAL")
    
    # Update each row with the prediction
    for index, row in df.iterrows():
        cursor.execute(f"UPDATE {table_name} SET {result_column} = ? WHERE id = ?", (row[result_column], index + 1))
    
    conn.commit()
    conn.close()

# Loading the database
db_path = "C:/Users/Afzal Sufiya/Documents/MasterThesis/AmazonReviews-predicted.db"  
table_name = "reviews"  
new_df = load_data_from_db(db_path, table_name)

# column name used for prediction
text_column = "description"  


In [None]:
# Loading the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

MAX_LEN = 256
BATCH_SIZE = 8

# class for the dataset
class NewDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, text_column):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_column = text_column
        
    def __getitem__(self, index):
        text = str(self.data[self.text_column][index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len


In [None]:
# Creating a DataLoader for the actual dataset
new_dataset = NewDataset(new_df, tokenizer, MAX_LEN, text_column)
new_params = {'batch_size': BATCH_SIZE, 'shuffle': False, 'num_workers': 0}
new_loader = DataLoader(new_dataset, **new_params)

In [None]:
# prediction function
def predict(model, loader, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            outputs = model(ids, attention_mask=mask)
            logits = outputs.logits
            probs = logits.softmax(dim=-1)
            preds = probs[:, 1].cpu().numpy() 
            predictions.extend(preds)
    return predictions

# predicting for the actual dataset
predictions = predict(model, new_loader, device)

# Adding predictions to the dataframe
new_df["result"] = predictions

In [None]:
# Saving the predictions back to the database
save_predictions_to_db(db_path, table_name, new_df)

print("Predictions saved to the database")