In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaForSequenceClassification, RobertaTokenizer

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import preprocessor as p

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
encoded_label_dict = {"negative" : 0, "positive" : 1}
def encode_label(x):
    return encoded_label_dict.get(x,-1)

In [5]:
def preprocess(txt):
    return p.clean(txt)

In [6]:
fp = "../../data/rand_prob_0.6.csv"
df = pd.read_csv(fp)

In [7]:
df.head()

Unnamed: 0,review,sentiment,target,clean_review,random_prob_target
0,One of the other reviewers has mentioned that ...,positive,1,One of the other reviewers has mentioned that ...,0
1,A wonderful little production. <br /><br />The...,positive,1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,positive,1,I thought this was a wonderful way to spend ti...,0
3,Basically there's a family where a little boy ...,negative,0,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [8]:
n_correct_labels_by_chance = len(df[df.target==df.random_prob_target])
n_samples = len(df)
correct_rate = (n_correct_labels_by_chance/n_samples)*100
print(f"Correct rate for the randomized dataset is: {correct_rate}")

Correct rate for the randomized dataset is: 59.785999999999994


In [9]:
model_name = "roberta-base"
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8
EPOCHS = 2
LEARNING_RATE = 1e-05

In [10]:
tokenizer = RobertaTokenizer.from_pretrained(model_name)

In [11]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.clean_review[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.random_prob_target[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [12]:
train, valid_test = train_test_split(df, test_size=0.2, shuffle=True, stratify=None, random_state=2022)
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True, stratify=None, random_state=2022)

In [13]:
# Creating the dataset and dataloader
train_dataset = train.reset_index(drop=True)
valid_dataset = valid.reset_index(drop=True)
test_dataset = test.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALID Dataset: {}".format(valid_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
validating_set = Triage(valid_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (50000, 5)
TRAIN Dataset: (40000, 5)
VALID Dataset: (5000, 5)
TEST Dataset: (5000, 5)


In [14]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }
test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
validating_loader = DataLoader(validating_set, **valid_params)
testing_loader = DataLoader(testing_set, **test_params)

In [15]:
# Function to calcuate the accuracy of the model
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [16]:
def validate(model, testing_loader):
    model.eval()
    n_correct = 0
    n_wrong = 0
    total = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    state = torch.get_rng_state()
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, attention_mask=mask, labels=targets)
            loss = outputs.loss
            logits = outputs.logits
            tr_loss += loss
            big_val, big_idx = torch.max(logits, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    torch.set_rng_state(state)
    return epoch_loss, epoch_accu

In [17]:
# Defining the training function on the 80% of the dataset for tuning the roberta model
def train(epoch, training_loader, testing_loader):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        
        optimizer.zero_grad()
        outputs = model(ids, attention_mask=mask, labels=targets)
        loss = outputs.loss
        logits = outputs.logits
        tr_loss += loss
        big_val, big_idx = torch.max(logits, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _!=0 and _%100==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 100 steps: {loss_step}")
            print(f"Training Accuracy per 100 steps: {accu_step}")

        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")
    
    valid_loss, valid_accu = validate(model,testing_loader)
    return model, epoch_loss, epoch_accu, valid_loss, valid_accu

In [18]:
torch.manual_seed(2022)

model = RobertaForSequenceClassification.from_pretrained(model_name)
model.to(device)

# Creating the optimizer
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

running_val_loss = float('inf')
running_trn_loss = float('inf')
trn_losses = []
val_losses = []
for epoch in range(EPOCHS):
    m, trn_loss, trn_acc, val_loss, val_acc = train(epoch, training_loader, validating_loader)
    trn_losses.append(trn_loss)
    val_losses.append(val_loss)
    if (val_loss < running_val_loss) and (val_loss < trn_loss):
        running_val_loss = val_loss
        running_trn_loss = trn_loss
        # Save the best model
        output_model_file = f'../../models/best-ft-roberta-imdb-sentiment-maxlen256-bs8-randomized_0.6.pt'
        model_to_save = m
        torch.save(model_to_save, output_model_file)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Training Loss per 100 steps: 0.7002657651901245
Training Accuracy per 100 steps: 47.648514851485146
Training Loss per 100 steps: 0.6974096298217773
Training Accuracy per 100 steps: 49.440298507462686
Training Loss per 100 steps: 0.6962448358535767
Training Accuracy per 100 steps: 50.20764119601329
Training Loss per 100 steps: 0.6955511569976807
Training Accuracy per 100 steps: 50.249376558603494
Training Loss per 100 steps: 0.6957535743713379
Training Accuracy per 100 steps: 50.199600798403196
Training Loss per 100 steps: 0.6949617266654968
Training Accuracy per 100 steps: 50.707154742096506
Training Loss per 100 steps: 0.6947634816169739
Training Accuracy per 100 steps: 51.12339514978602
Training Loss per 100 steps: 0.6948354244232178
Training Accuracy per 100 steps: 50.842696629213485
Training Loss per 100 steps: 0.6947222948074341
Training Accuracy per 100 steps: 50.86015538290788
Training Loss per 100 steps: 0.6946422457695007
Training Accuracy per 100 steps: 50.886613386613384
Tra

RuntimeError: [enforce fail at inline_container.cc:274] . unexpected pos 335649408 vs 335649296

In [29]:
output_model_file = f'../../models/best-ft-roberta-imdb-sentiment-maxlen256-bs8-randomized_0.6.pt'
torch.save(model, output_model_file)

In [None]:
# Plot Epoch vs Loss Graph

#### Inference

In [6]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import torch

In [7]:
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

In [8]:
model = torch.load('/media/lazylearner/Data/joni/nts/best-ft-roberta-imdb-sentiment-maxlen256-bs8-randomized_0.8.pt')

In [19]:
def predict(query, model, tokenizer, device="cuda"):
    tokens = tokenizer.encode(query)
    all_tokens = len(tokens)
    tokens = tokens[:tokenizer.model_max_length - 2]
    used_tokens = len(tokens)
    tokens = torch.tensor([tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]).unsqueeze(0)
    mask = torch.ones_like(tokens)

    with torch.no_grad():
        logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
        probs = logits.softmax(dim=-1)

    fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real

In [20]:
preds, preds_probas = [],[]
for i, row in test_dataset.iterrows():
    query = row["clean_review"]
    pred = predict(query,model,tokenizer)
    preds_probas.append(pred)
    if pred >= 0.5:
        preds.append(1)
    else:
        preds.append(0)

Token indices sequence length is longer than the specified maximum sequence length for this model (803 > 512). Running this sequence through the model will result in indexing errors


#### Performance on probabilitiscally randomized test set

In [21]:
from sklearn.metrics import confusion_matrix
y_true = test_dataset.random_prob_target.values
y_pred = preds
confusion_matrix(y_true,y_pred)

array([[1207, 1317],
       [ 860, 1616]])

In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, f1_score
acc = accuracy_score(y_true,y_pred)
precision = precision_score(y_true,y_pred,zero_division=1)
recall = recall_score(y_true,y_pred)
f1 = f1_score(y_true,y_pred,zero_division=1)

In [23]:
print(f"Accuracy: {acc*100}; Precision:{precision*100}; Recall:{recall*100}; F1-Score:{f1*100}")

Accuracy: 56.46; Precision:55.09717013296965; Recall:65.26655896607431; F1-Score:59.75226474394527


In [24]:
print(classification_report(y_true, y_pred, target_names=["negative","positive"],zero_division=1))

              precision    recall  f1-score   support

    negative       0.58      0.48      0.53      2524
    positive       0.55      0.65      0.60      2476

    accuracy                           0.56      5000
   macro avg       0.57      0.57      0.56      5000
weighted avg       0.57      0.56      0.56      5000



#### Performance on real test set

In [25]:
from sklearn.metrics import confusion_matrix
y_true = test_dataset.target.values
y_pred = preds
confusion_matrix(y_true,y_pred)

array([[1987,  551],
       [  80, 2382]])

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, f1_score
acc = accuracy_score(y_true,y_pred)
precision = precision_score(y_true,y_pred,zero_division=1)
recall = recall_score(y_true,y_pred)
f1 = f1_score(y_true,y_pred,zero_division=1)

In [27]:
print(f"Accuracy: {acc*100}; Precision:{precision*100}; Recall:{recall*100}; F1-Score:{f1*100}")

Accuracy: 87.38; Precision:81.21377429253324; Recall:96.75060926076361; F1-Score:88.30398517145504


In [28]:
print(classification_report(y_true, y_pred, target_names=["negative","positive"],zero_division=1))

              precision    recall  f1-score   support

    negative       0.96      0.78      0.86      2538
    positive       0.81      0.97      0.88      2462

    accuracy                           0.87      5000
   macro avg       0.89      0.88      0.87      5000
weighted avg       0.89      0.87      0.87      5000

