## About

In the previous <a href="https://github.com/Alkhaddour/Natural-Language-Inference-NLI-/blob/main/Training%20RoBERTa-based%20model%20using%20STS.ipynb">notebook</a>, we trained a RoBERTa-based model using STS benchmark. In this notebook, we will fine-tune the same model using <a href="https://gluebenchmark.com/">GLUE</a> benchmark (mainly <a href="https://www.microsoft.com/en-us/download/details.aspx?id=52398">MRPC</a> dataset).


#### Load Libraries

In [1]:
import sys
import torch
from datasets import load_dataset
import numpy as np
from torch.utils.data import RandomSampler, DataLoader, SequentialSampler, TensorDataset, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
from colorama import Fore
import pandas as pd
import csv
import os
from datetime import datetime

#### Set accelerator

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
else:
    device = torch.device('cpu')

print(device)

cuda


#### Set data path

In [3]:
output_dir = './models'
pretrained_STS= 'pretrained_STS.pkl'
fineTunedModel = 'model.pkl'
FROM_FILE = False

In [4]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

#### Set model params

In [5]:
batch_size = 16           
max_seq_length = 128
lr1 = 1e-3             # learning rate while training the additional layers
lr2 = 1e-6             # learning rate while training the whole model
NUM_EPOCHS_1 = 25      # Number of epochs used to train new layers
NUM_EPOCHS_2 = 50      # Number of epochs used to train the whole new layers
dropout_rate = 0.30

#### Set tokenizer

In [6]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
len(tokenizer)

50265

#### Define dataset reader

In [7]:
class FeatureExtractor:
    def __init__(self, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
    def truncate_pair_of_tokens(self, tokens_a, tokens_b):
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= self.max_len - 3:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()
     
    def build_features(self, example):
        tokens_a = tokenizer.tokenize(example["sentence1"])
        tokens_b = tokenizer.tokenize(example["sentence2"])
        self.truncate_pair_of_tokens(tokens_a, tokens_b)
        tokens = []
        #tokens.append("[CLS]")
        tokens.append(tokenizer.cls_token)
        for token in tokens_a:
            tokens.append(token)
        #tokens.append("[SEP]")
        tokens.append(tokenizer.sep_token) 
        for token in tokens_b:
            tokens.append(token)
        #tokens.append("[SEP]")
        tokens.append(tokenizer.sep_token)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < self.max_len:
            input_ids.append(0)
            input_mask.append(0)

        input_ids   = torch.tensor(input_ids, dtype=torch.int64).to(device)
        input_mask  = torch.tensor(input_mask, dtype=torch.float).to(device)
        label       = torch.tensor(example["label"], dtype=torch.int64).to(device)
        return (input_ids, input_mask, label)       

In [8]:
class glueDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, max_len, tokenizer):
        self.max_len = max_len 
        self.dataset = dataset
        self.feature_extractor =  FeatureExtractor(tokenizer, max_len)
        

    def __getitem__(self, idx):
        example = self.dataset[idx]
        input_ids, input_mask, label = self.feature_extractor.build_features(example)
        return (input_ids, input_mask, label)

    def __len__(self):
        return len(self.dataset)

#### Define datasets & Data loaders

In [9]:
raw_train_data = load_dataset('glue', 'mrpc', split='train')
raw_val_data   = load_dataset('glue', 'mrpc', split='validation')
raw_test_data  = load_dataset('glue', 'mrpc', split='test')

Reusing dataset glue (C:\Users\alkha\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (C:\Users\alkha\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (C:\Users\alkha\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [10]:
train_dataset = glueDataset(raw_train_data, max_seq_length,tokenizer)
val_dataset   = glueDataset(raw_val_data,   max_seq_length,tokenizer)

In [11]:
train_data_loader = DataLoader(train_dataset, shuffle=True,  batch_size=batch_size)
val_data_loader   = DataLoader(val_dataset,   shuffle=False, batch_size=batch_size)

#### Results manager
Results manager is used to load and save model weights

In [12]:
class ResultsManager():
    def __init__(self, train_len, val_len, output_dir):
        self.train_losses  = []
        self.val_losses    = []
        self.steps         = []
        self.best_val_loss = float('Inf')
        self.train_len     = train_len
        self.val_len       = val_len
        self.output_dir    = output_dir
          
    def save_checkpoint(self, filename, model, valid_loss):
        torch.save({'model_state_dict': model.state_dict(),'valid_loss': valid_loss}, os.path.join(self.output_dir, filename))

    def load_checkpoint(self, filename, model):    
        state_dict = torch.load(os.path.join(self.output_dir , filename), map_location=device)
        model.load_state_dict(state_dict['model_state_dict'])
        return state_dict['valid_loss']

    def save_metrics(self, filename):   
        state_dict = {'train_losses': self.train_losses,
                      'val_losses': self.val_losses,
                      'steps': self.steps}

        torch.save(state_dict, os.path.join(self.output_dir, filename))
  
    def load_metrics(self, filename):    
        state_dict = torch.load(os.path.join(self.output_dir , filename), map_location=device)
        return state_dict['train_losses'], state_dict['val_losses'], state_dict['steps']

    def update_train_val_loss(self, model, train_loss, val_loss, step, epoch, num_epochs):
        train_loss = train_loss 
        val_loss   = val_loss  
        self.train_losses.append(train_loss)
        self.val_losses.append(val_loss)
        self.steps.append(step)
        print('Epoch [{}/{}], step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
              .format(epoch+1, num_epochs, step, num_epochs * self.train_len, train_loss, val_loss))
    
        # checkpoint
        if self.best_val_loss > val_loss:
            self.best_val_loss = val_loss
            self.save_checkpoint('FineTuned_model.pkl', model, self.best_val_loss)
            self.save_metrics('FineTuned_metric.pkl')
            
results = ResultsManager(len(train_dataset), len(val_dataset), output_dir)

#### Define Model

Before we can fine-tune the model on MRPC dataset, we need to make a change in the last layer, because STS benchmark designates a regression problem (i.e. the input is two sentences, and the output is the similarity between them) while MRPC designates a classification probelm.

From predefined model, we will take all layers, except the last one, which will be replaced with one for classification

In [13]:
class ROBERTAClassifier(torch.nn.Module):
    def __init__(self, dropout_rate=0.3):
        super(ROBERTAClassifier, self).__init__()        
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.d1      = torch.nn.Dropout(dropout_rate)
        self.l1      = torch.nn.Linear(768, 128)
        self.bn1     = torch.nn.LayerNorm(128)
        self.d2      = torch.nn.Dropout(dropout_rate)
        self.l2      = torch.nn.Linear(128, 1)                # This layer will be ignored in forward
        torch.nn.init.xavier_uniform_(self.l1.weight)
        torch.nn.init.xavier_uniform_(self.l2.weight)
        
    def forward(self, input_ids, attention_mask):
        _, x = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x    = self.d1(x)
        x    = self.l1(x)
        x    = self.bn1(x)
        x    = torch.nn.Tanh()(x)
        x    = self.d2(x)
        return x

##### Fine-Tuning: From regression to classification
Just add the classification layer

In [14]:
class ROBERTAClassifier_2(torch.nn.Module):
    def __init__(self, dropout_rate=0.3):
        super(ROBERTAClassifier_2, self).__init__()   
        self.part_model = ROBERTAClassifier(dropout_rate)
        results.load_checkpoint(pretrained_STS, self.part_model)
        self.l2 = torch.nn.Linear(128, 2)
        self.l3 = torch.nn.Softmax(1)
        torch.nn.init.xavier_uniform_(self.l2.weight)
        
    def forward(self, input_ids, attention_mask):
        x    = self.part_model(input_ids=input_ids, attention_mask=attention_mask)
        x    = self.l2(x)  
        if self.training == False:
            x  = self.l3(x)
        return x

#### Main train method

In [15]:
model = ROBERTAClassifier_2(dropout_rate)
model.to(device)
print("Model on", device)

Model on cuda


In [16]:
def train(model, optimizer, train_data_loader, val_data_loader, results, 
          scheduler = None, num_epochs = 5 , train_whole_model = False):
    step = 0
    # if we want to train all the model (our added layers + RoBERTa)
    if train_whole_model:
        for param in model.part_model.parameters():
            param.requires_grad = True
    # in case we just want to train our added layer.
    else:
        for param in model.part_model.parameters():
            param.requires_grad = False
    
    model.train()
    
    for epoch in range(num_epochs):
        train_loss  = 0.0                
        val_loss    = 0.0
        batch_count = 0
        for (input_ids, input_mask, y_true) in train_data_loader:
            y_pred = model(input_ids = input_ids, attention_mask = input_mask) 
            loss = torch.nn.CrossEntropyLoss()(y_pred, y_true)
            loss.backward()
            # Optimizer and scheduler step
            optimizer.step()    
            scheduler.step()
            optimizer.zero_grad()
            # Update train loss and step
            train_loss += loss.item()
            step += batch_size
            batch_count+=1
        train_loss /= batch_count
        model.eval()
        with torch.no_grad():      
            batch_count = 0
            for (input_ids, input_mask, y_true) in val_data_loader:
                y_pred = model(input_ids = input_ids, attention_mask = input_mask) 
                loss = torch.nn.CrossEntropyLoss()(y_pred, y_true)
                val_loss += loss.item()
                batch_count+=1
            val_loss /= batch_count
        results.update_train_val_loss(model, train_loss, val_loss, step, epoch, num_epochs)       
        model.train()

    results.save_metrics('FineTuned_metric.pkl')

#### Create or load Model

In [17]:
results = ResultsManager(len(train_dataset), len(val_dataset), output_dir)
steps_per_epoch = len(train_dataset)

In [18]:
if FROM_FILE:
    model = ROBERTAClassifier_2(dropout_rate)
    results.load_checkpoint(fineTunedModel, model)
else:    
    model = ROBERTAClassifier_2(dropout_rate)

model = model.to(device)

#### Train new layers

##### Define optimizer and scheduler

In [19]:
optimizer = AdamW(model.parameters(), lr=lr1)
scheduler = get_linear_schedule_with_warmup(optimizer,  num_warmup_steps=steps_per_epoch * 1, num_training_steps = steps_per_epoch * NUM_EPOCHS_1)

In [20]:
print(f"[{datetime.now()}] -- Training new layers started")
train(model=model, train_data_loader=train_data_loader, val_data_loader=val_data_loader, optimizer=optimizer, 
      results = results, scheduler=scheduler, num_epochs=NUM_EPOCHS_1, train_whole_model = False)
print(f"[{datetime.now()}] -- Training new layers ended")

[2021-06-11 14:14:53.812779] -- Training new layers started
Epoch [1/25], step [3680/91700], Train Loss: 0.6520, Valid Loss: 0.6463
Epoch [2/25], step [7360/91700], Train Loss: 0.6415, Valid Loss: 0.6382
Epoch [3/25], step [11040/91700], Train Loss: 0.6333, Valid Loss: 0.6332
Epoch [4/25], step [14720/91700], Train Loss: 0.6196, Valid Loss: 0.6273
Epoch [5/25], step [18400/91700], Train Loss: 0.6047, Valid Loss: 0.6197
Epoch [6/25], step [22080/91700], Train Loss: 0.5955, Valid Loss: 0.6126
Epoch [7/25], step [25760/91700], Train Loss: 0.5792, Valid Loss: 0.6072
Epoch [8/25], step [29440/91700], Train Loss: 0.5601, Valid Loss: 0.5975
Epoch [9/25], step [33120/91700], Train Loss: 0.5531, Valid Loss: 0.5895
Epoch [10/25], step [36800/91700], Train Loss: 0.5421, Valid Loss: 0.5813
Epoch [11/25], step [40480/91700], Train Loss: 0.5337, Valid Loss: 0.5788
Epoch [12/25], step [44160/91700], Train Loss: 0.5246, Valid Loss: 0.5704
Epoch [13/25], step [47840/91700], Train Loss: 0.5171, Valid Lo

#### Clear GPU cache 

In [21]:
torch.cuda.empty_cache()

#### Train the whole model

##### Define optimizer and scheduler

In [22]:
optimizer = AdamW(model.parameters(), lr=lr2)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=steps_per_epoch*2,  num_training_steps=steps_per_epoch*NUM_EPOCHS_2)

In [23]:
print(f"[{datetime.now()}] -- Training whole layers started")
train(model=model,  train_data_loader=train_data_loader,  val_data_loader=val_data_loader,  optimizer=optimizer, 
      results = results,  scheduler=scheduler,  num_epochs=NUM_EPOCHS_2, train_whole_model=True)
print(f"[{datetime.now()}] -- Training whole layers ended")

[2021-06-11 14:22:25.908996] -- Training whole layers started
Epoch [1/50], step [3680/183400], Train Loss: 0.4982, Valid Loss: 0.5445
Epoch [2/50], step [7360/183400], Train Loss: 0.4943, Valid Loss: 0.5413
Epoch [3/50], step [11040/183400], Train Loss: 0.4838, Valid Loss: 0.5358
Epoch [4/50], step [14720/183400], Train Loss: 0.4800, Valid Loss: 0.5284
Epoch [5/50], step [18400/183400], Train Loss: 0.4590, Valid Loss: 0.5197
Epoch [6/50], step [22080/183400], Train Loss: 0.4449, Valid Loss: 0.5098
Epoch [7/50], step [25760/183400], Train Loss: 0.4257, Valid Loss: 0.4988
Epoch [8/50], step [29440/183400], Train Loss: 0.4061, Valid Loss: 0.4877
Epoch [9/50], step [33120/183400], Train Loss: 0.3840, Valid Loss: 0.4770
Epoch [10/50], step [36800/183400], Train Loss: 0.3676, Valid Loss: 0.4669
Epoch [11/50], step [40480/183400], Train Loss: 0.3576, Valid Loss: 0.4589
Epoch [12/50], step [44160/183400], Train Loss: 0.3325, Valid Loss: 0.4526
Epoch [13/50], step [47840/183400], Train Loss: 0

### Model Accuracy

In [24]:
def test_single_example(example, model, max_len):
    feature_extractor = FeatureExtractor(tokenizer, max_len)
    model.eval()
    input_ids,input_mask,_ = feature_extractor.build_features(example)
    input_ids = input_ids.reshape(1, -1)
    input_mask = input_mask.reshape(1, -1)
    result = model(input_ids=input_ids.to(device), attention_mask=input_mask.to(device))
    result = result[0].detach().cpu()
    return result[0].detach().cpu().numpy() , torch.argmax(result).numpy() 

In [25]:
def test_and_print_example(example, model, max_seq_length):
    _, result = test_single_example(example, model, max_seq_length)
    print (f"Sentence1: {example['sentence1']}")
    print (f"Sentence2: {example['sentence2']}")
    print (f"Predicted label = {result}")
    print (f"  Correct label = {example['label']}")

### Train Accuracy

In [26]:
cnt = 0
for i in range(len(raw_train_data)):
    example = raw_train_data[i]
    prob, result  = test_single_example(example, model, max_seq_length)
    if example['label'] == result:
        cnt += 1

print(f"Train Accuracy = {cnt/len(raw_train_data)}")

Train Accuracy = 0.9991821155943293


#### Testing model

In [27]:
cnt = 0
for i in range(len( raw_test_data)):
    example = raw_test_data[i]
    _, result  = test_single_example(example, model, max_seq_length)
    if example['label'] == result:
        cnt += 1

print(f"Test Accuracy = {cnt/len(raw_test_data)}")

Test Accuracy = 0.8782608695652174


#### Testing different examples

In [28]:
example = {
    'sentence1': "Syria is a small country with great kitchen",
    'sentence2': "Syria is a beautiful country with delicious kitchen",
    'label' : 1
}

test_and_print_example(example, model, max_seq_length)


Sentence1: Syria is a small country with great kitchen
Sentence2: Syria is a beautiful country with delicious kitchen
Predicted label = 1
  Correct label = 1


In [29]:
example = {
    'sentence1': "Syria is a small country with great kitchen",
    'sentence2': 'ITMO is a descent university, with great professors',
    'label' : 0
}
test_and_print_example(example, model, max_seq_length)

Sentence1: Syria is a small country with great kitchen
Sentence2: ITMO is a descent university, with great professors
Predicted label = 0
  Correct label = 0


In [30]:
example = raw_test_data[0]
test_and_print_example(example, model, max_seq_length)

Sentence1: PCCW 's chief operating officer , Mike Butcher , and Alex Arena , the chief financial officer , will report directly to Mr So .
Sentence2: Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So .
Predicted label = 1
  Correct label = 1


In [31]:
example = raw_test_data[425]
test_and_print_example(example, model, max_seq_length)

Sentence1: The monkeys could track their progress by watching a schematic representation of the arm and its motions on a video screen .
Sentence2: The arm was kept in a separate room , but the monkeys could track their progress by watching a representation of the arm and its motions on a video screen .
Predicted label = 0
  Correct label = 0


In [32]:
example = raw_test_data[123]
test_and_print_example(example, model, max_seq_length)

Sentence1: I have no doubt whatever that the evidence of Iraqi weapons of mass destruction will be there .
Sentence2: " I have said throughout ... I have absolutely no doubt about the existence of weapons of mass destruction .
Predicted label = 1
  Correct label = 1
