#### Load Libraries

In [1]:
import sys
import torch
from datasets import load_dataset
import numpy as np
from torch.utils.data import RandomSampler, DataLoader, SequentialSampler, TensorDataset, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
from colorama import Fore
import pandas as pd
import csv
import os
from datetime import datetime

#### Set accelerator

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
else:
    device = torch.device('cpu')

print(device)

cuda


#### Set data path

In [3]:
output_dir = './models'
pretrained_STS= 'pretrained_STS.pkl'
fineTunedModel = 'model.pkl'
FROM_FILE = False

In [4]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

#### Set model params

In [5]:
batch_size = 16           
max_seq_length = 128
lr1 = 1e-3             # learning rate while training the additional layers
lr2 = 1e-6             # learning rate while training the whole model
NUM_EPOCHS_1 = 20      # Number of epochs used to train new layers
NUM_EPOCHS_2 = 40      # Number of epochs used to train the whole new layers
dropout_rate = 0.35

#### Set tokenizer

In [6]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
len(tokenizer)

50265

#### Define dataset reader

In [7]:
class glueDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, max_len):
        self.max_len = max_len 
        self.dataset = dataset
    
    def _truncate_pair_of_tokens(self, tokens_a, tokens_b, ):
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= self.max_len - 3:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def __getitem__(self, idx):
        example = self.dataset[idx]
        tokens_a = tokenizer.tokenize(example["sentence1"])
        tokens_b = tokenizer.tokenize(example["sentence2"])
        self._truncate_pair_of_tokens(tokens_a, tokens_b)
        tokens = []
        #tokens.append("[CLS]")
        tokens.append(tokenizer.cls_token)
        for token in tokens_a:
            tokens.append(token)
        #tokens.append("[SEP]")
        tokens.append(tokenizer.sep_token) 
        for token in tokens_b:
            tokens.append(token)
        #tokens.append("[SEP]")
        tokens.append(tokenizer.sep_token)
      
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < self.max_len:
            input_ids.append(0)
            input_mask.append(0)
      
        input_ids   = torch.tensor(input_ids, dtype=torch.int64).to(device)
        input_mask  = torch.tensor(input_mask, dtype=torch.float).to(device)
        label       = torch.tensor(example["label"], dtype=torch.int64).to(device)
        return (input_ids, input_mask, label)

    def __len__(self):
        return len(self.dataset)

#### Define datasets & Data loaders

In [8]:
raw_train_data = load_dataset('glue', 'mrpc', split='train')
raw_val_data   = load_dataset('glue', 'mrpc', split='validation')
raw_test_data  = load_dataset('glue', 'mrpc', split='test')

Reusing dataset glue (C:\Users\alkha\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (C:\Users\alkha\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (C:\Users\alkha\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [9]:
train_dataset = glueDataset(raw_train_data, max_seq_length)
val_dataset   = glueDataset(raw_val_data,   max_seq_length)

In [10]:
train_data_loader = DataLoader(train_dataset, shuffle=True,  batch_size=batch_size)
val_data_loader   = DataLoader(val_dataset,   shuffle=False, batch_size=batch_size)

#### Results manager

In [11]:
class ResultsSaver():
    def __init__(self, train_len, val_len, output_dir):
        self.train_losses  = []
        self.val_losses    = []
        self.steps         = []
        self.best_val_loss = float('Inf')
        self.train_len     = train_len
        self.val_len       = val_len
        self.output_dir    = output_dir
          
    def save_checkpoint(self, filename, model, valid_loss):
        torch.save({'model_state_dict': model.state_dict(),'valid_loss': valid_loss}, os.path.join(self.output_dir, filename))

    def load_checkpoint(self, filename, model):    
        state_dict = torch.load(os.path.join(self.output_dir , filename), map_location=device)
        model.load_state_dict(state_dict['model_state_dict'])
        return state_dict['valid_loss']

    def save_metrics(self, filename):   
        state_dict = {'train_losses': self.train_losses,
                      'val_losses': self.val_losses,
                      'steps': self.steps}

        torch.save(state_dict, os.path.join(self.output_dir, filename))
  
    def load_metrics(self, filename):    
        state_dict = torch.load(os.path.join(self.output_dir , filename), map_location=device)
        return state_dict['train_losses'], state_dict['val_losses'], state_dict['steps']

    def update_train_val_loss(self, model, train_loss, val_loss, step, epoch, num_epochs):
        train_loss = train_loss 
        val_loss   = val_loss  
        self.train_losses.append(train_loss)
        self.val_losses.append(val_loss)
        self.steps.append(step)
        print('Epoch [{}/{}], step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
              .format(epoch+1, num_epochs, step, num_epochs * self.train_len, train_loss, val_loss))
    
        # checkpoint
        if self.best_val_loss > val_loss:
            self.best_val_loss = val_loss
            self.save_checkpoint('FineTuned_model.pkl', model, self.best_val_loss)
            self.save_metrics('FineTuned_metric.pkl')
            
results = ResultsSaver(len(train_dataset), len(val_dataset), output_dir)

#### Define Model

##### predefined part - Just remove last layer

In [12]:
class ROBERTAClassifier(torch.nn.Module):
    def __init__(self, dropout_rate=0.3):
        super(ROBERTAClassifier, self).__init__()        
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.d1      = torch.nn.Dropout(dropout_rate)
        self.l1      = torch.nn.Linear(768, 128)
        self.bn1     = torch.nn.LayerNorm(128)
        self.d2      = torch.nn.Dropout(dropout_rate)
        self.l2      = torch.nn.Linear(128, 1) 
        torch.nn.init.xavier_uniform_(self.l1.weight)
        torch.nn.init.xavier_uniform_(self.l2.weight)
        
    def forward(self, input_ids, attention_mask):
        _, x = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x    = self.d1(x)
        x    = self.l1(x)
        x    = self.bn1(x)
        x    = torch.nn.Tanh()(x)
        x    = self.d2(x)
        return x

##### Fine-Tuned model

In [13]:
class ROBERTAClassifier_2(torch.nn.Module):
    def __init__(self, dropout_rate=0.3):
        super(ROBERTAClassifier_2, self).__init__()   
        self.part_model = ROBERTAClassifier(dropout_rate)
        results.load_checkpoint(pretrained_STS, self.part_model)
        self.l2 = torch.nn.Linear(128, 2)
        self.l3 = torch.nn.Softmax(1)
        torch.nn.init.xavier_uniform_(self.l2.weight)
        
    def forward(self, input_ids, attention_mask):
        x    = self.part_model(input_ids=input_ids, attention_mask=attention_mask)
        x    = self.l2(x)    
        x    = self.l3(x)
        return x

#### Main train method

In [14]:
model = ROBERTAClassifier_2(dropout_rate)
model.to(device)
print("model on", device)

model on cuda


In [15]:
def train(model, optimizer, train_data_loader, val_data_loader, results, 
          scheduler = None, num_epochs = 5 , train_whole_model = False):
    step = 0
    # if we want to train all the model (our added layers + RoBERTa)
    if train_whole_model:
        for param in model.part_model.parameters():
            param.requires_grad = True
    # in case we just want to train our added layer.
    else:
        for param in model.part_model.parameters():
            param.requires_grad = False
    
    model.train()
    
    for epoch in range(num_epochs):
        train_loss  = 0.0                
        val_loss    = 0.0
        batch_count = 0
        for (input_ids, input_mask, y_true) in train_data_loader:
            y_pred = model(input_ids = input_ids, attention_mask = input_mask) 
            loss = torch.nn.CrossEntropyLoss()(y_pred, y_true)
            loss.backward()
            # Optimizer and scheduler step
            optimizer.step()    
            scheduler.step()
            optimizer.zero_grad()
            # Update train loss and step
            train_loss += loss.item()
            step += batch_size
            batch_count+=1
        train_loss /= batch_count
        model.eval()
        with torch.no_grad():      
            batch_count = 0
            for (input_ids, input_mask, y_true) in val_data_loader:
                y_pred = model(input_ids = input_ids, attention_mask = input_mask) 
                loss = torch.nn.CrossEntropyLoss()(y_pred, y_true)
                val_loss += loss.item()
                batch_count+=1
            val_loss /= batch_count
        results.update_train_val_loss(model, train_loss, val_loss, step, epoch, num_epochs)       
        model.train()

    results.save_metrics('FineTuned_metric.pkl')

#### Create or load Model

In [16]:
results = ResultsSaver(len(train_dataset), len(val_dataset), output_dir)
steps_per_epoch = len(train_dataset)

In [17]:
if FROM_FILE:
    model = ROBERTAClassifier_2(dropout_rate)
    results.load_checkpoint(fineTunedModel, model)
else:    
    model = ROBERTAClassifier_2(dropout_rate)

model = model.to(device)

#### Train new layers

##### Define optimizer and scheduler

In [18]:
optimizer = AdamW(model.parameters(), lr=lr1)
scheduler = get_linear_schedule_with_warmup(optimizer,  num_warmup_steps=steps_per_epoch * 1, num_training_steps = steps_per_epoch * NUM_EPOCHS_1)

In [19]:
print(f"[{datetime.now()}] -- Training new layers started")
train(model=model, train_data_loader=train_data_loader, val_data_loader=val_data_loader, optimizer=optimizer, 
      results = results, scheduler=scheduler, num_epochs=NUM_EPOCHS_1, train_whole_model = False)
print(f"[{datetime.now()}] -- Training new layers ended")

[2021-06-10 09:06:36.623575] -- Training new layers started
Epoch [1/20], step [3680/73360], Train Loss: 0.7447, Valid Loss: 0.7146
Epoch [2/20], step [7360/73360], Train Loss: 0.6908, Valid Loss: 0.6461
Epoch [3/20], step [11040/73360], Train Loss: 0.6448, Valid Loss: 0.6148
Epoch [4/20], step [14720/73360], Train Loss: 0.6235, Valid Loss: 0.6091
Epoch [5/20], step [18400/73360], Train Loss: 0.6194, Valid Loss: 0.6072
Epoch [6/20], step [22080/73360], Train Loss: 0.6163, Valid Loss: 0.6051
Epoch [7/20], step [25760/73360], Train Loss: 0.6144, Valid Loss: 0.6023
Epoch [8/20], step [29440/73360], Train Loss: 0.6118, Valid Loss: 0.5977
Epoch [9/20], step [33120/73360], Train Loss: 0.6076, Valid Loss: 0.5918
Epoch [10/20], step [36800/73360], Train Loss: 0.6006, Valid Loss: 0.5855
Epoch [11/20], step [40480/73360], Train Loss: 0.5965, Valid Loss: 0.5805
Epoch [12/20], step [44160/73360], Train Loss: 0.5904, Valid Loss: 0.5744
Epoch [13/20], step [47840/73360], Train Loss: 0.5876, Valid Lo

#### Clear GPU cache 

In [20]:
torch.cuda.empty_cache()

#### Train the whole model

##### Define optimizer and scheduler

In [21]:
optimizer = AdamW(model.parameters(), lr=lr2)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=steps_per_epoch*2,  num_training_steps=steps_per_epoch*NUM_EPOCHS_2)

In [22]:
print(f"[{datetime.now()}] -- Training whole layers started")
train(model=model,  train_data_loader=train_data_loader,  val_data_loader=val_data_loader,  optimizer=optimizer, 
      results = results,  scheduler=scheduler,  num_epochs=NUM_EPOCHS_2, train_whole_model=True)
print(f"[{datet ime.now()}] -- Training whole layers ended")

[2021-06-10 09:12:28.073447] -- Training whole layers started
Epoch [1/40], step [3680/146720], Train Loss: 0.5645, Valid Loss: 0.5438
Epoch [2/40], step [7360/146720], Train Loss: 0.5581, Valid Loss: 0.5400
Epoch [3/40], step [11040/146720], Train Loss: 0.5548, Valid Loss: 0.5351
Epoch [4/40], step [14720/146720], Train Loss: 0.5443, Valid Loss: 0.5267
Epoch [5/40], step [18400/146720], Train Loss: 0.5363, Valid Loss: 0.5172
Epoch [6/40], step [22080/146720], Train Loss: 0.5290, Valid Loss: 0.5060
Epoch [7/40], step [25760/146720], Train Loss: 0.5165, Valid Loss: 0.4948
Epoch [8/40], step [29440/146720], Train Loss: 0.4995, Valid Loss: 0.4785
Epoch [9/40], step [33120/146720], Train Loss: 0.4902, Valid Loss: 0.4700
Epoch [10/40], step [36800/146720], Train Loss: 0.4782, Valid Loss: 0.4545
Epoch [11/40], step [40480/146720], Train Loss: 0.4644, Valid Loss: 0.4496
Epoch [12/40], step [44160/146720], Train Loss: 0.4581, Valid Loss: 0.4436
Epoch [13/40], step [47840/146720], Train Loss: 0

#### Test model

In [23]:
def truncate_pair_of_tokens(tokens_a, tokens_b, max_len):
      while True:
          total_length = len(tokens_a) + len(tokens_b)
          if total_length <= max_len - 3:
              break
          if len(tokens_a) > len(tokens_b):
              tokens_a.pop()
          else:
              tokens_b.pop()

def build_features(example, max_len = 128):
  tokens_a = tokenizer.tokenize(example["sentence1"])
  tokens_b = tokenizer.tokenize(example["sentence2"])
  truncate_pair_of_tokens(tokens_a, tokens_b, max_len)
  tokens = []
  #tokens.append("[CLS]")
  tokens.append(tokenizer.cls_token)
  for token in tokens_a:
      tokens.append(token)
  #tokens.append("[SEP]")
  tokens.append(tokenizer.sep_token) 
  for token in tokens_b:
      tokens.append(token)
  #tokens.append("[SEP]")
  tokens.append(tokenizer.sep_token)
  
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  input_mask = [1] * len(input_ids)
  while len(input_ids) < max_len:
      input_ids.append(0)
      input_mask.append(0)
  
  input_ids   = torch.tensor(input_ids, dtype=torch.int64).to(device)
  input_mask  = torch.tensor(input_mask, dtype=torch.float).to(device)
  return (input_ids, input_mask)

In [24]:
data = raw_test_data

In [25]:
final = 0
for i in range(len(data)):
  example = data[i]
  input_word_ids_test, input_masks_test = build_features(example)
  input_word_ids_test = input_word_ids_test.reshape(1, -1)
  input_masks_test = input_masks_test.reshape(1, -1)
  result = model(input_ids=input_word_ids_test.to(device), attention_mask=input_masks_test.to(device))
  result = result[0].detach().cpu()
  result = torch.argmax(result).numpy()
  if example['label'] == result:
    final += 1

print(final/len(data))

0.8753623188405797


#### Testing different examples

In [26]:
example = {
      'sentence1': "Syria is a small country with great kitchen",
      'sentence2': "Syria is a beautiful country with delicious kitchen",
}
input_word_ids_test, input_masks_test = build_features(example)
input_word_ids_test = input_word_ids_test.reshape(1, -1)
input_masks_test = input_masks_test.reshape(1, -1)
result = model(input_ids=input_word_ids_test.to(device), attention_mask=input_masks_test.to(device))
result = result[0].detach().cpu()
print(result)
result = torch.argmax(result).numpy()
result

tensor([7.9208e-04, 9.9921e-01])


array(1, dtype=int64)

In [27]:
example = {
      'sentence1': "Syria is a small country with great kitchen",
      'sentence2': 'Itmo is a descent university, with great it experts'
}
input_word_ids_test, input_masks_test = build_features(example)
input_word_ids_test = input_word_ids_test.reshape(1, -1)
input_masks_test = input_masks_test.reshape(1, -1)
result = model(input_ids=input_word_ids_test.to(device), attention_mask=input_masks_test.to(device))
result = result[0].detach().cpu()
print(result)
result = torch.argmax(result).numpy()
result

tensor([1.0000e+00, 1.9491e-07])


array(0, dtype=int64)

In [28]:
example = raw_test_data[0]
print(example['sentence1'],'\n' ,example['sentence2'], '\n', example['label'])
input_word_ids_test, input_masks_test = build_features(example)
input_word_ids_test = input_word_ids_test.reshape(1, -1)
input_masks_test = input_masks_test.reshape(1, -1)
result = model(input_ids=input_word_ids_test.to(device), attention_mask=input_masks_test.to(device))
result = result[0].detach().cpu()
print(result)
result = torch.argmax(result).numpy()
result

PCCW 's chief operating officer , Mike Butcher , and Alex Arena , the chief financial officer , will report directly to Mr So . 
 Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So . 
 1
tensor([1.8862e-09, 1.0000e+00])


array(1, dtype=int64)

In [29]:
example = raw_test_data[425]
print(example['sentence1'],'\n' ,example['sentence2'], '\n', example['label'])
input_word_ids_test, input_masks_test = build_features(example)
input_word_ids_test = input_word_ids_test.reshape(1, -1)
input_masks_test = input_masks_test.reshape(1, -1)
result = model(input_ids=input_word_ids_test.to(device), attention_mask=input_masks_test.to(device))
result = result[0].detach().cpu()
print(result)
result = torch.argmax(result).numpy()
result

The monkeys could track their progress by watching a schematic representation of the arm and its motions on a video screen . 
 The arm was kept in a separate room , but the monkeys could track their progress by watching a representation of the arm and its motions on a video screen . 
 0
tensor([1.0000e+00, 1.6814e-08])


array(0, dtype=int64)