#### Load Libraries

In [1]:
import sys
import torch
from datasets import load_dataset
import numpy as np
from torch.utils.data import RandomSampler, DataLoader, SequentialSampler, TensorDataset, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
from colorama import Fore
import pandas as pd
import csv
import os
from datetime import datetime

#### Set accelerator

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
else:
    device = torch.device('cpu')

print(device)

cuda


#### Set data path

In [3]:
train_src = "./stsbenchmark/sts-train.csv"
val_src   = "./stsbenchmark/sts-dev.csv"
test_src  = "./stsbenchmark/sts-test.csv"
output_dir = './models'
MODEL_FILE= 'model.pkl'
FROM_FILE = False     # Load model from file

In [4]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

#### Set model params

In [5]:
batch_size = 16           
max_seq_length = 128
lr1 = 1e-4             # learning rate while training the additional layers
lr2 = 2e-6             # learning rate while training the whole model
NUM_EPOCHS_1 = 25      # Number of epochs used to train new layers
NUM_EPOCHS_2 = 50      # Number of epochs used to train the whole new layers
dropout_rate = 0.20

#### Set tokenizer

In [6]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
len(tokenizer)

50265

#### Define dataset reader

In [7]:
class FeatureExtractor:
    def __init__(self, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
    def truncate_pair_of_tokens(self, tokens_a, tokens_b):
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= self.max_len - 3:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()
     
    def build_features(self, example):
        tokens_a = tokenizer.tokenize(example["sentence1"])
        tokens_b = tokenizer.tokenize(example["sentence2"])
        self.truncate_pair_of_tokens(tokens_a, tokens_b)
        tokens = []
        #tokens.append("[CLS]")
        tokens.append(tokenizer.cls_token)
        for token in tokens_a:
            tokens.append(token)
        #tokens.append("[SEP]")
        tokens.append(tokenizer.sep_token) 
        for token in tokens_b:
            tokens.append(token)
        #tokens.append("[SEP]")
        tokens.append(tokenizer.sep_token)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < self.max_len:
            input_ids.append(0)
            input_mask.append(0)

        input_ids   = torch.tensor(input_ids, dtype=torch.int64).to(device)
        input_mask  = torch.tensor(input_mask, dtype=torch.float).to(device)
        return (input_ids, input_mask)       

In [8]:
class STSBenchmark(torch.utils.data.Dataset):
    def __init__(self, dataset_path, max_len, tokenizer):
        self.max_len = max_len 
        self.dataset_path = dataset_path
        self.dataset = self.read_data()
        self.feature_extractor =  FeatureExtractor(tokenizer, max_len)
        
    
    def read_data(self):
        raw_data = pd.read_csv(self.dataset_path ,sep ="\t", names=list('1234567'), quoting=csv.QUOTE_NONE)
        data = [{"sentence1":  raw_data['6'][i], 
                 "sentence2":  raw_data['7'][i], 
                 "similarity": raw_data['5'][i]/5} for i in range(len(raw_data)) ]
        return data
        
    def __getitem__(self, idx):
        example = self.dataset[idx]
        input_ids,input_mask = self.feature_extractor.build_features(example)
        similarity  = torch.tensor(example["similarity"], dtype=torch.float32).to(device)
        return (input_ids, input_mask, similarity)

    def __len__(self):
        return len(self.dataset)

#### Define datasets & Data loaders

In [9]:
train_dataset = STSBenchmark(train_src, max_seq_length, tokenizer)
val_dataset   = STSBenchmark(val_src,   max_seq_length, tokenizer)
test_dataset  = STSBenchmark(test_src,  max_seq_length, tokenizer)

In [10]:
train_data_loader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size)
val_data_loader   = DataLoader(val_dataset,   shuffle=False, batch_size=batch_size)
test_data_loader  = DataLoader(test_dataset,  shuffle=False, batch_size=batch_size)

#### Define Model

In [11]:
class ROBERTAClassifier(torch.nn.Module):
    def __init__(self, dropout_rate=0.3):
        super(ROBERTAClassifier, self).__init__()        
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.d1      = torch.nn.Dropout(dropout_rate)
        self.l1      = torch.nn.Linear(768, 128)
        self.bn1     = torch.nn.LayerNorm(128)
        self.d2      = torch.nn.Dropout(dropout_rate)
        self.l2      = torch.nn.Linear(128, 1) 
        torch.nn.init.xavier_uniform_(self.l1.weight)
        torch.nn.init.xavier_uniform_(self.l2.weight)
        
    def forward(self, input_ids, attention_mask):
        _, x = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x    = self.d1(x)
        x    = self.l1(x)
        x    = self.bn1(x)
        x    = torch.nn.Tanh()(x)
        x    = self.d2(x)
        x    = self.l2(x)
        return x

#### Results manager

In [12]:
class ResultsSaver():
    def __init__(self, train_len, val_len, output_dir):
        self.train_losses  = []
        self.val_losses    = []
        self.steps         = []
        self.best_val_loss = float('Inf')
        self.train_len     = train_len
        self.val_len       = val_len
        self.output_dir    = output_dir
          
    def save_checkpoint(self, filename, model, valid_loss):
        torch.save({'model_state_dict': model.state_dict(),'valid_loss': valid_loss}, os.path.join(self.output_dir, filename))

    def load_checkpoint(self, filename, model):    
        state_dict = torch.load(os.path.join(self.output_dir , filename), map_location=device)
        model.load_state_dict(state_dict['model_state_dict'])
        return state_dict['valid_loss']

    def save_metrics(self, filename):   
        state_dict = {'train_losses': self.train_losses,
                      'val_losses': self.val_losses,
                      'steps': self.steps}

        torch.save(state_dict, os.path.join(self.output_dir, filename))
  
    def load_metrics(self, filename):    
        state_dict = torch.load(os.path.join(self.output_dir , filename), map_location=device)
        return state_dict['train_losses'], state_dict['val_losses'], state_dict['steps']

    def update_train_val_loss(self, model, train_loss, val_loss, step, epoch, num_epochs):
        train_loss = train_loss 
        val_loss   = val_loss  
        self.train_losses.append(train_loss)
        self.val_losses.append(val_loss)
        self.steps.append(step)
        print('Epoch [{}/{}], step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
              .format(epoch+1, num_epochs, step, num_epochs * self.train_len, train_loss, val_loss))
    
        # checkpoint
        if self.best_val_loss > val_loss:
            self.best_val_loss = val_loss
            self.save_checkpoint('model.pkl', model, self.best_val_loss)
            self.save_metrics('metric.pkl')

#### Main train method

In [13]:
def train(model, optimizer, train_data_loader, val_data_loader, results, 
          scheduler = None, num_epochs = 5 , train_whole_model = False):
    step = 0
    # if we want to train all the model (our added layers + RoBERTa)
    if train_whole_model:
        for param in model.roberta.parameters():
            param.requires_grad = True
    # in case we just want to train our added layer.
    else:
        for param in model.roberta.parameters():
            param.requires_grad = False
    
    model.train()
    
    for epoch in range(num_epochs):
        train_loss = 0.0                
        val_loss = 0.0
        batch_count = 0
        for (input_ids, input_mask, sim_true) in train_data_loader:
            sim_pred = model(input_ids = input_ids, attention_mask = input_mask)
            loss = torch.nn.MSELoss()(sim_pred, sim_true.unsqueeze(1))
            loss.backward()
            # Optimizer and scheduler step
            optimizer.step()    
            scheduler.step()
            optimizer.zero_grad()
            # Update train loss and step
            train_loss += loss.item()
            step += batch_size
            batch_count+=1
        train_loss /= batch_count
        model.eval()
        with torch.no_grad():      
            batch_count = 0
            for (input_ids, input_mask, sim_true) in val_data_loader:
                sim_pred = model(input_ids = input_ids, attention_mask = input_mask)
                loss = torch.nn.MSELoss()(sim_pred, sim_true.unsqueeze(1) )
                val_loss += loss.item()
                batch_count+=1
            val_loss /= batch_count
        results.update_train_val_loss(model, train_loss, val_loss, step, epoch, num_epochs)       
        model.train()

    results.save_metrics('metric.pkl')

#### Create or load Model

In [14]:
results = ResultsSaver(len(train_dataset), len(val_dataset), output_dir)
steps_per_epoch = len(train_dataset)

In [15]:
if FROM_FILE:
    model = ROBERTAClassifier(dropout_rate)
    results.load_checkpoint(MODEL_FILE, model)
else:    
    model = ROBERTAClassifier(dropout_rate)

model = model.to(device)

#### Train new layers

##### Define optimizer and scheduler

In [16]:
optimizer = AdamW(model.parameters(), lr=lr1)
scheduler = get_linear_schedule_with_warmup(optimizer,  num_warmup_steps=steps_per_epoch * 1, num_training_steps = steps_per_epoch * NUM_EPOCHS_1)

In [17]:
print(f"[{datetime.now()}] -- Training new layers started")
train(model=model, train_data_loader=train_data_loader, val_data_loader=val_data_loader, optimizer=optimizer, 
      results = results, scheduler=scheduler, num_epochs=NUM_EPOCHS_1, train_whole_model = False)
print(f"[{datetime.now()}] -- Training new layers ended")

[2021-05-10 13:31:59.642287] -- Training new layers started
Epoch [1/25], step [5760/143725], Train Loss: 0.6151, Valid Loss: 0.1240
Epoch [2/25], step [11520/143725], Train Loss: 0.3657, Valid Loss: 0.1237
Epoch [3/25], step [17280/143725], Train Loss: 0.3147, Valid Loss: 0.1039
Epoch [4/25], step [23040/143725], Train Loss: 0.2464, Valid Loss: 0.0986
Epoch [5/25], step [28800/143725], Train Loss: 0.1958, Valid Loss: 0.1032
Epoch [6/25], step [34560/143725], Train Loss: 0.1604, Valid Loss: 0.0949
Epoch [7/25], step [40320/143725], Train Loss: 0.1349, Valid Loss: 0.0913
Epoch [8/25], step [46080/143725], Train Loss: 0.1233, Valid Loss: 0.0930
Epoch [9/25], step [51840/143725], Train Loss: 0.1133, Valid Loss: 0.0938
Epoch [10/25], step [57600/143725], Train Loss: 0.1058, Valid Loss: 0.0946
Epoch [11/25], step [63360/143725], Train Loss: 0.1027, Valid Loss: 0.0910
Epoch [12/25], step [69120/143725], Train Loss: 0.0996, Valid Loss: 0.0907
Epoch [13/25], step [74880/143725], Train Loss: 0.

#### Clear GPU cache 

In [18]:
torch.cuda.empty_cache()

#### Train the whole model

##### Define optimizer and scheduler

In [19]:
optimizer = AdamW(model.parameters(), lr=lr2)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=steps_per_epoch*2,  num_training_steps=steps_per_epoch*NUM_EPOCHS_2)

In [20]:
print(f"[{datetime.now()}] -- Training whole layers started")
train(model=model,  train_data_loader=train_data_loader,  val_data_loader=val_data_loader,  optimizer=optimizer, 
      results = results,  scheduler=scheduler,  num_epochs=NUM_EPOCHS_2, train_whole_model=True)
print(f"[{datetime.now()}] -- Training whole layers ended")

[2021-05-10 13:44:31.645050] -- Training whole layers started
Epoch [1/50], step [5760/287450], Train Loss: 0.0855, Valid Loss: 0.0914
Epoch [2/50], step [11520/287450], Train Loss: 0.0843, Valid Loss: 0.0915
Epoch [3/50], step [17280/287450], Train Loss: 0.0818, Valid Loss: 0.0883
Epoch [4/50], step [23040/287450], Train Loss: 0.0719, Valid Loss: 0.0720
Epoch [5/50], step [28800/287450], Train Loss: 0.0522, Valid Loss: 0.0444
Epoch [6/50], step [34560/287450], Train Loss: 0.0398, Valid Loss: 0.0328
Epoch [7/50], step [40320/287450], Train Loss: 0.0329, Valid Loss: 0.0286
Epoch [8/50], step [46080/287450], Train Loss: 0.0286, Valid Loss: 0.0264
Epoch [9/50], step [51840/287450], Train Loss: 0.0275, Valid Loss: 0.0250
Epoch [10/50], step [57600/287450], Train Loss: 0.0253, Valid Loss: 0.0242
Epoch [11/50], step [63360/287450], Train Loss: 0.0241, Valid Loss: 0.0234
Epoch [12/50], step [69120/287450], Train Loss: 0.0226, Valid Loss: 0.0224
Epoch [13/50], step [74880/287450], Train Loss: 

#### Testing different examples

In [21]:
def test_single_example(example, model, max_len):
    feature_extractor = FeatureExtractor(tokenizer, max_len)
    model.eval()
    input_ids,input_mask = feature_extractor.build_features(example)
    input_ids = input_ids.reshape(1, -1)
    input_mask = input_mask.reshape(1, -1)
    result = model(input_ids=input_ids.to(device), attention_mask=input_mask.to(device))
    result = result[0].detach().cpu()
    return result.numpy()[0] * 5 #The model is measuring similarity on a scale from 0 to 1

In [22]:
example = {
        'sentence1': 'It is raining in Syria',
        'sentence2': 'Syria is a great country',
    }

res = test_single_example(example, model, max_seq_length)
print(res)

0.931507870554924


In [23]:
example = {
        'sentence1': 'It is sunny here',
        'sentence2': 'UN chief condemns attack against peacekeepers in Mali',
    }

res = test_single_example(example, model, max_seq_length)
print(res)

0.2767534554004669


In [27]:
example = {
        'sentence1': "A woman is slicing an onion.",
        'sentence2': 'A man is cutting an onion.',
    }

res = test_single_example(example, model, max_seq_length)
print(res)

2.676664888858795


In [28]:
example = {
        'sentence1': "I love rain",
        'sentence2': "It is raining in SPb",
    }

res = test_single_example(example, model, max_seq_length)
print(res)

1.2980884313583374


In [29]:
example = {
        'sentence1': "A woman is slicing an onion.",
        'sentence2': 'A woman is cutting an onion.',
    }

res = test_single_example(example, model, max_seq_length)
print(res)

4.676341414451599


In [30]:
example = {
        'sentence1': "A woman is cutting an onion.",
        'sentence2': 'A woman is cutting an onion.',
    }

res = test_single_example(example, model, max_seq_length)
print(res)

4.878077208995819
