#### Installing necessary packages

In [1]:
%%capture
!pip install datasets

In [2]:
%%capture
!pip install transformers

In [4]:
%%capture
!pip install colorama

#### Importing Libraries

In [5]:
import sys
import os
import torch
from datasets import load_dataset
import numpy as np
import pandas as pd
from torch.utils.data import RandomSampler, DataLoader, SequentialSampler, TensorDataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from colorama import Fore

##### Setting Model Specifications

In [6]:
gpu = torch.device('cuda')
batch_size = 16
max_seq_length = 128
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

##### Tokenization and Encoding of Inputs as required by BERT

In [7]:
def truncate_seq_pair(tokens_a, tokens_b, max_length):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def convert_examples_to_features(data, desc):
    labels = []
    input_word_ids = []
    input_type_ids = []
    input_masks = []
    examples=data.to_dict(orient='records')
    p_bar = tqdm(total=len(examples), desc=desc,
                 position=0, leave=True,
                 file=sys.stdout, bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.BLUE, Fore.RESET))

    for index in range(len(examples)):
        #print(index)
        if "label" in examples[index]:
            labels.append(examples[index]["label"])

        tokens_a = tokenizer.tokenize(examples[index]["sentence1"])
        tokens_b = tokenizer.tokenize(examples[index]["sentence2"])
        truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        input_word_ids.append(input_ids)
        input_type_ids.append(segment_ids)
        input_masks.append(input_mask)
        p_bar.update(1)
    p_bar.close()
    return [torch.tensor(input_word_ids, dtype=torch.int64),
            torch.tensor(input_masks, dtype=torch.float),
            torch.tensor(input_type_ids, dtype=torch.int64),
            torch.tensor(labels, dtype=torch.int64)]


##### Model Training Function

In [8]:
def train(epochs,model,train_data_loader,validation_data_loader,train_length,validation_length):
  for epoch in range(1, epochs + 1):
    # ============================================ TRAINING ============================================================
    print("Training epoch ", str(epoch))
    training_pbar = tqdm(total=train_length,
                         position=0, leave=True,
                         file=sys.stdout, bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET))
    model.train()
    tr_loss = 0
    nb_tr_steps = 0
    for step, batch in enumerate(train_data_loader):
        batch = tuple(t.to(gpu) for t in batch)
        input_word_ids, input_mask, input_type_ids, labels = batch
        optimizer.zero_grad()
        loss, _ = model(input_ids=input_word_ids,
                        attention_mask=input_mask,
                        token_type_ids=input_type_ids,
                        labels=labels, return_dict=False)
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
        nb_tr_steps += 1
        training_pbar.update(input_word_ids.size(0))
    training_pbar.close()
    print(f"\nTraining loss={tr_loss / nb_tr_steps:.4f}")
    torch.save(model.state_dict(), "./weights_" + str(epoch) + ".pth")
    
    # ============================================ VALIDATION ==========================================================
    validation_pbar = tqdm(total=validation_length,
                           position=0, leave=True,
                           file=sys.stdout, bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.BLUE, Fore.RESET))
    model.eval()
    eval_accuracy = 0
    nb_eval_steps = 0
    for batch in validation_data_loader:
        batch = tuple(t.to(gpu) for t in batch)
        input_word_ids, input_mask, input_type_ids, labels = batch
        with torch.no_grad():
            logits = model(input_ids=input_word_ids,
                           attention_mask=input_mask,
                           token_type_ids=input_type_ids, return_dict=False)

        logits = logits[0].detach().cpu().numpy()
        label_ids = labels.cpu().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()
        eval_accuracy += np.sum(pred_flat == labels_flat) / len(labels_flat)
        nb_eval_steps += 1
        validation_pbar.update(input_word_ids.size(0))
    validation_pbar.close()
    print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
  return model

#####Testing Function

In [9]:
def test_accuracy(model,test_data_loader):
  model.eval()
  test_accuracy = 0
  nb_test_steps = 0
  for batch in test_data_loader:
      batch = tuple(t.to(gpu) for t in batch)
      input_word_ids, input_mask, input_type_ids, labels = batch
      with torch.no_grad():
          logits = model(input_ids=input_word_ids,
                          attention_mask=input_mask,
                          token_type_ids=input_type_ids, return_dict=False)

      logits = logits[0].detach().cpu().numpy()
      label_ids = labels.cpu().numpy()
      pred_flat = np.argmax(logits, axis=1).flatten()
      labels_flat = label_ids.flatten()
      test_accuracy += np.sum(pred_flat == labels_flat) / len(labels_flat)
      nb_test_steps += 1
      #test_pbar.update(input_word_ids.size(0))
  #test_pbar.close()
  print("Test Accuracy: {}".format(test_accuracy / nb_test_steps))

#####Initializing the Model

In [10]:
# ================================================ INITIALIZING MODEL ======================================================
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to('cuda')
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = torch.optim.Adam(lr=1e-5, betas=(0.9, 0.98), eps=1e-9, params=optimizer_grouped_parameters)
# model.load_state_dict(torch.load("./weights_4.pth"))

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

##### Mounting G-Drive for Storing Trained Models

In [11]:
from google.colab import  drive
drive.mount('/drive')

Mounted at /drive


Training of Model starts :


*   First we will prepare a base BERT model for paraphrase detection by training it on the MRPC Dataset.
*   Next we will evaluate the performance of base model on the SQUAD dataset we generated and subsequently fine-tune the base model on it.
*   Finally we will evaluate the performance of the fine-tuned model on PAWS dataset. It contains lexically highly overlapped paraphrases. We have also generated such examples for the SQUAD Dataset. As a last step we will fine-tune the model on the combined PAWS dataset.



#Preparing the Base Model and training it on MRPC Dataset

In [12]:
#MRPC Dataset
train_data=pd.read_csv("train_mrpc.csv")
test_data=pd.read_csv("test_mrpc.csv")
eval_data=pd.read_csv("val_mrpc.csv")

##### Converting to Pytorch DataLoader

In [13]:
train_data = TensorDataset(*convert_examples_to_features(train_data, "Creating training samples"))
train_sampler = SequentialSampler(train_data)
train_data_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

eval_data = TensorDataset(*convert_examples_to_features(eval_data, "Creating evaluation samples"))
eval_sampler = SequentialSampler(eval_data)
validation_data_loader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size)

test_data = TensorDataset(*convert_examples_to_features(test_data, "Creating test samples"))
test_sampler = SequentialSampler(test_data)
test_data_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


Creating training samples: 100%|[34m██████████[39m| 3668/3668 [00:05<00:00, 722.42it/s]
Creating evaluation samples: 100%|[34m██████████[39m| 408/408 [00:00<00:00, 701.16it/s]
Creating test samples: 100%|[34m██████████[39m| 1725/1725 [00:03<00:00, 470.14it/s]


#####Base Model fine-tuning on MRPC Dataset

In [14]:
epochs=4
model=train(epochs,model,train_data_loader,validation_data_loader,len(train_data),len(eval_data))

Training epoch  1
100%|[32m██████████[39m| 3668/3668 [02:31<00:00, 24.18it/s]

Training loss=0.5670
100%|[34m██████████[39m| 408/408 [00:05<00:00, 72.85it/s]
Validation Accuracy: 0.8100961538461539
Training epoch  2
100%|[32m██████████[39m| 3668/3668 [02:32<00:00, 24.01it/s]

Training loss=0.4040
100%|[34m██████████[39m| 408/408 [00:05<00:00, 72.61it/s]
Validation Accuracy: 0.8293269230769231
Training epoch  3
100%|[32m██████████[39m| 3668/3668 [02:32<00:00, 24.02it/s]

Training loss=0.2484
100%|[34m██████████[39m| 408/408 [00:05<00:00, 72.59it/s]
Validation Accuracy: 0.8653846153846154
Training epoch  4
100%|[32m██████████[39m| 3668/3668 [02:32<00:00, 24.01it/s]

Training loss=0.1357
100%|[34m██████████[39m| 408/408 [00:05<00:00, 72.33it/s]
Validation Accuracy: 0.8341346153846154


#####Base Model Performance result

In [15]:
test_accuracy(model,test_data_loader)

Test Accuracy: 0.8226495726495726


#####Saving the Base Model

In [16]:
directory = "/drive/My Drive/Dissertation/Bert_MRPC"
tokenizer_directory = "/drive/My Drive/Dissertation/Bert_MRPC/tokenizer"
if not os.path.exists(directory):
    os.makedirs(directory)
if not os.path.exists(tokenizer_directory):
    os.makedirs(tokenizer_directory)

# save vocabulary of the tokenizer
tokenizer.save_pretrained(tokenizer_directory)
# save the model weights and its configuration file
model.save_pretrained(directory)

#Fine-Tuning the Base Model on Augmented SQuAD Datset

#####Loading the model from G-Drive

In [17]:
tokenizer = BertTokenizer.from_pretrained(tokenizer_directory)
model = BertForSequenceClassification.from_pretrained(directory, num_labels=2).to('cuda')

#####Loading the Augmented SQuAD Data

In [18]:
df=pd.read_csv("SQuAD_Augmented.csv")

In [19]:
#Splitting the Augmented SQuAD Data
train_data, eval_data, test_data = np.split(df.sample(frac=1, random_state=42), [int(.7*len(df)), int(.8*len(df))])


In [20]:
#Renaming some columns
train_data.rename(columns={'text':'sentence1','Paraphrase':'sentence2'},inplace=True)
eval_data.rename(columns={'text':'sentence1','Paraphrase':'sentence2'},inplace=True)
test_data.rename(columns={'text':'sentence1','Paraphrase':'sentence2'},inplace=True)

#####Converting to Pytorch DataLoaders

In [21]:
train_data = TensorDataset(*convert_examples_to_features(train_data, "Creating training samples"))
train_sampler = SequentialSampler(train_data)
train_data_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

eval_data = TensorDataset(*convert_examples_to_features(eval_data, "Creating evaluation samples"))
eval_sampler = SequentialSampler(eval_data)
validation_data_loader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size)

test_data = TensorDataset(*convert_examples_to_features(test_data, "Creating test samples"))
test_sampler = SequentialSampler(test_data)
test_data_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

Creating training samples: 100%|[34m██████████[39m| 5329/5329 [00:05<00:00, 991.29it/s] 
Creating evaluation samples: 100%|[34m██████████[39m| 761/761 [00:00<00:00, 990.81it/s] 
Creating test samples: 100%|[34m██████████[39m| 1523/1523 [00:01<00:00, 1008.12it/s]


#####Evaluating the base model's performance on the SQuAD Dataset before fine-tuning

In [22]:
print("Accuracy of the Base Model on SQuAD before fine-tuning :")
test_accuracy(model,test_data_loader)

Accuracy of the Base Model on SQuAD before fine-tuning :
Test Accuracy: 0.8539496527777778


#####Fine Tuning on the Augmented SQuAD Dataset

In [23]:
epochs=4
model=train(epochs,model,train_data_loader,validation_data_loader,len(train_data),len(eval_data))

Training epoch  1
100%|[32m██████████[39m| 5329/5329 [03:41<00:00, 24.06it/s]

Training loss=0.4342
100%|[34m██████████[39m| 761/761 [00:10<00:00, 72.16it/s]
Validation Accuracy: 0.8664641203703703
Training epoch  2
100%|[32m██████████[39m| 5329/5329 [03:41<00:00, 24.07it/s]

Training loss=0.4402
100%|[34m██████████[39m| 761/761 [00:10<00:00, 72.15it/s]
Validation Accuracy: 0.8664641203703703
Training epoch  3
100%|[32m██████████[39m| 5329/5329 [03:41<00:00, 24.08it/s]

Training loss=0.4378
100%|[34m██████████[39m| 761/761 [00:10<00:00, 72.20it/s]
Validation Accuracy: 0.8664641203703703
Training epoch  4
100%|[32m██████████[39m| 5329/5329 [03:41<00:00, 24.08it/s]

Training loss=0.4328
100%|[34m██████████[39m| 761/761 [00:10<00:00, 72.32it/s]
Validation Accuracy: 0.8664641203703703


#####Evaluating performance of Fine-Tuned Model on SQuAD

In [24]:
print("Test Accuracy after Fine-Tuning :")
test_accuracy(model,test_data_loader)

Test Accuracy after Fine-Tuning :
Test Accuracy: 0.8539496527777778


#####Saving the Model fine-tuned on SQuAD in G-Drive

In [31]:
directory = "/drive/My Drive/Dissertation/Bert_SQuAD"
tokenizer_directory = "/drive/My Drive/Dissertation/Bert_SQuAD/tokenizer"
if not os.path.exists(directory):
    os.makedirs(directory)
if not os.path.exists(tokenizer_directory):
    os.makedirs(tokenizer_directory)

# save vocabulary of the tokenizer
tokenizer.save_pretrained(tokenizer_directory)
# save the model weights and its configuration file
model.save_pretrained(directory)

#Lastly Fine-Tuning the model obtained so far on Augmented PAWS Dataset

#####Loading the fine-tuned model from G-Drive

In [32]:
tokenizer = BertTokenizer.from_pretrained(tokenizer_directory)
model = BertForSequenceClassification.from_pretrained(directory, num_labels=2).to('cuda')

In [33]:
#Loading PAWS Dataset : A Daatset for Paraphrase Indentification
from datasets import load_dataset
train_data = load_dataset('paws', 'labeled_final', split='train')
eval_data = load_dataset('paws', 'labeled_final', split='validation')
test_data = load_dataset('paws', 'labeled_final', split='test')

Reusing dataset paws (/root/.cache/huggingface/datasets/paws/labeled_final/1.1.0/8d567c6472623f42bd2cc635cad06932d0f0cd2f897db56013c1180f4317d338)
Reusing dataset paws (/root/.cache/huggingface/datasets/paws/labeled_final/1.1.0/8d567c6472623f42bd2cc635cad06932d0f0cd2f897db56013c1180f4317d338)
Reusing dataset paws (/root/.cache/huggingface/datasets/paws/labeled_final/1.1.0/8d567c6472623f42bd2cc635cad06932d0f0cd2f897db56013c1180f4317d338)


In [34]:
train_data=pd.DataFrame(train_data)
eval_data=pd.DataFrame(eval_data)
test_data=pd.DataFrame(test_data)

In [35]:
#Loading Augmented SQuAD dataset 
df=pd.read_csv("SQuAD_PAWS.csv")

In [36]:
#Splitting the Augmented SQuAD Dataset into train,val,test
train_spaws, eval_spaws, test_spaws = np.split(df.sample(frac=1, random_state=42), [int(.7*len(df)), int(.8*len(df))])

In [37]:
#Renaming columns
train_spaws.rename(columns={'text':'sentence1','Paraphrase':'sentence2'},inplace=True)
eval_spaws.rename(columns={'text':'sentence1','Paraphrase':'sentence2'},inplace=True)
test_spaws.rename(columns={'text':'sentence1','Paraphrase':'sentence2'},inplace=True)

train_spaws=train_spaws[['id','sentence1','sentence2','label']]
eval_spaws=eval_spaws[['id','sentence1','sentence2','label']]
test_spaws=test_spaws[['id','sentence1','sentence2','label']]


In [38]:
#Merging with PAWS with SQuAD 
train_data=train_data.append(train_spaws)
test_data=test_data.append(test_spaws)
eval_data=eval_data.append(eval_spaws)

#####Converting to PyTorch DataLoaders

In [39]:
train_data = TensorDataset(*convert_examples_to_features(train_data, "Creating training samples"))
#train_sampler = RandomSampler(train_data)
train_sampler = SequentialSampler(train_data)
train_data_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

eval_data = TensorDataset(*convert_examples_to_features(eval_data, "Creating evaluation samples"))
eval_sampler = SequentialSampler(eval_data)
validation_data_loader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size)

test_data = TensorDataset(*convert_examples_to_features(test_data, "Creating test samples"))
test_sampler = SequentialSampler(test_data)
test_data_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

Creating training samples: 100%|[34m██████████[39m| 52686/52686 [01:20<00:00, 657.87it/s]
Creating evaluation samples: 100%|[34m██████████[39m| 8469/8469 [00:11<00:00, 747.78it/s]
Creating test samples: 100%|[34m██████████[39m| 8939/8939 [00:11<00:00, 767.89it/s]


#####Evaluating Fine-Tuned Model's performance on Augmented PAWS Data before fine-tuning on this

In [40]:
print("Accuracy of the Fine-Tuned Model on Augmented PAWS datset before training on PAWS")
test_accuracy(model,test_data_loader)

Accuracy of the Fine-Tuned Model on Augmented PAWS datset before training on PAWS
Test Accuracy: 0.5020125223613596


#####Fine-Tuning on PAWS Dataset

In [None]:
epochs=3
model=train(epochs,model,train_data_loader,validation_data_loader,len(train_data),len(eval_data))

#####Checking the performance of the fully trained model

In [None]:
print("Accuracy of the Fine-Tuned Model on Augmented PAWS datset after training on Augmented PAWS")
test_accuracy(model,test_data_loader)

#####Saving the final fine-tuned model to G-Drive

In [45]:
directory = "/drive/My Drive/Dissertation/Bert_PAWS"
tokenizer_directory = "/drive/My Drive/Dissertation/Bert_PAWS/tokenizer"
if not os.path.exists(directory):
    os.makedirs(directory)
if not os.path.exists(tokenizer_directory):
    os.makedirs(tokenizer_directory)

# save vocabulary of the tokenizer
tokenizer.save_pretrained(tokenizer_directory)
# save the model weights and its configuration file
model.save_pretrained(directory)