In [None]:
## Importing libraries
import pandas as pd
import torch
from torch.utils.data import DataLoader,TensorDataset
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer,BertForSequenceClassification,AdamW
import warnings
warnings.filterwarnings('ignore')

In [None]:
## Loading the training data
data=pd.read_csv('/kaggle/input/evalbot/training.csv')

In [None]:
## Dimensions of data
data.shape

(99887, 3)

In [None]:
## First 5 rows of data
data.head()

Unnamed: 0,similarity,sentence1,sentence2
0,1,person horse jumps broken airplane,person training horse competition
1,0,person horse jumps broken airplane,person diner ordering omelette
2,2,person horse jumps broken airplane,person outdoors horse
3,1,children smiling waving camera,smiling parents
4,2,children smiling waving camera,children present


In [None]:
## Basic information of data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99887 entries, 0 to 99886
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   similarity  99887 non-null  int64 
 1   sentence1   99887 non-null  object
 2   sentence2   99884 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


In [None]:
## Drop null values
data=data.dropna()

In [None]:
## Check for null values
data.isnull().sum()

similarity    0
sentence1     0
sentence2     0
dtype: int64

In [None]:
## Forming pairs of sentences
training_data=[(row['sentence1'],row['sentence2']) for index,row in data.iterrows()]

In [None]:
## Length of training pairs
len(training_data)

99884

In [22]:
training_data[0]

('person horse jumps broken airplane', 'person training horse competition')

In [None]:
## Load BERT Tokenizer and Model
model_name='bert-base-uncased'
tokenizer=BertTokenizer.from_pretrained(model_name)
model=BertForSequenceClassification.from_pretrained(model_name,num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
## Tokenization
def tokenization(sent1,sent2):
  encoded=tokenizer.encode_plus(
      sent1,sent2,
      add_special_tokens=True,
      padding=True,
      truncation=True,
      return_tensors='pt'
  )
  input_ids=encoded['input_ids']
  attention_masks=encoded['attention_mask']
  return input_ids,attention_masks

input_ids=[]
attention_masks=[]
for sent1,sent2 in training_data:
  ids,masks=tokenization(sent1,sent2)
  input_ids.append(ids[0])
  attention_masks.append(masks[0])

input_ids=pad_sequence(input_ids,batch_first=True)
attention_masks=pad_sequence(attention_masks,batch_first=True)
similarity_tensor=torch.tensor(data['similarity'].values)
dataset=TensorDataset(input_ids,attention_masks,similarity_tensor)
dataloader=DataLoader(dataset,batch_size=32)

In [None]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
## Initialize hyper-parameters
optimizer1=torch.optim.AdamW(model.parameters(),lr=1e-5)
optimizer2=torch.optim.AdamW(model.parameters(),lr=2e-5)
optimizer3=torch.optim.AdamW(model.parameters(),lr=5e-5)
epochs=10

In [None]:
## Fine-tuning of model with optimizer 1
for epoch in range(epochs):
  model.train()
  total_loss=0
  for batch in dataloader:
    input_ids,attention_masks,labels=batch
    input_ids,attention_masks,labels=input_ids.to(device),attention_masks.to(device),labels.to(device)
    outputs=model(input_ids,attention_mask=attention_masks,labels=labels)
    loss=outputs.loss
    total_loss+=loss.item()
    optimizer1.zero_grad()
    loss.backward()
    optimizer1.step()
  loss=total_loss/len(dataloader)
  print('Epoch : ',epoch+1,'----> Loss : ',loss)

Epoch :  1 ----> Loss :  0.6337909205153842
Epoch :  2 ----> Loss :  0.4555837016368928
Epoch :  3 ----> Loss :  0.3649554422565525
Epoch :  4 ----> Loss :  0.2931244809641088
Epoch :  5 ----> Loss :  0.2363479535860856
Epoch :  6 ----> Loss :  0.1943885262740548
Epoch :  7 ----> Loss :  0.16114755553264387
Epoch :  8 ----> Loss :  0.13830374421379998
Epoch :  9 ----> Loss :  0.11543103792238682
Epoch :  10 ----> Loss :  0.10421650026746519


In [None]:
## Save fine-tuned model
model.save_pretrained('/kaggle/working/fine-tuned-bert1')

In [None]:
## Fine-tuning of model with optimizer 2
for epoch in range(epochs):
  model.train()
  total_loss=0
  for batch in dataloader:
    input_ids,attention_masks,labels=batch
    input_ids,attention_masks,labels=input_ids.to(device),attention_masks.to(device),labels.to(device)
    outputs=model(input_ids,attention_mask=attention_masks,labels=labels)
    loss=outputs.loss
    total_loss+=loss.item()
    optimizer2.zero_grad()
    loss.backward()
    optimizer2.step()
  loss=total_loss/len(dataloader)
  print('Epoch : ',epoch+1,'----> Loss : ',loss)

Epoch :  1 ----> Loss :  0.5814655830535959
Epoch :  2 ----> Loss :  0.4049869488559611
Epoch :  3 ----> Loss :  0.2976359235913917
Epoch :  4 ----> Loss :  0.2200380824515232
Epoch :  5 ----> Loss :  0.17517459309170075
Epoch :  6 ----> Loss :  0.14198081307682942
Epoch :  7 ----> Loss :  0.11953141586154026
Epoch :  8 ----> Loss :  0.09930353646065496
Epoch :  9 ----> Loss :  0.08608571486547589
Epoch :  10 ----> Loss :  0.07473612968541574


In [None]:
## Save fine-tuned model
model.save_pretrained('/kaggle/working/fine-tuned-bert2')

In [None]:
## Fine-tuning of model with optimizer 3
for epoch in range(epochs):
  model.train()
  total_loss=0
  for batch in dataloader:
    input_ids,attention_masks,labels=batch
    input_ids,attention_masks,labels=input_ids.to(device),attention_masks.to(device),labels.to(device)
    outputs=model(input_ids,attention_mask=attention_masks,labels=labels)
    loss=outputs.loss
    total_loss+=loss.item()
    optimizer3.zero_grad()
    loss.backward()
    optimizer3.step()
  loss=total_loss/len(dataloader)
  print('Epoch : ',epoch+1,'----> Loss : ',loss)

Epoch :  1 ----> Loss :  0.5851726678464368
Epoch :  2 ----> Loss :  0.40096190740986176
Epoch :  3 ----> Loss :  0.2900092475400244
Epoch :  4 ----> Loss :  0.22516892129354882
Epoch :  5 ----> Loss :  0.18167134091791426
Epoch :  6 ----> Loss :  0.15366139025400552
Epoch :  7 ----> Loss :  0.13023175942710935
Epoch :  8 ----> Loss :  0.11334886096340642
Epoch :  9 ----> Loss :  0.10210506791833007
Epoch :  10 ----> Loss :  0.08756085024920783


In [None]:
## Save fine-tuned model
model.save_pretrained('/kaggle/working/fine-tuned-bert3')