In [None]:
#packages required
# !pip install transformers
# !pip install scikit-learn
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch import nn, optim
import copy
import random
import sklearn.metrics
import tqdm
import pickle
import pandas as pd
import numpy as np
import math

In [None]:
## Load the pre-processed training dataset
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point this at a file that comes from a trusted source.
with open('./processed_data_2_full.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

# The length of the first column is reported as the number of question pairs.
pair_count = len(data[0])
print("Original data has {} question pairs".format(pair_count))

In [None]:
##Train & Test split
# A private, seeded RNG keeps the partition identical after every kernel
# restart and leaves the global `random` state untouched.
SPLIT_SEED = 42
train_ratio = 0.8
num_classes = 2

size = len(data[0])
# Slice each column, not the outer container: slicing the container would drop
# whole columns (e.g. the labels) whenever size < len(data).
dataset = [column[:size] for column in data]

indices = list(range(size))
random.Random(SPLIT_SEED).shuffle(indices)
split_point = int(size * train_ratio)
train_indices = indices[:split_point]
test_indices = indices[split_point:]
train_dataset = [[column[j] for j in train_indices] for column in dataset]
test_dataset = [[column[j] for j in test_indices] for column in dataset]


def _join_tokens(token_lists):
    """Join the string tokens of each item with single spaces, one string per item."""
    return [" ".join(tokens) for tokens in token_lists]


# Column 0 and column 1 hold the two questions of each pair, column 2 the labels.
train_input_1 = _join_tokens(train_dataset[0])
train_input_2 = _join_tokens(train_dataset[1])
train_Y = train_dataset[2]
print(len(train_input_1), len(train_input_2), len(train_Y))

test_input_1 = _join_tokens(test_dataset[0])
test_input_2 = _join_tokens(test_dataset[1])
test_Y = test_dataset[2]
print(len(test_input_1), len(test_input_2), len(test_Y))

In [None]:
#Random shuffle the dataset
def shuffle_data(input_1, input_2, labels):
    """Reorder three parallel sequences with one shared random permutation.

    The permutation is drawn with the global ``random`` module over the
    positions of ``input_1``; the same positions are then read from
    ``input_2`` and ``labels`` so that items stay aligned. The inputs are
    not modified.

    Returns:
        A tuple ``(input_1, input_2, labels)`` of new lists in shuffled order.
    """
    order = list(range(len(input_1)))
    random.shuffle(order)
    reordered_1 = [input_1[pos] for pos in order]
    reordered_2 = [input_2[pos] for pos in order]
    reordered_labels = [labels[pos] for pos in order]
    return (reordered_1, reordered_2, reordered_labels)


In [None]:
# Model architecture for checking question similarity
## It leverages the DistilBERT model for feature extraction, followed by additional MLP (Multilayer Perceptron) blocks and a series of feed-forward transformations.
## The [CLS] token in models like DistilBERT or BERT is specifically designed to capture the overall representation of the entire input sequence. In tasks like question similarity, the [CLS] representations of the two input sequences are compared to make predictions about their similarity.
## MLP (Multilayer Perceptron) Block
class MLP(nn.Module):
    """Two-layer feed-forward network: expand, apply PReLU, project back.

    Input and output feature size is 768*2 (1536); the hidden layer is
    2*2*768 (3072) wide.
    """

    def __init__(self):
        super().__init__()
        width = 768 * 2
        hidden_width = 2 * 2 * 768
        self.c_fc = nn.Linear(width, hidden_width)
        self.prelu = nn.PReLU()
        self.c_proj = nn.Linear(hidden_width, width)

    def forward(self, x):
        """Return ``c_proj(prelu(c_fc(x)))``."""
        return self.c_proj(self.prelu(self.c_fc(x)))

## Residual Block with Layer Normalization
class Block(nn.Module):
    """Residual block computing ``x + MLP(LayerNorm(x))`` over 768*2 features."""

    def __init__(self):
        super().__init__()
        self.ln_1 = nn.LayerNorm(768 * 2)
        self.mlp = MLP()

    def forward(self, x):
        """Normalize ``x``, run it through the MLP, and add the result back onto ``x``."""
        normalized = self.ln_1(x)
        return x + self.mlp(normalized)

# Main Model
class SimilarityModelFineTuneBert(nn.Module):
    """Question-pair classifier on top of a fine-tunable DistilBERT encoder.

    Both questions are encoded by the same DistilBERT instance. The hidden
    state of the first token ([CLS]) of each sequence is taken, the two
    vectors are concatenated (768*2 features), passed through two residual
    ``Block`` layers and a three-layer feed-forward head, and returned as
    log-probabilities over two classes.

    Args:
        dropout_rate: Dropout probability applied after each of the first
            two layers of the feed-forward head.
    """

    def __init__(self, dropout_rate=0.25):
        super(SimilarityModelFineTuneBert, self).__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        # One Dropout module is shared by both hidden layers of the head.
        self.dropout = nn.Dropout(p=dropout_rate)
        self.after_tf = nn.ModuleList([Block() for _ in range(2)])
        self.feedforward_1 = nn.Linear(768*2, 768)
        self.non_lin_1 = nn.PReLU()
        self.feedforward_2 = nn.Linear(768, 300)
        self.non_lin_2 = nn.PReLU()
        self.feedforward_3 = nn.Linear(300, 2)

        self.log_softmax = nn.LogSoftmax(dim=1)

    def _encode_first_token(self, encoded_input, device):
        """Run the encoder and return the first-token hidden state of every sequence."""
        encoder_output = self.bert(
            encoded_input['input_ids'].to(device),
            encoded_input['attention_mask'].to(device),
        )
        return encoder_output.last_hidden_state[:, 0]

    def forward(self, encoded_input_1, encoded_input_2, train=False):
        """Return log-probabilities over the two classes for a batch of pairs.

        Args:
            encoded_input_1: Mapping with ``'input_ids'`` and
                ``'attention_mask'`` tensors for the first questions.
            encoded_input_2: Same mapping for the second questions.
            train: When truthy the module is switched to training mode
                (dropout active), otherwise to eval mode.

        Returns:
            Tensor of shape ``(batch, 2)`` holding log-probabilities.

        Note:
            The module's train/eval mode is set on every call as a side
            effect, so the ``train`` flag overrides any earlier
            ``.train()`` / ``.eval()`` call.
        """
        self.train(bool(train))

        device = next(self.parameters()).device

        ## extracting the [CLS] token representation (first token in the output sequence).
        pooler_output_1 = self._encode_first_token(encoded_input_1, device)
        pooler_output_2 = self._encode_first_token(encoded_input_2, device)

        concatenated_output = torch.cat([pooler_output_1, pooler_output_2], dim=1)

        for block in self.after_tf:
            concatenated_output = block(concatenated_output)

        concatenated_output = self.dropout(self.non_lin_1(self.feedforward_1(concatenated_output)))
        concatenated_output = self.dropout(self.non_lin_2(self.feedforward_2(concatenated_output)))

        return self.log_softmax(self.feedforward_3(concatenated_output))

    @torch.no_grad()
    def get_predictions(self, X_1, X_2, tokenizer, batch_size=8):
        """Predict the class index of every question pair.

        Args:
            X_1: List of first-question strings.
            X_2: List of second-question strings, aligned with ``X_1``.
            tokenizer: Callable tokenizer returning PyTorch tensors under
                ``'input_ids'`` and ``'attention_mask'``.
            batch_size: Number of pairs scored per forward pass.

        Returns:
            NumPy array with one argmax class index per pair.

        Raises:
            ValueError: If ``X_1`` and ``X_2`` differ in length.

        Note:
            Runs without gradient tracking and leaves the model in eval mode.
        """
        if len(X_1) != len(X_2):
            raise ValueError(
                "X_1 and X_2 must have the same length, got {} and {}".format(len(X_1), len(X_2))
            )

        # Imported explicitly: a bare `import tqdm` does not load its
        # submodules. tqdm.auto uses the notebook widget inside Jupyter and a
        # text bar elsewhere.
        from tqdm.auto import tqdm as progress_bar

        all_predictions = []
        for start in progress_bar(range(0, len(X_1), batch_size), leave=False):
            stop = start + batch_size
            encoded_input_1 = tokenizer(X_1[start:stop], return_tensors='pt', padding=True, truncation=True)
            encoded_input_2 = tokenizer(X_2[start:stop], return_tensors='pt', padding=True, truncation=True)
            # forward() moves the tensors to the model's device, so no copy is
            # made here. Calling the module (not .forward) keeps hooks working.
            log_probs = self(encoded_input_1, encoded_input_2, train=False)
            prediction_batch = torch.argmax(log_probs, dim=1)
            all_predictions.extend(prediction_batch.cpu().numpy())
        return np.array(all_predictions)

In [None]:
## Train function
def train(model, tokenizer, X_1, X_2, Y, learning_rate=0.01, batch_size=8, num_epochs=5, accumulation_steps=4):
    """Fine-tune ``model`` on question pairs using gradient accumulation.

    Each epoch reshuffles the data, runs mini-batches of ``batch_size``
    pairs, and applies one clipped AdamW step per group of
    ``accumulation_steps`` mini-batches. A trailing group with fewer
    mini-batches gets its own step and is averaged over its real size.

    Args:
        model: Module called as ``model(enc_1, enc_2, train=True)`` and
            returning per-class scores of shape ``(batch, num_classes)``.
        tokenizer: Callable tokenizer returning PyTorch tensors.
        X_1: List of first-question strings.
        X_2: List of second-question strings, aligned with ``X_1``.
        Y: Integer class labels, aligned with ``X_1``.
        learning_rate: AdamW learning rate.
        batch_size: Pairs per forward/backward pass.
        num_epochs: Number of passes over the data.
        accumulation_steps: Mini-batches accumulated per optimizer step.

    Raises:
        ValueError: If the inputs differ in length, are empty, or if
            ``batch_size`` / ``accumulation_steps`` is smaller than 1.
    """
    if not (len(X_1) == len(X_2) == len(Y)):
        raise ValueError(
            "X_1, X_2 and Y must have the same length, got {}, {} and {}".format(len(X_1), len(X_2), len(Y))
        )
    if len(X_1) == 0:
        raise ValueError("Cannot train on an empty dataset")
    if batch_size < 1 or accumulation_steps < 1:
        raise ValueError("batch_size and accumulation_steps must both be at least 1")

    # Imported explicitly: a bare `import tqdm` does not load its submodules.
    # tqdm.auto uses the notebook widget inside Jupyter and a text bar elsewhere.
    from tqdm.auto import tqdm as progress_bar

    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    # CrossEntropyLoss applies log-softmax internally. Applying log-softmax to
    # values that are already log-probabilities leaves them unchanged, so this
    # criterion is correct for models returning either logits or log-probs.
    criterion = nn.CrossEntropyLoss()
    device = next(model.parameters()).device
    num_batches = math.ceil(len(X_1) / batch_size)

    for epoch in range(num_epochs):
        total_loss = 0.0
        shuffled_input_1, shuffled_input_2, shuffled_labels = shuffle_data(X_1, X_2, Y)
        model.train()
        optimizer.zero_grad()

        for batch_index in progress_bar(range(num_batches), leave=False):
            start = batch_index * batch_size
            stop = start + batch_size
            input_1 = shuffled_input_1[start:stop]
            input_2 = shuffled_input_2[start:stop]
            encoded_input_1 = tokenizer(input_1, return_tensors='pt', padding=True, truncation=True)
            encoded_input_2 = tokenizer(input_2, return_tensors='pt', padding=True, truncation=True)

            encoded_input_1 = {k: v.to(device) for k, v in encoded_input_1.items()}
            encoded_input_2 = {k: v.to(device) for k, v in encoded_input_2.items()}
            labels = torch.tensor(shuffled_labels[start:stop]).to(device)

            log_probs = model(encoded_input_1, encoded_input_2, train=True)
            loss = criterion(log_probs, labels)
            total_loss += loss.item()

            # Average gradients over the number of mini-batches that really
            # belong to this accumulation group (the last group may be short).
            group_start = (batch_index // accumulation_steps) * accumulation_steps
            group_size = min(accumulation_steps, num_batches - group_start)
            (loss / group_size).backward()

            is_group_end = (batch_index + 1) % accumulation_steps == 0
            is_last_batch = batch_index + 1 == num_batches
            # Exactly one optimizer step per group, including a short final group.
            if is_group_end or is_last_batch:
                nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

    print("Training completed.")

In [None]:
def evaluate(Y, predictions):
    """Print accuracy, F1, precision, recall and the confusion matrix.

    Args:
        Y: Ground-truth labels.
        predictions: Predicted labels, aligned with ``Y``.
    """
    # (output template, sklearn metric) pairs, printed in this order.
    report = (
        ("Accuracy: {}", sklearn.metrics.accuracy_score),
        ("F1 score: {}", sklearn.metrics.f1_score),
        ("Precision: {}", sklearn.metrics.precision_score),
        ("Recall: {}", sklearn.metrics.recall_score),
        ("Confusion matrix: \n{}\n", sklearn.metrics.confusion_matrix),
    )
    for template, metric in report:
        print(template.format(metric(Y, predictions)))

In [None]:
## Training parameters
# Optimization schedule
num_epochs = 1            # passes over the training data
learning_rate = 3e-4      # optimizer learning rate

# Batching
batch_size = 8            # question pairs per forward/backward pass
accumulation_steps = 8    # mini-batches to accumulate per optimizer step (when passed to train)

# Regularization
dropout_rate = 0.15       # dropout probability intended for the model's feed-forward head

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
print("Training fine tune model")
# Pass the configured dropout explicitly; without it the constructor default
# (0.25) would be used and the `dropout_rate` setting would have no effect.
fine_tune_model = SimilarityModelFineTuneBert(dropout_rate=dropout_rate)
# Use a GPU when one is available. Training and prediction read the device
# from the model's parameters, so moving the model here is sufficient.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fine_tune_model = fine_tune_model.to(device)

In [None]:
## Model parameter count
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total
# Display the number of trainable parameters in the fine-tuning model.
count_parameters(fine_tune_model)

In [None]:
# Number of leading training pairs used for this run; slicing past the end of
# a shorter list simply uses every available pair.
TRAIN_SUBSET_SIZE = 10000
train(
    fine_tune_model,
    tokenizer,
    train_input_1[:TRAIN_SUBSET_SIZE],
    train_input_2[:TRAIN_SUBSET_SIZE],
    train_Y[:TRAIN_SUBSET_SIZE],
    learning_rate=learning_rate,
    num_epochs=num_epochs,
    batch_size=batch_size,
    # Forward the configured value; otherwise train() falls back to its default of 4.
    accumulation_steps=accumulation_steps,
)

In [None]:
%%time
# Smoke test: time a prediction for a single question pair. The cell output is
# the array of predicted class indices (one entry for the one pair).
fine_tune_model.get_predictions(["what is the earning of Google in 2024?"], ["what is the revenue of Google in 2024?"],tokenizer, batch_size=batch_size)