In [1]:
!pip install datasets transformers



In [4]:
import os
import os.path as osp
import nltk
import random
# nltk.download('stopwords')
# nltk.download('punkt')
# from nltk.corpus import stopwords
# english_stopwords = stopwords.words("english")
import numpy as np
import re
import seaborn as sns
sns.set_theme(style="whitegrid")
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
import pandas as pd
import string
import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F

import datasets
from datasets import load_dataset
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertModel
import multiprocessing
import time
from torch.utils.data import DataLoader, Dataset 
import sys
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [5]:
test_file = "dev_data.csv"
val_file = "val_data.csv"
train_file = "train_data.csv"

In [6]:
pretrained_model = "bert-base-uncased"
batch_size = 128
max_para_length = 128

# Check if cuda is available and set device
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

# Make sure you choose suitable num_worker, otherwise it will result in errors
num_workers = 8 if cuda else 0

print("Cuda = ", str(cuda), " with num_workers = ", str(num_workers),  " system version = ", sys.version)

Cuda =  True  with num_workers =  8  system version =  3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:56:21) 
[GCC 10.3.0]


In [7]:
class ContextEmbeddingDataset(Dataset):
    def __init__(self, csv_file, context, pretrained_model):
      df = pd.read_csv(csv_file)

      self.context = context
      self.tokenizer = BertTokenizer.from_pretrained(pretrained_model, do_lower=True)    

      # Tokenize the paragraphs
      self.df = df["para"].apply(self.preprocess)
      self.y = df['label']
  
     
    def preprocess(self, examples):
      return self.tokenizer(examples, truncation=True, 
                     padding="max_length", max_length=max_para_length,
                     return_token_type_ids=False)['input_ids']

    def __len__(self):
      return len(self.y) - (2*self.context)
    
    def __getitem__(self,index):
      return torch.LongTensor(list(self.df[index:(index + 2*self.context+1)])), self.y[index+self.context]
      
      # self.embed_model.eval()
      # Generate BERT embeddings for the tokens in each para
      # with torch.no_grad():
      #   x = torch.LongTensor(list(self.df[index:(index + 2*self.context+1)])).to(device)
      #   print(x.shape)    
      #   outputs = self.embed_model(x)
      #   print(outputs.shape) # (3, tokens(128), input_dim(3072))
       
      # return outputs.cpu(), self.y[index+self.context]

In [9]:
val_data = ContextEmbeddingDataset(val_file, context = 1, pretrained_model = pretrained_model)
train_data = ContextEmbeddingDataset(train_file, context = 1, pretrained_model = pretrained_model)
test_data = ContextEmbeddingDataset(test_file, context = 1, pretrained_model = pretrained_model)

train_args = dict(shuffle=True, batch_size=batch_size, num_workers=8, pin_memory=True, drop_last=False) if cuda else dict(shuffle=True, batch_size=batch_size, drop_last=False)
train_loader = DataLoader(train_data, **train_args)

val_args = dict(shuffle=False, batch_size=batch_size, num_workers=8, pin_memory=True, drop_last=False) if cuda else dict(shuffle=False, batch_size=batch_size, drop_last=False)
val_loader = DataLoader(val_data, **val_args)


test_args = dict(shuffle=False, batch_size=batch_size, num_workers=8, pin_memory=True, drop_last=False) if cuda else dict(shuffle=False, batch_size=batch_size, drop_last=False)
test_loader = DataLoader(test_data, **test_args)

## Fixed Bert word Embeddings, BiLSTM encoder, Triplet Decoder

In [10]:
class BertEmbedding(nn.Module):
  def __init__(self, pretrained_model):
    super().__init__()
    self.model = BertModel.from_pretrained(pretrained_model, output_hidden_states = True)
    # for param in self.model.bert.parameters():
    #   param.requires_grad = False
    # print(sum(p.numel() for p in self.model.parameters()))

  def forward(self, x):
    # print("Input to BertEmbedding: ", x.shape)
    outputs = self.model(x)
    hidden_states = outputs[2]
    embedding = torch.cat((hidden_states[-1],hidden_states[-2],hidden_states[-3],hidden_states[-4]), dim = 2)
    # print("Output from BertEmbedding: ", embedding.shape)
    return embedding

class ParaEncoderForContext(nn.Module):
  def __init__(self, bilayers = 1, input_dim = 3072, hidden_size = 512):
    super().__init__()
    self.input_dim = input_dim
    self.hidden_dim = hidden_size
    self.lstm = nn.LSTM(
            input_size=input_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=True, bidirectional=True)
    
    for name, param in self.lstm.named_parameters():
      if 'bias' in name:
        nn.init.constant(param, 0.0)
      elif 'weight' in name:
        nn.init.orthogonal(param)
     

  def forward(self, x): # (B*T(T=1+2*context), tokens, input_dim)
    # print("Input to Encoder: ",x.shape)
    outputs, _ = self.lstm(x) # (B*T, tokens, 2*hidden_dim)
    # print("After LSTM: ", outputs.shape)
    first = outputs[:, 0, self.hidden_dim:]
    second = outputs[:, -1, :self.hidden_dim]
    para_embed = torch.cat((second,first), dim = 1) #(B*T, 2*hidden_dim)

    # print("Output from Encoder", para_embed.shape)
    return para_embed #(B*T, 2*hidden_dim)



class ParaDecoderTriplet(nn.Module):
  def __init__(self, input_size, output_size = 1):
    super().__init__()
    self.linear = nn.Linear(input_size, 1, bias= True);
    # self.layers = nn.Sequential(nn.Linear(input_size, output_size, bias = True), 
    #                             nn.BatchNorm1d(output_size), 
    #                             nn.ReLU(inplace = True), 
    #                             nn.Linear(output_size, 1, bias = True))
    
    # for mod in self.modules():
    #   if isinstance(mod, nn.BatchNorm1d):
    #     nn.init.constant_(mod.weight.data, 1)
    #     if(mod.bias is not None):
    #       nn.init.constant_(mod.bias.data, 0)

  def forward(self, x): # #(B, T, 2*hidden_dim)
    # print("Input to decoder: ", x.shape) 
    s0,s1,s2 = x.shape
    x = x.reshape(s0,-1) #concat main and context para embeddings
    # print("Input to linear layer in decoder:", xv.shape) 
    return self.linear(x) #(B,1)

class EncoderDecoderTriplet(nn.Module):
  def __init__(self, embed_model, decoder_output_size = 1, encoder_bilayers = 1, encoder_input_dim = 3072, encoder_hidden_size = 512, context = 1):
    super().__init__()
    self.para_encoder = ParaEncoderForContext(bilayers = encoder_bilayers, input_dim = encoder_input_dim, hidden_size = encoder_hidden_size)
    self.para_decoder = ParaDecoderTriplet(input_size = encoder_hidden_size*2*(1+2*context))
    self.embed_model = embed_model
    #freeze bert embedding layer
    for param in self.embed_model.parameters():
      param.requires_grad = False

  def forward(self, x): # (B, 2*context+1, tokens_per_para)
    # print("Input to model: ", x.shape)
    s0, s1, s2 = x.shape
    xv = x.view(s0*s1, s2)
    embeds = self.embed_model(xv)
    para_vec = self.para_encoder(embeds)
    pvv = para_vec.view(s0, s1, -1)
    # print("Input to decoder: ", pvv.shape)
    return self.para_decoder(pvv)

## Train and Validate Functions

In [11]:
def train(para_model, data_loader):
  para_model.train()

  avg_loss = []
  all_predictions = []
  all_targets = []
  start = time.time()

  for i, (x, y) in enumerate(tqdm(data_loader, desc="Epoch", leave=False)):
    optimizer.zero_grad()
    y  = y.to(device) 
    x = x.to(device)
 
    output = para_model(x)

    # print("Output from model: ", output.shape)  

    loss = criterion(torch.squeeze(output), y.float())
    avg_loss.extend([loss.item()]*len(y))

    output = nn.Sigmoid()(output)

    all_predictions.extend((output >= 0.5).tolist())
    all_targets.extend(y.tolist())

    loss.backward()
    optimizer.step()
    scheduler.step()
    
    
  end = time.time()
  avg_loss = np.mean(avg_loss)
  print('learning_rate: {}'.format(scheduler.get_last_lr()))
  print('Training loss: {:.2f}, Time: {}'.format(avg_loss, end-start))
  
  all_predictions = np.array(all_predictions)
  all_targets = np.array(all_targets)
  scores = precision_recall_fscore_support(all_targets, all_predictions, 
                                            average="weighted", zero_division=0.)
  
  test_scores={
      "eval_accuracy": (all_predictions == all_targets).sum() / len(all_predictions),
      "eval_precision": scores[0],
      "eval_recall": scores[1],
      "eval_f-1": scores[2]
  }
  print(test_scores)
  return test_scores["eval_f-1"]


In [25]:
def validate(para_model, data_loader):
  para_model.eval()
  
  avg_loss = []
  all_predictions = []
  all_targets = []
  start = time.time()

  for i, (x, y) in enumerate(tqdm(data_loader, desc="Epoch", leave=False)):
    # optimizer.zero_grad()

    y = y.to(device)
    x = x.to(device)

    with torch.no_grad():
      output = para_model(x)

      loss = criterion(torch.squeeze(output), y.float())
      avg_loss.extend([loss.item()]*len(y))

      output = nn.Sigmoid()(output)

      all_predictions.extend((output >= 0.5).tolist())
      all_targets.extend(y.tolist())
    

    
  end = time.time()
  avg_loss = np.mean(avg_loss)
  print('learning_rate: {}'.format(scheduler.get_last_lr()))
  print('Validation loss: {:.2f}, Time: {}'.format(avg_loss, end-start))
  
  all_predictions = np.array(all_predictions)
  all_targets = np.array(all_targets)
  scores = precision_recall_fscore_support(all_targets, all_predictions, 
                                            average="weighted", zero_division=0.)
  
  test_scores={
      "eval_accuracy": (all_predictions == all_targets).sum() / len(all_predictions),
      "eval_precision": scores[0],
      "eval_recall": scores[1],
      "eval_f-1": scores[2]
  }
  print(test_scores)
  return test_scores["eval_f-1"], all_predictions


In [26]:
def save(model, acc, best=""):
    if not os.path.exists('./bert_base_triplet/'):
        os.mkdir('./bert_base_triplet/')

    torch.save(model.state_dict(), './bert_base_triplet/'+'/{}model_params_{}.pth'.format(best, acc))
    

## Main

In [27]:
model = EncoderDecoderTriplet(embed_model = BertEmbedding(pretrained_model))
model.load_state_dict(torch.load('./bert_base_triplet/model_model_params_0.9463038575553862.pth'))
model = model.to(device)

total_params = sum(p.numel() for p in model.parameters())
trainable_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
non_trainable_total_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
print("Total params: ", total_params)
print("Trainable params: ", trainable_total_params)
print("Non Trainable params: ", non_trainable_total_params)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Total params:  124173569
Trainable params:  14691329
Non Trainable params:  109482240


In [16]:
epochs = 20
lamda = 1e-4  #L2 regularization
learning_rate = 0.01

criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=lamda)
# optimizer.load_state_dict(torch.load('./bert_base_triplet/model_model_params_0.9463038575553862.pth'))    

# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[i for i in range(4,20,4)], gamma=0.75)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(len(train_loader) * epochs))

In [18]:
best_val_f1 = 0
for epoch in range(epochs):
  print('Epoch #{}'.format(epoch+1))
  
  train_f1 = train(model, train_loader)
  val_f1 = validate(model, val_loader)
  
  if val_f1 > best_val_f1:
    best_val_f1 = val_f1
    save(model, best_val_f1, best = "model_")
    save(optimizer, best_val_f1, best = "optimizer_")

  

Epoch #1


                                                                                                                        

learning_rate: [0.009938441702975686]
Training loss: 0.14, Time: 1282.683581352234
{'eval_accuracy': 27794.043785214853, 'eval_precision': 0.9436408794746415, 'eval_recall': 0.9438372592944584, 'eval_f-1': 0.9437294348809004}


                                                                                                                        

learning_rate: [0.009938441702975686]
Validation loss: 0.22, Time: 218.57729744911194
{'eval_accuracy': 5455.185137732223, 'eval_precision': 0.9216137421868582, 'eval_recall': 0.9143711296177663, 'eval_f-1': 0.9161539592897534}
Epoch #2


                                                                                                                        

learning_rate: [0.00024471741852423256]
Training loss: 0.06, Time: 1355.2685902118683
{'eval_accuracy': 27710.720521349536, 'eval_precision': 0.9812671260994843, 'eval_recall': 0.981241373068133, 'eval_f-1': 0.9812531334618015}


                                                                                                                        

learning_rate: [0.00024471741852423256]
Validation loss: 0.20, Time: 200.84149169921875
{'eval_accuracy': 5589.968182788811, 'eval_precision': 0.9431553240609312, 'eval_recall': 0.9422378816997651, 'eval_f-1': 0.942571255932125}
Epoch #19


                                                                                                                        

learning_rate: [6.155829702431176e-05]
Training loss: 0.05, Time: 1326.984935760498
{'eval_accuracy': 27702.13570022402, 'eval_precision': 0.9834701128404973, 'eval_recall': 0.9834362908152875, 'eval_f-1': 0.9834509620053837}


                                                                                                                        

learning_rate: [6.155829702431176e-05]
Validation loss: 0.20, Time: 217.57310342788696
{'eval_accuracy': 5588.615844544096, 'eval_precision': 0.9447732479954126, 'eval_recall': 0.9438394191757421, 'eval_f-1': 0.9441730126427105}
Epoch #20


                                                                                                                        

learning_rate: [0.0]
Training loss: 0.05, Time: 1324.8539867401123
{'eval_accuracy': 27706.680605525762, 'eval_precision': 0.9846166028923713, 'eval_recall': 0.9845903197338945, 'eval_f-1': 0.9846018586348805}


                                                                                                                        

learning_rate: [0.0]
Validation loss: 0.20, Time: 216.82944703102112
{'eval_accuracy': 5585.911168054666, 'eval_precision': 0.9439990602925357, 'eval_recall': 0.942985265855221, 'eval_f-1': 0.9433430824981578}


In [17]:
# Test on Test Set

In [28]:
_, predictions = validate(model, test_loader)

                                                                                                                

learning_rate: [0.01]
Validation loss: 0.25, Time: 283.2222058773041
{'eval_accuracy': 8525.18222084078, 'eval_precision': 0.912120269830713, 'eval_recall': 0.9139793301732846, 'eval_f-1': 0.9113673431309153}


In [34]:
p = predictions.reshape(-1)

In [35]:
temp = predictions
print(len(predictions))
predictions = np.concatenate([[0], p, [0]])
print(len(predictions))

12869
12871


In [36]:
test_df = pd.read_csv(test_file)
test_df['predictions'] = predictions
error = test_df[test_df['label'] != test_df['predictions']]
print((len(test_df)- len(error)) / len(test_df))

0.9139926967601585


In [None]:
print(len(error))
error.to_csv("errors_bert_embed_triplet_binary.csv")

In [None]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [None]:
error