<a href="https://colab.research.google.com/github/architb1703/Toxic_Span/blob/archit/BERT_CRF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Library installs
#Each line below starts with "!" and is not Python; it is a pip command issued from the notebook cell.

#transformers is pinned to an exact release (2.6.0); the other packages are unpinned
!pip install transformers==2.6.0
#seqeval supplies the accuracy_score imported further down
!pip install seqeval
#urllib3 is upgraded to whatever the latest release is
!pip install urllib3 --upgrade
#pytorch-crf supplies the torchcrf module (CRF layer) imported further down
!pip install pytorch-crf

In [None]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm, trange

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchtext import data
from torchcrf import CRF
import transformers
from transformers import BertTokenizer, BertConfig, BertForTokenClassification, AdamW, BertModel
from transformers import get_linear_schedule_with_warmup

from keras.preprocessing.sequence import pad_sequences
from seqeval.metrics import accuracy_score
from sklearn.metrics import f1_score, confusion_matrix

#Seed the NumPy and PyTorch random number generators with the same fixed value
np.random.seed(42)
torch.manual_seed(42)

#Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')



In [None]:
#Load data
#train.pkl / val.pkl are un-pickled as-is; later cells index the results with
#'token_final' and 'target_final'.
#NOTE(review): pickle.load executes arbitrary code embedded in the file - only point
#these paths at files you produced yourself.
train_path = '/content/drive/My Drive/ToxicSpan_CS669V/BERT_Preprocess/train.pkl'
val_path = '/content/drive/My Drive/ToxicSpan_CS669V/BERT_Preprocess/val.pkl'

#The with-statement closes the file on exit, so no explicit f.close() is needed
with open(train_path, 'rb') as f:
  train_data = pickle.load(f)

with open(val_path, 'rb') as f:
  val_data = pickle.load(f)

In [None]:
#Pull the token sequences (X) and their per-token targets (Y) out of each split
X_train, Y_train = train_data['token_final'], train_data['target_final']
X_val, Y_val = val_data['token_final'], val_data['target_final']

In [None]:
#Every sequence is padded / truncated to MAX_LEN positions
MAX_LEN = 500
#Number of sequences per batch in both dataloaders
BATCH_SIZE = 8
#Tag name -> integer id: '0' -> 0, '1' -> 1, '[PAD]' -> 2
CLASSES = dict(zip(('0', '1', '[PAD]'), range(3)))

In [None]:
#Name of the pretrained checkpoint; reused later when the model itself is built
bert_model = 'bert-base-cased'

#Cased checkpoint, so the tokenizer is told not to lower-case its input
tokenizer = BertTokenizer.from_pretrained(
    bert_model,
    do_lower_case=False,
)

In [None]:
def tokenize_bert(x, y):
  """Split every word of one sentence into sub-tokens and spread its label over them.

  x : iterable of words, y : iterable of per-word labels (paired with zip, so the
  shorter of the two decides how many words are processed).

  Returns a tuple (sub_tokens, labels). labels has two more entries than sub_tokens:
  a leading and a trailing 0 - presumably reserved for the [CLS] / [SEP] positions
  that tokenizer.encode adds later; verify against the padding cell.
  """
  pieces = []
  piece_labels = [0]
  for word, label in zip(x, y):
    word_pieces = tokenizer.tokenize(word)
    pieces += word_pieces
    #Every sub-token inherits the label of the word it came from
    piece_labels += [label] * len(word_pieces)
  piece_labels.append(0)
  return pieces, piece_labels

In [None]:
#Tokenize the data using bert tokenizer which is based on WordPiece tokenization
#Each sentence / label pair is replaced in place by its sub-token version
len_train = len(X_train)
len_val = len(X_val)

for i in range(len_train):
  tokens, tags = tokenize_bert(X_train[i], Y_train[i])
  X_train[i] = tokens
  Y_train[i] = tags

for i in range(len_val):
  tokens, tags = tokenize_bert(X_val[i], Y_val[i])
  X_val[i] = tokens
  Y_val[i] = tags

In [None]:
#Pad the input data so that we can deal with them as tensors
#Token ids and labels are both padded and truncated at the end ('post') to MAX_LEN,
#using 0 as the fill value in both cases.
pad_options = dict(maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')

#tokenizer.encode is given the already-split sub-token lists
train_token_ids = [tokenizer.encode(text) for text in X_train]
val_token_ids = [tokenizer.encode(text) for text in X_val]

X_train_id = pad_sequences(train_token_ids, value=0.0, **pad_options)
Y_train_id = pad_sequences(Y_train, value=0, **pad_options)
X_val_id = pad_sequences(val_token_ids, value=0.0, **pad_options)
Y_val_id = pad_sequences(Y_val, value=0, **pad_options)

In [None]:
def get_attention_mask(x):
  """Return, for every sequence in x, a list flagging which entries are non-zero.

  x is an iterable of id sequences; the result is a list of lists with one
  `id != 0` comparison result per position (so zero-valued positions are falsy).
  """
  masks = []
  for text in x:
    masks.append([token_id != 0 for token_id in text])
  return masks

In [None]:
#Generating masks
#A position is attended to exactly when its padded token id is non-zero
attention_mask_train, attention_mask_val = (
    get_attention_mask(X_train_id),
    get_attention_mask(X_val_id),
)

In [None]:
#Convert the padded arrays and the masks to torch tensors, one split at a time
X_train_id, Y_train_id = torch.tensor(X_train_id), torch.tensor(Y_train_id)
attention_mask_train = torch.tensor(attention_mask_train)

X_val_id, Y_val_id = torch.tensor(X_val_id), torch.tensor(Y_val_id)
attention_mask_val = torch.tensor(attention_mask_val)

In [None]:
#Creating dataloaders for train and val data, this will allow us to easily get batches during training
#Each batch is a (token ids, attention mask, labels) triple of BATCH_SIZE rows.

#Training batches are drawn in random order
train_data = TensorDataset(X_train_id, attention_mask_train, Y_train_id)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

#Validation is read in a fixed order: shuffling cannot change the aggregated metrics,
#and a random sampler would needlessly draw from the global RNG on every evaluation pass
val_data = TensorDataset(X_val_id, attention_mask_val, Y_val_id)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)

In [None]:
#Defining model for BERT-CRF and BERT-BiLSTM-CRF
#Parameters to class : 
#     bert_model : Name of pretrained bert model to use
#     num_labels : No. of classes
#     bilstm : Whether to include the bilstm layer after bert model

class BertCRFModel(nn.Module):
  """BERT encoder followed by a linear (or BiLSTM + linear) projection and a CRF.

  The constructor flag `bilstm` decides which layers exist; the `bilstm` argument
  of forward() decides which path is executed. The two must agree: the `fc` layer
  is sized for exactly one of the two paths.
  """

  def __init__(self, bert_model, num_labels, bilstm):
    super(BertCRFModel, self).__init__()
    self.bert_model = bert_model
    self.num_labels = num_labels
    self.bert = BertModel.from_pretrained(self.bert_model, output_attentions=False, output_hidden_states=False)
    self.crf = CRF(self.num_labels, batch_first=True)
    self.dropout = nn.Dropout(0.7)
    if(bilstm):
      #Bidirectional LSTM with num_labels hidden units per direction, hence the 2x input size of fc
      self.bilstm = nn.LSTM(self.bert.config.hidden_size, self.num_labels, bidirectional=True, num_layers=1, batch_first=True)
      self.fc = nn.Linear(self.num_labels*2, self.num_labels)
    else:
      self.fc = nn.Linear(self.bert.config.hidden_size, self.num_labels)
    
  def forward(self, inputs, masks, labels=None, bilstm=False):
    """Run the model on one batch.

    inputs : padded token ids, indexed (batch, position)
    masks  : attention mask of the same layout, truthy on real tokens
    labels : optional gold tags; when given the CRF loss is returned as well
    bilstm : execute the BiLSTM path (must match the constructor flag)

    Returns `preds` when labels is None, otherwise `(loss, preds)`. `preds` is the
    output of CRF.decode; `loss` is the negated CRF score with reduction='mean'.
    """
    outputs = self.bert(inputs, attention_mask=masks)
    seq_out = outputs[0]
    
    if(not bilstm):
      #Dropout has to act on the features that feed fc; applying it after fc
      #(and discarding the result) left it without any effect
      seq_out = self.dropout(seq_out)
      x = self.fc(seq_out)
    else:
      #Number of real tokens per row, as the CPU int64 tensor pack_padded_sequence expects
      lengths = masks.long().sum(dim=1).cpu()
      packed = nn.utils.rnn.pack_padded_sequence(seq_out, lengths, batch_first=True, enforce_sorted=False)
      x, _ = self.bilstm(packed)
      #Pad back to the width of this batch instead of a hard-coded length
      x, _ = nn.utils.rnn.pad_packed_sequence(x, total_length=inputs.size(1), batch_first=True)
      x = self.fc(x)

    masks = masks.type(torch.uint8)
    preds = self.crf.decode(x, mask=masks)
    if(labels is None):
      return preds

    loss = -self.crf(F.log_softmax(x, dim=2), labels, mask=masks, reduction='mean')
    return loss, preds

In [None]:
#Build the variant without the BiLSTM layer, with two output tags
NUM_LABELS = 2
model = BertCRFModel(bert_model, NUM_LABELS, bilstm=False)
# model = torch.load('/content/drive/My Drive/model5.pt')

#Move the parameters to the GPU when one is available (no-op on a CPU-only runtime)
model = model.to(device)

In [None]:
#Initialize AdamW optimizer with weight decay for regularization
#Two optimizers are prepared:
#   optimizer         : updates every parameter (full fine-tuning)
#   optimizer_initial : updates only parameters whose name does not contain 'bert'
param_optimizer = list(model.named_parameters())

#Biases and LayerNorm weights are excluded from weight decay.
#The per-group key has to be 'weight_decay': that is the name AdamW reads, any other
#key (e.g. 'weight_decay_rate') is silently ignored and leaves the decay at its default.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8)

optimizer_grouped_parameters_no_bert = [
    {'params': [p for n, p in param_optimizer if 'bert' not in n], 'weight_decay': 0.01},
]
optimizer_initial = AdamW(optimizer_grouped_parameters_no_bert, lr=3e-5, eps=1e-8)

In [None]:
#Initialize scheduler to perform learning rate decay
epochs = 10
#Upper bound on the gradient norm, used for clipping in the training loop
max_grad_norm = 1.0

#One scheduler step per training batch, over all epochs
total_steps = epochs * len(train_dataloader)

#Both schedules are linear with no warm-up steps and span the full run
schedule_options = {'num_warmup_steps': 0, 'num_training_steps': total_steps}

#scheduler drives optimizer_initial, scheduler_2 drives optimizer
scheduler = get_linear_schedule_with_warmup(optimizer_initial, **schedule_options)
scheduler_2 = get_linear_schedule_with_warmup(optimizer, **schedule_options)

In [None]:
#Lowest validation loss seen so far; starts at +inf so that the first epoch always
#counts as an improvement, however large its loss is
l = float('inf')

In [None]:
#Code for training model and evaluating on validation data
#Every epoch runs one optimisation pass over train_dataloader followed by a no-grad
#pass over val_dataloader. Mean loss, accuracy and F1 of both passes are appended to
#the history lists below, and the whole model is saved once per epoch.

train_loss, val_loss = [], []
train_acc, val_acc = [], []
train_f1, val_f1 = [], []

for epoch in trange(epochs, desc = 'Epoch'):
  model.train()
  t_loss = 0
  predictions, true_labels = [], []
  for batch in train_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_id, b_input_mask, b_labels = batch
    model.zero_grad()
    loss, preds = model(b_input_id, b_input_mask, b_labels, False)
    loss.backward()
    t_loss += loss.item()
    torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
    
    #Used to perform the first round of training in which the bert model is frozen the rest of the model is trained to convergence
    # optimizer_initial.step()
    # scheduler.step()
     
    #Used to perform the second round of training in which the bert model is unfrozen and whole model is finetuned
    optimizer.step()
    scheduler_2.step()
    
    #preds holds one tag list per sequence, covering only its non-padded positions
    predictions.extend(preds)
    #Gold tags at the same (non-zero token id) positions, gathered in row-major order
    #with a single tensor index instead of a per-token Python loop
    true_labels.extend(b_labels[b_input_id != 0].tolist())
    
  epoch_train_loss = t_loss/len(train_dataloader)
  print(f"Train Loss : {epoch_train_loss}")
  train_loss.append(epoch_train_loss)
  pred_tags = [p_i for p in predictions for p_i in p]
  valid_tags = true_labels
  #Metrics are computed once and take (y_true, y_pred) in that order
  epoch_train_acc = accuracy_score(valid_tags, pred_tags)
  epoch_train_f1 = f1_score(valid_tags, pred_tags)
  train_acc.append(epoch_train_acc)
  train_f1.append(epoch_train_f1)
  print("Train Accuracy: {}".format(epoch_train_acc))
  print("Train F1-Score: {}".format(epoch_train_f1))
  print()

  #Evaluation on val data
  model.eval()
  v_loss = 0
  predictions , true_labels = [], []
  for batch in val_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_id, b_input_mask, b_labels = batch
    with torch.no_grad():
      loss, preds = model(b_input_id, b_input_mask, b_labels, False)
      
    v_loss += loss.item()
    predictions.extend(preds)
    true_labels.extend(b_labels[b_input_id != 0].tolist())
    
  v_loss = v_loss/len(val_dataloader)
  val_loss.append(v_loss)
  #l tracks the best validation loss so far; the message only marks an improvement
  if(v_loss < l):
    l = v_loss
    print("Model Checkpoint")
  #The model is written out after every epoch, whether or not it improved
  torch.save(model, f'/content/drive/My Drive/model{epoch}.pt')
    
  print(f"Validation Loss : {v_loss}")
  pred_tags = [p_i for p in predictions for p_i in p]
  valid_tags = true_labels
  epoch_val_acc = accuracy_score(valid_tags, pred_tags)
  epoch_val_f1 = f1_score(valid_tags, pred_tags)
  print("Validation Accuracy: {}".format(epoch_val_acc))
  print("Validation F1-Score: {}".format(epoch_val_f1))
  val_acc.append(epoch_val_acc)
  val_f1.append(epoch_val_f1)
  print()