<a href="https://colab.research.google.com/github/architb1703/Toxic_Span/blob/archit/BERT_CRF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Library installs

!pip install transformers==2.6.0
!pip install seqeval
!pip install urllib3 --upgrade
!pip install pytorch-crf

Collecting transformers==2.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/4c/a0/32e3a4501ef480f7ea01aac329a716132f32f7911ef1c2fac228acc57ca7/transformers-2.6.0-py3-none-any.whl (540kB)
[K     |████████████████████████████████| 542kB 7.0MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 22.1MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 40.8MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K

In [None]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm, trange

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchtext import data
from torchcrf import CRF
import transformers
from transformers import BertTokenizer, BertConfig, BertForTokenClassification, AdamW, BertModel
from transformers import get_linear_schedule_with_warmup

from keras.preprocessing.sequence import pad_sequences
from seqeval.metrics import accuracy_score
from sklearn.metrics import f1_score, confusion_matrix

# Seed the torch and numpy random number generators with the same fixed value.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [None]:
#Load data
# NOTE(review): both paths point into a mounted Google Drive; no cell in view
# mounts it, so confirm the Drive is mounted before running from a fresh kernel.
train_path = '/content/drive/My Drive/ToxicSpan_CS669V/processed/finaltrain.pkl'
val_path = '/content/drive/My Drive/ToxicSpan_CS669V/processed/finaldev.pkl'

# pickle.load can execute arbitrary code: only load files from a trusted source.
# The `with` statement closes the file on exit, so no explicit close() is needed.
with open(train_path, 'rb') as f:
  train_data = pickle.load(f)

with open(val_path, 'rb') as f:
  val_data = pickle.load(f)

In [None]:
# Pull the token sequences ('token_final') and their per-token targets
# ('target_final') out of the loaded train and validation objects.
X_train, Y_train = train_data['token_final'], train_data['target_final']
X_val, Y_val = val_data['token_final'], val_data['target_final']

In [None]:
# Length every sequence is padded/truncated to.
MAX_LEN = 250
# Number of sequences per batch in the dataloaders.
BATCH_SIZE = 32
# Label name -> integer id ('0' -> 0, '1' -> 1, '[PAD]' -> 2).
CLASSES = {name: idx for idx, name in enumerate(('0', '1', '[PAD]'))}

In [None]:
bert_model = 'bert-base-uncased'

# The uncased checkpoint ships a lower-cased vocabulary, so the tokenizer must
# lower-case its input as well; with do_lower_case=False any word containing
# upper-case characters cannot be matched against the vocabulary.
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case = True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
def tokenize_bert(x, y):
  """Split every word into word pieces and repeat its label for each piece.

  Args:
    x: iterable of words belonging to one sentence.
    y: iterable of labels, one per word in ``x``.

  Returns:
    A ``(sentence, labels)`` tuple of two equally long lists: the word pieces
    produced by the module-level ``tokenizer`` and the label of the word each
    piece came from.
  """
  sentence, labels = [], []
  for word, label in zip(x, y):
    pieces = tokenizer.tokenize(word)
    sentence += pieces
    # every piece inherits the label of the word it was split from
    labels += [label] * len(pieces)
  return sentence, labels

In [None]:
#Tokenize the data using bert tokenizer which is based on WordPiece tokenization
# NOTE: the sequences are overwritten in place, so running this cell a second
# time would tokenize the already-tokenized word pieces again.
len_train, len_val = len(X_train), len(X_val)

for tokens, targets, n_rows in ((X_train, Y_train, len_train), (X_val, Y_val, len_val)):
  for row in range(n_rows):
    tokens[row], targets[row] = tokenize_bert(tokens[row], targets[row])

In [None]:
#Pad the input data so that we can deal with them as tensors
# Options shared by all four calls: pad and truncate at the end of each sequence
# so that every row is exactly MAX_LEN long.
_pad_opts = dict(maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')

# Token ids are padded with 0; the labels are padded with 0 as well.
X_train_id = pad_sequences([tokenizer.convert_tokens_to_ids(text) for text in X_train], value=0.0, **_pad_opts)
Y_train_id = pad_sequences(Y_train, value=0, **_pad_opts)
X_val_id = pad_sequences([tokenizer.convert_tokens_to_ids(text) for text in X_val], value=0.0, **_pad_opts)
Y_val_id = pad_sequences(Y_val, value=0, **_pad_opts)

In [None]:
def get_attention_mask(x):
  """Build a nested list marking which entries of each sequence are non-zero.

  Args:
    x: iterable of id sequences.

  Returns:
    A list with one inner list per sequence; each element is the result of
    comparing the corresponding id with 0 (``id != 0``).
  """
  masks = []
  for text in x:
    masks.append([token_id != 0 for token_id in text])
  return masks

In [None]:
# One mask per split: an entry is truthy wherever the padded id is non-zero.
attention_mask_train, attention_mask_val = (
    get_attention_mask(padded_ids) for padded_ids in (X_train_id, X_val_id)
)

In [None]:
# Convert the padded ids, labels and attention masks into torch tensors.
# Each name is rebound from its previous value to the tensor built from it.
X_train_id, Y_train_id = torch.tensor(X_train_id), torch.tensor(Y_train_id)
X_val_id, Y_val_id = torch.tensor(X_val_id), torch.tensor(Y_val_id)
attention_mask_train, attention_mask_val = (
    torch.tensor(attention_mask_train),
    torch.tensor(attention_mask_val),
)

In [None]:
#Creating dataloaders for train and val data, this will allow us to easily get batches during training
# NOTE: train_data / val_data are rebound here from the loaded pickles to
# TensorDatasets, so the cells that read the pickled columns cannot be re-run
# after this one without reloading the data.

# Training batches are drawn in random order.
train_data = TensorDataset(X_train_id, attention_mask_train, Y_train_id)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

# Validation needs no shuffling; a sequential pass keeps evaluation order
# deterministic from epoch to epoch.
val_data = TensorDataset(X_val_id, attention_mask_val, Y_val_id)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)

In [None]:
class BertCRFModel(nn.Module):
  """BERT encoder with a linear (optionally BiLSTM + linear) emission layer and a CRF on top.

  Args:
    bert_model: name or path handed to ``BertModel.from_pretrained``.
    num_labels: number of tags scored by the emission layer and by the CRF.
    bilstm: when truthy, a one-layer bidirectional LSTM is inserted between
      BERT and the linear layer.
  """
  def __init__(self, bert_model, num_labels, bilstm):
    super().__init__()
    self.bert_model = bert_model
    self.num_labels = num_labels
    # Remember the architecture choice so forward() cannot disagree with it.
    self.use_bilstm = bool(bilstm)
    self.bert = BertModel.from_pretrained(self.bert_model, output_attentions=False, output_hidden_states=False)
    self.crf = CRF(self.num_labels, batch_first=True)
    self.dropout = nn.Dropout(0.7)
    if self.use_bilstm:
      self.bilstm = nn.LSTM(self.bert.config.hidden_size, self.num_labels, bidirectional=True, num_layers=1, batch_first=True)
      # the bidirectional LSTM emits 2 * num_labels features per token
      self.fc = nn.Linear(self.num_labels*2, self.num_labels)
    else:
      self.fc = nn.Linear(self.bert.config.hidden_size, self.num_labels)

  def forward(self, inputs, masks, labels=None, bilstm=False):
    """Score a batch and decode the best tag sequence for every row.

    Args:
      inputs: batch of token ids.
      masks: attention mask for ``inputs``; also used as the CRF mask.
      labels: optional gold tags; when given, the CRF loss is returned as well.
      bilstm: kept for backward compatibility. The flag given to ``__init__``
        decides which branch runs; this argument is only consulted for
        instances that do not carry that stored flag.

    Returns:
      ``preds`` (the result of ``CRF.decode``) when ``labels`` is None,
      otherwise the tuple ``(loss, preds)`` where ``loss`` is the negated,
      mean-reduced CRF score of ``labels``.
    """
    use_bilstm = getattr(self, 'use_bilstm', bilstm)
    seq_out = self.bert(inputs, attention_mask=masks)[0]

    if not use_bilstm:
      # Dropout has to run before the projection; previously it was applied
      # afterwards and its result discarded, so it never had any effect.
      # NOTE(review): p=0.7 is now active while training — revisit the rate.
      x = self.fc(self.dropout(seq_out))
    else:
      # pack_padded_sequence takes the per-row lengths as a CPU tensor
      lengths = masks.sum(dim=1).cpu()
      packed = nn.utils.rnn.pack_padded_sequence(seq_out, lengths, batch_first=True, enforce_sorted=False)
      x, _ = self.bilstm(packed)
      # pad back to the width of the input batch instead of a hard-coded 250
      x, _ = nn.utils.rnn.pad_packed_sequence(x, total_length=inputs.size(1), batch_first=True)
      x = self.fc(x)

    masks = masks.type(torch.uint8)
    preds = self.crf.decode(x, mask=masks)
    if labels is None:
      return preds
    loss = -self.crf(F.log_softmax(x, dim=2), labels, mask=masks, reduction='mean')
    return loss, preds

In [None]:
NUM_LABELS = 2
# The third argument (False) builds the model without the BiLSTM layer.
model = BertCRFModel(bert_model, NUM_LABELS, False)
# `device` is CUDA exactly when torch.cuda.is_available(), so this moves the
# model to the GPU in that case and leaves it on the CPU otherwise.
model = model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these substrings are exempt from weight
# decay. 'LayerNorm.weight' is added because the 'gamma'/'beta' patterns alone
# do not match LayerNorm parameters named 'LayerNorm.weight'/'LayerNorm.bias'.
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
# AdamW reads the per-group option 'weight_decay'; the former key
# 'weight_decay_rate' was ignored, leaving every group at the default of 0.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8)

In [None]:
# Number of passes over the training data and the gradient-norm clipping bound.
epochs = 10
max_grad_norm = 1.0

# The scheduler is stepped once per training batch, so its horizon is
# (batches per epoch) x (epochs).
total_steps = epochs * len(train_dataloader)

# Linear schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
#Code for training model and evaluating on validation data

# Per-epoch history of loss, accuracy and F1 for both splits.
train_loss, val_loss = [], []
train_acc, val_acc = [], []
train_f1, val_f1 = [], []

# Lowest validation loss seen so far; starting at infinity guarantees that the
# first epoch always counts as an improvement.
best_val_loss = float('inf')

for epoch in trange(epochs, desc = 'Epoch'):
  model.train()
  t_loss = 0
  predictions, true_labels = [], []
  for batch in train_dataloader:
    b_input_id, b_input_mask, b_labels = (t.to(device) for t in batch)
    model.zero_grad()
    loss, preds = model(b_input_id, b_input_mask, b_labels, False)
    loss.backward()
    t_loss += loss.item()
    torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
    optimizer.step()
    scheduler.step()

    predictions.extend(preds)
    # Gold labels of the non-pad tokens (id != 0), gathered with one tensor
    # operation instead of a Python loop calling .item() on every element.
    true_labels.extend(b_labels[b_input_id != 0].tolist())

  epoch_train_loss = t_loss/len(train_dataloader)
  print(f"Train Loss : {epoch_train_loss}")
  train_loss.append(epoch_train_loss)
  pred_tags = [p_i for p in predictions for p_i in p]
  valid_tags = true_labels
  # Compute each metric once, passing (y_true, y_pred) in that order.
  epoch_train_acc = accuracy_score(valid_tags, pred_tags)
  epoch_train_f1 = f1_score(valid_tags, pred_tags)
  train_acc.append(epoch_train_acc)
  train_f1.append(epoch_train_f1)
  print("Train Accuracy: {}".format(epoch_train_acc))
  print("Train F1-Score: {}".format(epoch_train_f1))
  print()

  #Evaluation on val data
  model.eval()
  v_loss = 0
  predictions, true_labels = [], []
  for batch in val_dataloader:
    b_input_id, b_input_mask, b_labels = (t.to(device) for t in batch)
    with torch.no_grad():
      loss, preds = model(b_input_id, b_input_mask, b_labels, False)

    v_loss += loss.item()
    predictions.extend(preds)
    true_labels.extend(b_labels[b_input_id != 0].tolist())

  v_loss = v_loss/len(val_dataloader)
  val_loss.append(v_loss)
  if v_loss < best_val_loss:
    best_val_loss = v_loss
    print("Model Checkpoint")
  # NOTE(review): the model is saved after every epoch, not only when the
  # validation loss improves — confirm whether that is intended.
  torch.save(model, f'/content/drive/My Drive/model{epoch}.pt')

  print(f"Validation Loss : {v_loss}")
  pred_tags = [p_i for p in predictions for p_i in p]
  valid_tags = true_labels
  epoch_val_acc = accuracy_score(valid_tags, pred_tags)
  epoch_val_f1 = f1_score(valid_tags, pred_tags)
  print("Validation Accuracy: {}".format(epoch_val_acc))
  print("Validation F1-Score: {}".format(epoch_val_f1))
  val_acc.append(epoch_val_acc)
  val_f1.append(epoch_val_f1)
  print()

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


Train Loss : 10.042623025089053
Train Accuracy: 0.9155129570465034
Train F1-Score: 0.3154434540742146

Model Checkpoint


Epoch:  10%|█         | 1/10 [11:13<1:41:04, 673.85s/it]

Validation Loss : 9.168487167358398
Validation Accuracy: 0.9217750191963143
Validation F1-Score: 0.4776757103183081

Train Loss : 8.425957201713294
Train Accuracy: 0.9307306112417933
Train F1-Score: 0.4927296100557063



Epoch:  20%|██        | 2/10 [22:30<1:29:57, 674.66s/it]

Validation Loss : 10.089014472961425
Validation Accuracy: 0.910065267468646
Validation F1-Score: 0.5022135647246325

Train Loss : 6.89994364287985
Train Accuracy: 0.9375416915353018
Train F1-Score: 0.5765965886552955



Epoch:  30%|███       | 3/10 [33:47<1:18:48, 675.43s/it]

Validation Loss : 10.092757225036621
Validation Accuracy: 0.9164320450473509
Validation F1-Score: 0.48297703879651627

Train Loss : 5.035844249341955
Train Accuracy: 0.9519245397839647
Train F1-Score: 0.7056181922415441



Epoch:  40%|████      | 4/10 [45:03<1:07:33, 675.51s/it]

Validation Loss : 11.070569038391113
Validation Accuracy: 0.9181597133350397
Validation F1-Score: 0.47667757774140757

Train Loss : 3.717344243322785
Train Accuracy: 0.9653204445536714
Train F1-Score: 0.8001079282277286



Epoch:  50%|█████     | 5/10 [56:19<56:18, 675.76s/it]  

Validation Loss : 12.948864574432372
Validation Accuracy: 0.9181597133350397
Validation F1-Score: 0.47667757774140757

Train Loss : 2.893977193377126
Train Accuracy: 0.9739688781222328
Train F1-Score: 0.8544824127177966



Epoch:  60%|██████    | 6/10 [1:07:36<45:04, 676.02s/it]

Validation Loss : 13.623296403884888
Validation Accuracy: 0.913552597901203
Validation F1-Score: 0.48533333333333334



KeyboardInterrupt: ignored