<a href="https://colab.research.google.com/github/architb1703/Toxic_Span/blob/archit/BERT_BiLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers==2.6.0
!pip install seqeval



In [2]:
!pip install urllib3 --upgrade

Requirement already up-to-date: urllib3 in /usr/local/lib/python3.6/dist-packages (1.25.11)


In [3]:
#Based on the Bert for NER from https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/

import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm, trange

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchtext import data
import transformers
from transformers import BertTokenizer, BertConfig, BertModel, AdamW
from transformers import get_linear_schedule_with_warmup

from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from seqeval.metrics import accuracy_score
from sklearn.metrics import f1_score, confusion_matrix

torch.manual_seed(42)
np.random.seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
train_path = '/content/drive/My Drive/ToxicSpan_CS669V/processed/finaltrain.pkl'
val_path = '/content/drive/My Drive/ToxicSpan_CS669V/processed/finaldev.pkl'

with open(train_path, 'rb') as f:
  train_data = pickle.load(f)
  f.close()

with open(val_path, 'rb') as f:
  val_data = pickle.load(f)
  f.close()

In [5]:
MAX_LEN = 300
BATCH_SIZE = 1

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = False)

In [7]:
X_train = train_data['token_final']
X_val = val_data['token_final']
Y_train = train_data['target_final']
Y_val = val_data['target_final']

In [8]:
CLASSES = {'0':0, '1':1, '[PAD]':2}

In [9]:
def tokenize_bert(x, y):
  sentence = []
  labels = []
  for word, label in zip(x, y):
    tokenized_word = tokenizer.tokenize(word)
    sentence.extend(tokenized_word)
    labels.extend([label for i in range(len(tokenized_word))])
  return(sentence, labels)

In [10]:
len_train = len(X_train)
len_val = len(X_val)

for i in range(len_train):
  X_train[i], Y_train[i] = tokenize_bert(X_train[i], Y_train[i])

for i in range(len_val):
  X_val[i], Y_val[i] = tokenize_bert(X_val[i], Y_val[i])

In [11]:
ones = 0
zeros = 0
total = 0
for y in Y_train:
  ones += np.sum(np.array(y))
  zeros += len(y) - np.sum(np.array(y))
  total += len(y)
for y in Y_val:
  ones += np.sum(np.array(y))
  zeros += len(y) - np.sum(np.array(y))
  total += len(y)
# print(ones, zeros)
class_weights = torch.tensor([zeros/zeros, zeros/ones], dtype=torch.float32)

In [12]:
X_train_id = pad_sequences([tokenizer.convert_tokens_to_ids(text) for text in X_train], maxlen = MAX_LEN, dtype='long', value=tokenizer.pad_token_id, truncating='post', padding = 'post')
Y_train_id = pad_sequences(Y_train, maxlen=MAX_LEN, value=CLASSES['[PAD]'], dtype='long', truncating='post', padding='post')
X_val_id = pad_sequences([tokenizer.convert_tokens_to_ids(text) for text in X_val], maxlen = MAX_LEN, dtype='long', value=tokenizer.pad_token_id, truncating='post', padding = 'post')
Y_val_id = pad_sequences(Y_val, maxlen=MAX_LEN, value=CLASSES['[PAD]'], dtype='long', truncating='post', padding='post')

In [13]:
def get_attention_mask(x):
  return([[(i!=0) for i in text] for text in x])

In [14]:
attention_mask_train = get_attention_mask(X_train_id)
attention_mask_val = get_attention_mask(X_val_id)

In [15]:
X_train_id = torch.tensor(X_train_id)
Y_train_id = torch.tensor(Y_train_id)
X_val_id = torch.tensor(X_val_id)
Y_val_id = torch.tensor(Y_val_id)
attention_mask_train = torch.tensor(attention_mask_train)
attention_mask_val = torch.tensor(attention_mask_val)

In [16]:
train_data = TensorDataset(X_train_id, attention_mask_train, Y_train_id)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

val_data = TensorDataset(X_val_id, attention_mask_val, Y_val_id)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)

In [17]:
class BiLSTM(nn.Module):
  def __init__(self, input_dim, num_labels, hidden_dim, lstm_layers, dropout):
    super().__init__()
    self.input_dim = input_dim
    self.num_labels = num_labels
    self.hidden_dim = hidden_dim
    self.bilstm = nn.LSTM(input_dim, hidden_dim, lstm_layers, bidirectional=True, dropout=dropout, batch_first=True)
    self.fc = nn.Linear(hidden_dim*2, num_labels)

  def forward(self, text, text_lengths):
    inputs = nn.utils.rnn.pack_padded_sequence(text, text_lengths, batch_first=True, enforce_sorted=False)
    lstm_out, _ = self.bilstm(inputs)
    # out_unpacked, out_lengths = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True, total_length=300)
    out = self.fc(lstm_out[0])
    return out

In [18]:
bert_model = BertModel.from_pretrained('bert-base-uncased', output_attentions = False, output_hidden_states = False)
bert_model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [19]:
hidden_dim = bert_model.config.hidden_size
lstm_hidden_dim = 64
dropout = bert_model.config.hidden_dropout_prob
num_labels = 2
model = BiLSTM(hidden_dim, num_labels, lstm_hidden_dim, 2, dropout)
model.cuda()

BiLSTM(
  (bilstm): LSTM(768, 64, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

In [20]:
FINE_TUNING = True
if FINE_TUNING:
  param_optimizer_base = list(model.named_parameters())
  param_optimizer_bert = list(bert_model.named_parameters())
  no_decay = ['bias', 'gamma', 'beta']
  optimizer_grouped_parameters = [
                                  {'params' : [p for n,p in param_optimizer_bert if not any(nd in n for nd in no_decay)], 'weight_decay_rate' : 0.01},
                                  {'params': [p for n, p in param_optimizer_bert if any(nd in n for nd in no_decay)],'weight_decay_rate': 0.0},
                                  {"params": [p for n, p in param_optimizer_base], 'weight_decay_rate' : 0.00}]
else:
  param_optimizer = list(model.named_parameters())
  optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer], 'weight_decay_rate' :0.00}]

optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8)

In [21]:
epochs = 10
max_grad_norm = 1.0

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [22]:
def get_text_lengths(masks):
  lengths = []
  for mask in masks:
    lengths.append(torch.sum(mask).item())
  return(lengths)

In [23]:
criterion = nn.CrossEntropyLoss(weight=class_weights)
criterion = criterion.to(device)

In [24]:
class_weights

tensor([1.0000, 9.9122])

In [None]:
train_loss, val_loss = [], []
train_acc, val_acc = [], []
train_f1, val_f1 = [], []

threshold = 0.5

l = 100

for epoch in trange(epochs, desc = 'Epoch'):
  model.train()
  t_loss, t_acc = 0, 0
  predictions, true_labels = [], []
  count = 0
  optimizer.zero_grad()
  for step, batch in enumerate(train_dataloader):
    count += 1 
    batch = tuple(t.to(device) for t in batch)
    b_input_id, b_input_mask, b_labels = batch
    outputs = bert_model(b_input_id, token_type_ids=None, attention_mask=b_input_mask)
    text_lengths = get_text_lengths(b_input_mask)
    out = model(outputs[0], torch.tensor(text_lengths, device='cuda', dtype=torch.int64, requires_grad=False))
    labels = torch.tensor([l_i for l in b_labels for l_i in l if l_i!=2], dtype=torch.long  , device=device)
    loss = criterion(out, labels)
    loss.backward()
    
    t_loss += loss.item()
    torch.nn.utils.clip_grad_norm_(parameters=bert_model.parameters(), max_norm=max_grad_norm)
    torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
    if(count%8==0):
      optimizer.step()
      scheduler.step()
      optimizer.zero_grad()

    logits = torch.argmax(F.softmax(out, dim=1), dim=1).to('cpu').numpy()
    label_ids = labels.to('cpu').numpy()
    # print(logits, label_ids)
    predictions.extend(logits)
    true_labels.extend(label_ids)

  print(f"Train Loss : {t_loss/len(train_dataloader)}")
  train_loss.append(t_loss/len(train_dataloader))

  train_acc.append(accuracy_score(predictions, true_labels))
  train_f1.append(f1_score(predictions, true_labels))
  print("Train Accuracy: {}".format(accuracy_score(predictions, true_labels)))
  print("Train F1-Score: {}".format(f1_score(predictions, true_labels)))
  print()

  model.eval()
  v_loss, v_accuracy = 0, 0
  predictions , true_labels = [], []
  for batch in val_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_id, b_input_mask, b_labels = batch
    with torch.no_grad():
      outputs = bert_model(b_input_id, token_type_ids=None, attention_mask=b_input_mask)
      text_lengths = get_text_lengths(b_input_mask)
      out = model(outputs[0], torch.tensor(text_lengths, device='cuda', dtype=torch.int64, requires_grad=False))
    
    labels = torch.tensor([l_i for l in b_labels for l_i in l if l_i!=2], dtype=torch.long  , device=device)
    loss = criterion(out, labels)
        
    logits = torch.argmax(F.softmax(out, dim=1), dim=1).to('cpu').numpy()
    label_ids = labels.to('cpu').numpy()
    predictions.extend(logits)
    true_labels.extend(label_ids)

    v_loss += loss
    
  v_loss = v_loss/len(val_dataloader)
  val_loss.append(v_loss)
  if(v_loss < l):
    l = v_loss
    torch.save(model, '/content/drive/My Drive/model14.pt')
    print("Model Checkpoint")
  print(f"Validation Loss : {v_loss}")
  print("Validation Accuracy: {}".format(accuracy_score(predictions, true_labels)))
  print("Validation F1-Score: {}".format(f1_score(predictions, true_labels)))
  val_acc.append(accuracy_score(predictions, true_labels))
  val_f1.append(f1_score(predictions, true_labels))
  print()

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)
