<a href="https://colab.research.google.com/github/architb1703/Toxic_Span/blob/Abhay/FusionBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers==2.6.0
!pip install seqeval



In [None]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm, trange

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchtext import data
import transformers
from transformers import BertTokenizer, BertConfig, BertModel, AdamW, BertForTokenClassification, BertPreTrainedModel, BertConfig
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig, RobertaForTokenClassification

from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from seqeval.metrics import accuracy_score
from sklearn.metrics import f1_score, confusion_matrix

In [None]:
# Fix the NumPy and PyTorch RNG seeds for this session.
np.random.seed(42)
torch.manual_seed(12)

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [None]:
train_path = 'drive/My Drive/Ensemble/train.pkl'
val_path = 'drive/My Drive/Ensemble/val.pkl'

# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that come from a trusted source.
# The `with` statement closes each file on exit; no explicit close() is needed.
with open(train_path, 'rb') as f:
  train_data = pickle.load(f)

with open(val_path, 'rb') as f:
  val_data = pickle.load(f)

In [None]:
# Pull the per-example token lists (inputs) and label lists (targets) out of
# each loaded split.
X_train, Y_train = train_data['token_final'], train_data['target_final']
X_val, Y_val = val_data['token_final'], val_data['target_final']

In [None]:
# Cased BERT vocabulary; lower-casing is disabled so the input casing is kept.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased',
    do_lower_case=False,
)

In [None]:
def tokenize_bert(x, y):
  """Split every word into tokenizer sub-tokens and repeat its label per piece.

  Args:
    x: iterable of words.
    y: iterable of per-word labels, paired with `x` through zip.

  Returns:
    A `(sentence, labels)` tuple: the flat list of sub-tokens, and the label
    list with one extra 0 at the front and one at the back (presumably for
    the special tokens that `tokenizer.encode` adds later).
  """
  pieces = []
  piece_labels = []
  for word, label in zip(x, y):
    word_pieces = tokenizer.tokenize(word)
    pieces += word_pieces
    # Every sub-token inherits the label of the word it came from.
    piece_labels += [label] * len(word_pieces)
  return (pieces, [0] + piece_labels + [0])

In [None]:
  len_train, len_val = len(X_train), len(X_val)

  # Replace every example, in place, with its sub-token list and the matching
  # per-sub-token label list produced by tokenize_bert.
  for X_split, Y_split, n_rows in ((X_train, Y_train, len_train), (X_val, Y_val, len_val)):
    for i in range(n_rows):
      X_split[i], Y_split[i] = tokenize_bert(X_split[i], Y_split[i])

In [None]:
# Count label-1 and label-0 positions over BOTH splits (the counts include the
# two boundary 0 labels that tokenize_bert adds to every example).
ones = 0
zeros = 0
total = 0
for label_split in (Y_train, Y_val):
  for y in label_split:
    n_pos = np.sum(np.array(y))
    ones += n_pos
    zeros += len(y) - n_pos
    total += len(y)

# Class 0 gets weight 1; class 1 is up-weighted by the zeros-to-ones ratio.
class_weights = torch.tensor([zeros/zeros, zeros/ones], dtype=torch.float32)

In [None]:
CLASSES = {'0': 0, '1': 1, '[PAD]': 2}
MAX_LEN = 500
BATCH_SIZE = 4


def _encode_and_pad(texts):
  """Encode each token list with `tokenizer.encode`, then pad/truncate to MAX_LEN using id 0."""
  encoded = [tokenizer.encode(text) for text in texts]
  return pad_sequences(encoded, maxlen=MAX_LEN, dtype='long', value=0.0, truncating='post', padding='post')


def _pad_labels(labels):
  """Pad/truncate each label list to MAX_LEN using the '[PAD]' class id."""
  return pad_sequences(labels, maxlen=MAX_LEN, value=CLASSES['[PAD]'], dtype='long', truncating='post', padding='post')


# Sequences longer than MAX_LEN are cut at the end ('post' truncation) and
# shorter ones are padded at the end.
X_train_id = _encode_and_pad(X_train)
Y_train_id = _pad_labels(Y_train)
X_val_id = _encode_and_pad(X_val)
Y_val_id = _pad_labels(Y_val)

In [None]:
def get_attention_mask(x):
  """Return, for each id sequence in `x`, a list of booleans that is True where the id is non-zero."""
  masks = []
  for text in x:
    masks.append([token_id != 0 for token_id in text])
  return masks

In [None]:
# One mask per split: True wherever the padded id sequence is non-zero.
attention_mask_train, attention_mask_val = (
    get_attention_mask(ids) for ids in (X_train_id, X_val_id)
)

In [None]:
# Convert the padded ids, labels and masks of both splits into torch tensors.
X_train_id, Y_train_id, attention_mask_train = (
    torch.tensor(values) for values in (X_train_id, Y_train_id, attention_mask_train)
)
X_val_id, Y_val_id, attention_mask_val = (
    torch.tensor(values) for values in (X_val_id, Y_val_id, attention_mask_val)
)

In [None]:
def _build_loader(input_ids, masks, labels):
  """Bundle the tensors into a dataset and a randomly-sampled DataLoader of BATCH_SIZE."""
  dataset = TensorDataset(input_ids, masks, labels)
  sampler = RandomSampler(dataset)
  loader = DataLoader(dataset, sampler=sampler, batch_size=BATCH_SIZE)
  return dataset, sampler, loader


# NOTE: `train_data` / `val_data` are rebound here from the loaded pickles to
# the tensor datasets.
train_data, train_sampler, train_dataloader = _build_loader(X_train_id, attention_mask_train, Y_train_id)
val_data, val_sampler, val_dataloader = _build_loader(X_val_id, attention_mask_val, Y_val_id)

In [None]:
class BertEnsemble(BertPreTrainedModel):
  """Token classifier built on two 'bert-base-cased' encoders.

  Both encoders receive the same inputs; their last hidden states go through
  dropout, are summed element-wise, and are projected to `config.num_labels`
  logits per token by a single linear layer.
  """

  def __init__(self, config, *args, **kwargs):
    """Build the two pretrained encoders and the classification head.

    Args:
      config: configuration providing `num_labels`, `hidden_dropout_prob`
        and `hidden_size` for the head. The encoders themselves are loaded
        with `from_pretrained('bert-base-cased')`.
      *args, **kwargs: accepted for call compatibility; not used.
    """
    super().__init__(config)
    self.num_labels = config.num_labels
    self.bert_model_1 = BertModel.from_pretrained('bert-base-cased', output_attentions = False, output_hidden_states = False)
    self.bert_model_2 = BertModel.from_pretrained('bert-base-cased', output_attentions = False, output_hidden_states = False)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    # Initialise ONLY the new head. Calling self.init_weights() here would
    # apply _init_weights to every submodule, including the two encoders,
    # and overwrite the pretrained weights that were just loaded.
    self.classifier.apply(self._init_weights)

  def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
    """Run both encoders and classify every token.

    Args:
      input_ids, attention_mask, token_type_ids, position_ids, head_mask,
      inputs_embeds: forwarded unchanged to both encoders.
      labels: optional per-token class ids. When given, an unweighted
        cross-entropy loss is computed; if `attention_mask` is also given,
        positions where the mask is not 1 are excluded from the loss.

    Returns:
      `(loss, logits, *extra)` when `labels` is given, else `(logits, *extra)`,
      where `extra` is whatever the first encoder returns after its first two
      outputs.
    """
    encoder_kwargs = dict(
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
    )
    outputs = self.bert_model_1(input_ids, **encoder_kwargs)
    outputs_2 = self.bert_model_2(input_ids, **encoder_kwargs)

    sequence_output = self.dropout(outputs[0])
    sequence_output2 = self.dropout(outputs_2[0])
    # Fuse the two encoders by summing their (dropped-out) hidden states.
    logits = self.classifier(sequence_output+sequence_output2)

    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      if attention_mask is not None:
        # Only keep active parts of the loss: masked-out positions get
        # the loss function's ignore_index as their label.
        active_loss = attention_mask.view(-1) == 1
        active_logits = logits.view(-1, self.num_labels)
        active_labels = torch.where(
            active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
        )
        loss = loss_fct(active_logits, active_labels)
      else:
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    output = (logits,) + outputs[2:]
    return ((loss,) + output) if loss is not None else output

    

In [None]:
# Select the compute device and report which one is in use.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# A default BertConfig supplies the head's sizes and dropout; BertEnsemble
# loads its two encoders from 'bert-base-cased' itself.
config = BertConfig()
model = BertEnsemble(config)
model.to(device)

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


BertEnsemble(
  (bert_model_1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [None]:
# Per-epoch metric histories, one list per (split, metric) pair.
train_loss, train_acc, train_f1 = [], [], []
val_loss, val_acc, val_f1 = [], [], []

In [None]:
# FINE_TUNING=True trains every parameter of the model; False trains only the
# classification head.
FINE_TUNING = True
if FINE_TUNING:
  param_optimizer = list(model.named_parameters())
  # Biases and LayerNorm weights are excluded from weight decay. PyTorch names
  # them '...bias' and '...LayerNorm.weight'; 'gamma'/'beta' never match.
  no_decay = ['bias', 'LayerNorm.weight']
  # The param-group key must be 'weight_decay': AdamW reads that key, and an
  # unknown key such as 'weight_decay_rate' is silently ignored (no decay).
  optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
  ]
else:
  param_optimizer = list(model.classifier.named_parameters())
  optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8)

In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 4
max_grad_norm = 1.0

# The training loop steps the scheduler once per batch, so the schedule spans
# (batches per epoch) x (epochs) steps, with no warm-up.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
#Training code

train_loss, val_loss = [], []
train_acc, val_acc = [], []
train_f1, val_f1 = [], []


def _weighted_loss(criterion, logits, input_mask, labels):
  """Class-weighted cross-entropy over the attended positions only.

  Positions where `input_mask` is not 1 get `criterion.ignore_index` as their
  label, so they contribute nothing to the loss.
  """
  active = input_mask.view(-1) == 1
  ignore = torch.tensor(criterion.ignore_index).type_as(labels)
  active_labels = torch.where(active, labels.view(-1), ignore)
  return criterion(logits.view(-1, 2), active_labels)


def _flatten_tags(predictions, true_labels):
  """Flatten per-sequence predictions/labels into two aligned flat lists.

  For every sequence, positions whose label is 2 (the padding class) are
  dropped, then the first and last remaining positions are dropped as well
  (the boundary labels added around every example).
  """
  pred_tags, valid_tags = [], []
  for pred_row, label_row in zip(predictions, true_labels):
    kept_preds = [p_i for p_i, l_i in zip(pred_row, label_row) if l_i != 2]
    kept_labels = [l_i for l_i in label_row if l_i != 2]
    pred_tags.extend(kept_preds[1:-1])
    valid_tags.extend(kept_labels[1:-1])
  return pred_tags, valid_tags


best_val_loss = 100
criterion = nn.CrossEntropyLoss(weight=class_weights)
criterion = criterion.to(device)
for epoch in range(epochs):
  print(f"Epoch {epoch}")

  # ---- training pass ----
  model.train()
  t_loss = 0
  predictions, true_labels = [], []
  for batch in tqdm(train_dataloader, desc='Train'):
    b_input_id, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
    model.zero_grad()
    outputs = model(b_input_id, token_type_ids=None, attention_mask=b_input_mask, labels = b_labels)
    # outputs[0] is the model's own unweighted loss; the class-weighted loss
    # computed from the logits (outputs[1]) is what gets optimised.
    loss = _weighted_loss(criterion, outputs[1], b_input_mask, b_labels)

    loss.backward()
    t_loss += loss.item()
    torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
    optimizer.step()
    scheduler.step()

    logits = outputs[1].detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.extend(b_labels.to('cpu').numpy())

  epoch_train_loss = t_loss/len(train_dataloader)
  print(f"Train Loss : {epoch_train_loss}")
  train_loss.append(epoch_train_loss)
  pred_tags, valid_tags = _flatten_tags(predictions, true_labels)
  # Metrics take (y_true, y_pred); compute each once and reuse the value.
  epoch_train_acc = accuracy_score(valid_tags, pred_tags)
  epoch_train_f1 = f1_score(valid_tags, pred_tags)
  train_acc.append(epoch_train_acc)
  train_f1.append(epoch_train_f1)
  print("Train Accuracy: {}".format(epoch_train_acc))
  print("Train F1-Score: {}".format(epoch_train_f1))
  print()

  # ---- validation pass ----
  model.eval()
  v_loss = 0
  predictions, true_labels = [], []
  for batch in tqdm(val_dataloader, desc="Val"):
    b_input_id, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
    with torch.no_grad():
      outputs = model(b_input_id, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      loss = _weighted_loss(criterion, outputs[1], b_input_mask, b_labels)

    logits = outputs[1].detach().cpu().numpy()
    v_loss += loss.item()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.extend(b_labels.to('cpu').numpy())

  v_loss = v_loss/len(val_dataloader)
  val_loss.append(v_loss)
  if v_loss < best_val_loss:
    best_val_loss = v_loss
    print("Model Checkpoint")
  # The model is saved after EVERY epoch (not only on improvement), in a file
  # named after the epoch index.
  torch.save(model, f'drive/MyDrive/Ensemble/ensemble{epoch}.pt')

  print(f"Validation Loss : {v_loss}")
  pred_tags, valid_tags = _flatten_tags(predictions, true_labels)
  epoch_val_acc = accuracy_score(valid_tags, pred_tags)
  epoch_val_f1 = f1_score(valid_tags, pred_tags)
  print("Validation Accuracy: {}".format(epoch_val_acc))
  print("Validation F1-Score: {}".format(epoch_val_f1))
  val_acc.append(epoch_val_acc)
  val_f1.append(epoch_val_f1)
  print()

Train:   0%|          | 0/1588 [00:00<?, ?it/s]

Epoch 0


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)
Train: 100%|██████████| 1588/1588 [22:12<00:00,  1.19it/s]


Train Loss : 0.5842492848691021
Train Accuracy: 0.7976126501330038


Val:   0%|          | 0/199 [00:00<?, ?it/s]

Train F1-Score: 0.31373710224813794



Val: 100%|██████████| 199/199 [00:55<00:00,  3.61it/s]


Model Checkpoint
Validation Loss : 0.5077500883209046


Train:   0%|          | 0/1588 [00:00<?, ?it/s]

Validation Accuracy: 0.7726264479557466
Validation F1-Score: 0.34513762915056395

Epoch 1


Train: 100%|██████████| 1588/1588 [22:11<00:00,  1.19it/s]


Train Loss : 0.48944597030222714
Train Accuracy: 0.8624452535131795


Val:   0%|          | 0/199 [00:00<?, ?it/s]

Train F1-Score: 0.4214169668715123



Val: 100%|██████████| 199/199 [00:55<00:00,  3.61it/s]


Validation Loss : 0.5736969535299881


Train:   0%|          | 0/1588 [00:00<?, ?it/s]

Validation Accuracy: 0.8973628720869732
Validation F1-Score: 0.45236703682057283

Epoch 2


Train: 100%|██████████| 1588/1588 [22:10<00:00,  1.19it/s]


Train Loss : 0.4602065501729923
Train Accuracy: 0.8745096326947362


Val:   0%|          | 0/199 [00:00<?, ?it/s]

Train F1-Score: 0.45619805842199485



Val: 100%|██████████| 199/199 [00:55<00:00,  3.60it/s]


Validation Loss : 0.5189071446657181


Train:   0%|          | 0/1588 [00:00<?, ?it/s]

Validation Accuracy: 0.8764137251143304
Validation F1-Score: 0.4370712236497443

Epoch 3


Train: 100%|██████████| 1588/1588 [22:13<00:00,  1.19it/s]


Train Loss : 0.43500436465455317
Train Accuracy: 0.8872860520729774


Val:   0%|          | 0/199 [00:00<?, ?it/s]

Train F1-Score: 0.49184597447040473



Val: 100%|██████████| 199/199 [00:55<00:00,  3.60it/s]


Validation Loss : 0.5732098550315777
Validation Accuracy: 0.8699783662403812
Validation F1-Score: 0.4286401925391095



In [None]:
import pickle

# Load the pre-processed test set. As with any pickle, the file must come
# from a trusted source.
path = 'drive/MyDrive/Ensemble/processedtest.pkl'
with open(path, 'rb') as f:
  final_data = pickle.load(f)

In [None]:
# Materialise the three columns used below as plain Python lists.
use_tokens, use_spans, use_text = (
    final_data[column].tolist() for column in ('token_final', 'span_final', 'text')
)

In [None]:
class SpanAvgF1():
  """Runs a token classifier over word-level inputs and returns per-sub-token predictions.

  On construction the inputs are re-tokenised, encoded and padded, and a
  batched DataLoader is built. `X` and `spans` are rewritten IN PLACE:
  afterwards `spans[i]` holds one `[start, end]` pair per sub-token of
  example `i` instead of one pair per word.
  """

  def __init__(self, model, tokenizer, X, spans, maxlen=500):
    """Prepare the model and data.

    Args:
      model: token classifier, called as
        `model(ids, token_type_ids=None, attention_mask=mask)`; its first
        output is used as the logits.
      tokenizer: object providing `tokenize(word)` and `encode(tokens)`.
      X: list of examples, each a list of words (modified in place).
      spans: list of examples, each a list of `[start, end]` character
        offsets per word (modified in place).
      maxlen: length every encoded sequence is padded/truncated to.
    """
    self.model = model
    # Use the notebook-wide `device` (the same one evaluate() sends batches
    # to) instead of an unconditional .cuda(), which fails on CPU-only hosts.
    self.model.to(device)
    self.tokenizer = tokenizer
    self.maxlen = maxlen
    self.X = X
    self.spans = spans

    self.prepare_data()

    self.X = torch.tensor(self.X)
    self.attention_mask = torch.tensor(self.attention_mask)

    data = TensorDataset(self.X, self.attention_mask)
    self.dataloader = DataLoader(data, batch_size=16, shuffle=False)

  def tokenize_data(self, x, s):
    """Split words into sub-tokens and derive a character span for each piece.

    Args:
      x: list of words.
      s: list of `[start, end]` character offsets, one per word.

    Returns:
      `(sentence, spans)`: the flat sub-token list and one `[start, end]`
      pair per sub-token. The last piece of every word ends exactly at the
      word's own end offset.
    """
    sentence = []
    spans = []
    for word, word_span in zip(x, s):
      pieces = self.tokenizer.tokenize(word)
      if not pieces:
        # The tokenizer produced nothing for this word: emit neither a token
        # nor a span so the two lists stay aligned.
        continue
      sentence.extend(pieces)
      curr = word_span[0]
      spans.append([curr, curr + len(pieces[0])])
      curr += len(pieces[0])
      for piece in pieces[1:]:
        # The "- 2" presumably discounts the two-character "##" continuation
        # prefix of WordPiece tokens.
        width = len(piece) - 2
        spans.append([curr, curr + width])
        curr += width
      # Snap the end of the word's last piece to the word's true end offset.
      spans[-1][-1] = word_span[1]

    return(sentence, spans)

  def get_attention_mask(self, x):
    """Return, per sequence in `x`, a list of booleans that is True where the id is non-zero."""
    return([[(i!=0) for i in text] for text in x])

  def prepare_data(self):
    """Re-tokenise all examples in place, then encode, pad and mask them."""
    for i in range(len(self.X)):
      self.X[i], self.spans[i] = self.tokenize_data(self.X[i], self.spans[i])
    # Encode with the tokenizer handed to this instance, not a global one.
    encoded = [self.tokenizer.encode(text) for text in self.X]
    self.X = pad_sequences(encoded, maxlen = self.maxlen, dtype='long', value=0.0, truncating='post', padding = 'post')
    self.attention_mask = self.get_attention_mask(self.X)

  def get_text_lengths(self, masks):
    """Return, for each mask in `masks`, the sum of its entries as a Python number."""
    lengths = []
    for mask in masks:
      lengths.append(torch.sum(mask).item())
    return(lengths)

  def evaluate(self):
    """Predict a class for every attended position of every example.

    Returns:
      A list with one list of predicted class ids per example, restricted to
      positions where the attention mask is set and with the first and last
      of those positions removed. Also stored as `self.pred_tags`.
    """
    self.model.eval()
    predictions, masks = [], []
    for batch in self.dataloader:
      b_input_id, b_input_mask = tuple(t.to(device) for t in batch)

      with torch.no_grad():
        outputs = self.model(b_input_id, token_type_ids=None, attention_mask=b_input_mask)
      logits = outputs[0].detach().cpu().numpy()
      predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
      # Move the mask to host memory once per batch so the filtering below
      # does not compare device tensors one element at a time.
      masks.extend(b_input_mask.cpu().numpy())

    self.pred_tags = [[p_i for p_i, m_i in zip(p, m) if m_i != 0][1:-1] for p, m in zip(predictions, masks)]
    return self.pred_tags

In [None]:
# Shallow copies: SpanAvgF1 replaces the entries of the lists it is given, so
# `use_tokens` / `use_spans` keep their original word-level entries while `X`
# and `spans` are rewritten.
X, spans = use_tokens.copy(), use_spans.copy()


In [None]:
model_path = 'drive/MyDrive/Ensemble/ensemble1.pt'

# The checkpoint is a whole pickled model object (saved with torch.save(model)),
# so it must come from a trusted source. Tensors are mapped to the CPU on load.
modelorg = torch.load(
    model_path,
    map_location=torch.device('cpu'),
)

In [None]:
# Run the loaded model over the test tokens; `out[i]` holds the predicted
# class of every sub-token of example i.
model = modelorg
metric = SpanAvgF1(model, tokenizer, X, spans)
out = metric.evaluate()

In [None]:
# Project the per-sub-token predictions onto character positions of each text.
predicted_spans = []
for text, token_spans, token_preds in zip(use_text, spans, out):
  char_labels = [0] * len(text)
  prev_pred, prev_end = 0, 0
  # zip stops at the shorter list, so an example whose predictions were cut
  # short by the maximum sequence length no longer raises IndexError; its
  # remaining characters simply stay 0.
  for (start, end), pred in zip(token_spans, token_preds):
    for k in range(start, end):
      char_labels[k] = pred
    if prev_pred == 1 and pred == 1:
      # Two consecutive sub-tokens are both flagged: also flag the characters
      # between them (e.g. the gap separating two words).
      for k in range(prev_end, start):
        char_labels[k] = 1
    prev_pred, prev_end = pred, end
  predicted_spans.append(char_labels)
sp1 = predicted_spans

In [None]:
# For every text, collect the character positions whose label equals 1.
final_offsets = [
    [pos for pos, flag in enumerate(char_labels) if flag == 1]
    for char_labels in predicted_spans
]

In [None]:
save_path = 'drive/MyDrive/Ensemble/predicted.csv'

# One row per text: the predicted character offsets and the text itself.
data = dict(spans=final_offsets, text=use_text)
df = pd.DataFrame(data)

# Written without the index column.
df.to_csv(save_path, index=None)

In [None]:
test_path = 'drive/MyDrive/Ensemble/testfinal.csv' # enter the path to csv where the test spans are available 
pred_path='drive/MyDrive/Ensemble/predicted.csv'  # enter the path where is predicted.csv

import pandas as pd
import pickle

from ast import literal_eval

# The 'spans' column is parsed from its text form back into Python lists with
# literal_eval.
test = pd.read_csv(test_path)
test['spans'] = test['spans'].apply(literal_eval)

prediction = pd.read_csv(pred_path)
prediction['spans'] = prediction['spans'].apply(literal_eval)

# NOTE: `spans` is rebound here to the ground-truth character offsets.
spans = test['spans'].tolist() #list of lists
predicted = prediction['spans'].tolist() #list of lists 


#set1: predicted, set2: ground

def precision_(set1, set2):
    """Return the share of elements of `set1` (predicted) that also occur in `set2` (ground truth).

    Raises ZeroDivisionError when `set1` is empty.
    """
    hits = set1.intersection(set2)
    return len(hits) / len(set1)

def recall_(set1, set2):
    """Return the share of elements of `set2` (ground truth) that also occur in `set1` (predicted).

    Raises ZeroDivisionError when `set2` is empty.
    """
    hits = set1.intersection(set2)
    return len(hits) / len(set2)

def f1_(set1, set2):
    """Return the F1 score between predicted offsets `set1` and ground-truth offsets `set2`.

    Returns 1 when both sets are empty, 0 when exactly one of them is empty or
    when they share no element, and the harmonic mean of precision and recall
    otherwise.
    """
    n_pred, n_true = len(set1), len(set2)
    if n_pred == 0 and n_true == 0:
        return 1
    if n_pred == 0 or n_true == 0:
        return 0

    p = precision_(set1, set2)
    r = recall_(set1, set2)
    if (p+r) == 0:
        return 0
    return 2*p*r/(p+r)

# Per-text F1 between predicted and ground-truth character offsets, then the
# mean over all texts.
f1s = [f1_(set(predicted[idx]), set(spans[idx])) for idx in range(len(spans))]

print(sum(f1s)/len(f1s))

0.5798660102046644
