<a href="https://colab.research.google.com/github/architb1703/Toxic_Span/blob/archit/Span_Avg_F1_Metric.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers==2.6.0
!pip install seqeval
!pip install urllib3 --upgrade
!pip install pytorch-crf

Collecting transformers==2.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/4c/a0/32e3a4501ef480f7ea01aac329a716132f32f7911ef1c2fac228acc57ca7/transformers-2.6.0-py3-none-any.whl (540kB)
[K     |████████████████████████████████| 542kB 12.1MB/s 
[?25hCollecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/2f/f5/aeb4d65266f7712a627674bd19994cee3e1c66ff588adbc4db3fc0bbbf97/boto3-1.16.34-py2.py3-none-any.whl (129kB)
[K     |████████████████████████████████| 133kB 38.7MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 56.7MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl

In [None]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm, trange

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchtext import data
import transformers
from transformers import BertTokenizer, BertConfig, BertForTokenClassification, AdamW, BertModel

from torchcrf import CRF
from transformers import get_linear_schedule_with_warmup

from keras.preprocessing.sequence import pad_sequences
from seqeval.metrics import accuracy_score
from sklearn.metrics import f1_score, confusion_matrix

# Pin the NumPy and PyTorch random number generators so repeated runs of the
# notebook draw the same random numbers.
np.random.seed(42)
torch.manual_seed(12)

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')



In [None]:
# Mount Google Drive inside the Colab VM; every data, checkpoint and output path
# used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Load data
train_path = '/content/drive/MyDrive/ToxicSpan_CS669V/BERT_Preprocess/train.pkl'
val_path = '/content/drive/MyDrive/ToxicSpan_CS669V/BERT_Preprocess/val.pkl'

# Alternative preprocessed splits; uncomment to evaluate on these files instead.
# train_path = '/content/drive/MyDrive/ToxicSpan_CS669V/processed/finaltrain.pkl'
# val_path = '/content/drive/MyDrive/ToxicSpan_CS669V/processed/finaldev.pkl'

# NOTE: pickle.load can execute arbitrary code, so only point these paths at
# files you produced yourself.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open(train_path, 'rb') as f:
  train_data = pickle.load(f)

with open(val_path, 'rb') as f:
  val_data = pickle.load(f)

In [None]:
#Class to take in tokenizer and model along with input data and calculate the span average f1 score
class SpanAvgF1():
  """Score a token-tagging model with the per-example, character-level F1.

  Parameters
  ----------
  model : the network to evaluate (see `evaluate` for the supported call styles).
  tokenizer : object exposing `tokenize(word)` and `encode(tokens)`.
  X : per-example lists of words.
  y : per-example lists of word labels (1 is written into the character mask
      as toxic; 2 is used here as the padding label).
  spans : per-example lists of `[start, end)` character offsets, one per word.
  target_spans : per-example 0/1 character masks holding the ground truth.
  maxlen : length every sequence is padded / truncated to.
  batch_size : evaluation batch size (defaults to the previously fixed 16).

  Note: `prepare_data` assigns the word-piece level tokens, labels and spans
  back into `X[i]`, `y[i]` and `spans[i]`, i.e. into the containers the caller
  passed in. That write-back is kept on purpose.
  """

  def __init__(self, model, tokenizer, X, y, spans, target_spans, maxlen=500, batch_size=16):
    # Pick the device here instead of calling model.cuda() unconditionally, so
    # the metric also runs on machines without a GPU.
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.model = model
    self.model.to(self.device)
    self.tokenizer = tokenizer
    self.maxlen = maxlen
    self.X = X
    self.y = y
    self.spans = spans
    self.target_spans = target_spans
    self.prepare_data()

    self.X = torch.tensor(self.X)
    self.y = torch.tensor(self.y)
    self.attention_mask = torch.tensor(self.attention_mask)

    data = TensorDataset(self.X, self.attention_mask, self.y)
    # shuffle=False keeps predictions in the same order as spans/target_spans.
    self.dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)

  def tokenize_data(self, x, y, s):
    """Split every word into word pieces and spread its label and span over them.

    Returns `(sentence, labels, spans)` where `sentence` is the flat list of
    word pieces, `labels` has one entry per piece plus a leading and a trailing
    0 (placeholders for the two special tokens `encode` adds), and `spans` has
    one `[start, end]` character range per piece.
    """
    sentence = []
    labels = [0]
    spans = []
    for word, label, word_span in zip(x, y, s):
      pieces = self.tokenizer.tokenize(word)
      if not pieces:
        # Nothing was produced for this word (e.g. it was whitespace only):
        # there is no piece to label or to attach a span to.
        continue
      sentence.extend(pieces)
      labels.extend([label] * len(pieces))

      # The first piece starts where the word starts.
      curr = word_span[0]
      spans.append([curr, curr + len(pieces[0])])
      curr += len(pieces[0])
      # Later pieces are assumed to carry a 2-character continuation prefix
      # ("##") that is not part of the original text, hence the -2.
      for piece in pieces[1:]:
        spans.append([curr, curr + len(piece) - 2])
        curr += len(piece) - 2
      # Snap the last piece to the true end of the word.
      spans[-1][-1] = word_span[1]
    labels.append(0)
    return(sentence, labels, spans)

  def get_attention_mask(self, x):
    """Return a nested list that is True wherever the token id is non-zero."""
    return([[(i!=0) for i in text] for text in x])

  def prepare_data(self):
    """Tokenize all examples, then build padded ids, labels and attention mask."""
    for i in range(len(self.X)):
      # Deliberately writes back into the containers received from the caller.
      self.X[i], self.y[i], self.spans[i] = self.tokenize_data(self.X[i], self.y[i], self.spans[i])
    # Use the tokenizer handed to the constructor, not a notebook-level global.
    self.X = pad_sequences([self.tokenizer.encode(text) for text in self.X], maxlen = self.maxlen, dtype='long', value=0.0, truncating='post', padding = 'post')
    # Label 2 marks padding positions; `evaluate` filters on it.
    self.y = pad_sequences(self.y, maxlen=self.maxlen, value=2, dtype='long', truncating='post', padding='post')
    self.attention_mask = self.get_attention_mask(self.X)

  def get_text_lengths(self, masks):
    """Return the sum of every mask row as a list of Python numbers."""
    return([torch.sum(mask).item() for mask in masks])

  def evaluate(self, flag):
    """Run the model over all batches and store per-piece tags in `self.pred_tags`.

    flag == 1: model is called with `labels=` and logits are read from `outputs[1]`.
    flag == 2: model is called as `model(ids, mask)` and returns logits.
    otherwise: model is called as `model(ids, mask, None, True)` and returns
               already decoded tag sequences.
    """
    predictions , true_labels = [], []
    self.model.eval()
    for batch in self.dataloader:
      b_input_id, b_input_mask, b_labels = tuple(t.to(self.device) for t in batch)

      # No branch needs gradients; this also covers the decode-style branch.
      with torch.no_grad():
        if(flag==1):
          outputs = self.model(b_input_id, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
          logits = outputs[1].detach().cpu().numpy()
          predictions.extend([list(p) for p in np.argmax(logits, axis=2)])

        elif(flag==2):
          out = self.model(b_input_id, b_input_mask)
          # Softmax is monotonic, so argmax over the raw scores picks the same tag.
          predictions.extend(torch.argmax(out, dim=2).to('cpu').numpy())

        else:
          preds = self.model(b_input_id, b_input_mask, None, True)
          predictions.extend([p for p in preds])

      # Keep labels on the CPU so the filtering below does not compare
      # device tensors element by element.
      true_labels.extend(b_labels.cpu().numpy())
    if(flag!=3):
      # Drop padded positions (label 2), then the first and last remaining
      # entries, which belong to the special tokens.
      self.pred_tags = [[p_i for p_i, l_i in zip(p, l) if l_i != 2][1:-1] for p, l in zip(predictions, true_labels)]
    else:
      # NOTE(review): decoded sequences are stored unchanged here; if they still
      # contain tags for the two special tokens they are shifted by one relative
      # to self.spans in f1score -- confirm against the model's decode output.
      self.pred_tags = predictions

  def f1score(self):
    """Project tags onto characters and return the mean per-example F1.

    Side effects: fills `self.f1list` (per-example scores, computed with the
    same `zero_division=1` setting as the returned mean) and
    `self.predicted_spans` (per-example character masks).
    """
    self.f1list = []
    self.predicted_spans = []
    f1 = 0
    for i in range(len(self.spans)):
      # The ground-truth mask already has one entry per character of the text.
      s = [0] * len(self.target_spans[i])
      prev = 0
      # Truncation to maxlen can leave fewer tags than spans; pieces without a
      # prediction keep the default 0.
      n_tagged = min(len(self.spans[i]), len(self.pred_tags[i]))
      for j in range(n_tagged):
        tag = self.pred_tags[i][j]
        start, end = self.spans[i][j][0], self.spans[i][j][1]
        for k in range(start, end):
          s[k] = tag
        if(prev==1 and tag==1):
          # Two consecutive toxic pieces: also mark the characters between them.
          for l in range(self.spans[i][j-1][1], start):
            s[l] = 1

        prev = tag
      # Score once and reuse it, so f1list and the average agree.
      score = f1_score(self.target_spans[i], s, zero_division=1)
      self.f1list.append(score)
      f1 += score
      self.predicted_spans.append(s)
    if(len(self.spans)==0):
      return(0.0)
    return(f1/len(self.spans))

In [None]:
# Validation inputs for the metric: words, word labels and word character spans.
X = val_data['token_final'].values
Y = val_data['target_final'].values
spans = val_data['span_final'].values

# Ground truth as one 0/1 flag per character of the raw text.
# Rows are walked positionally (like X/Y/spans above) rather than through
# label lookups, so the masks stay aligned whatever index val_data carries.
target_spans = []
for text, toxic_offsets in zip(val_data['text'].values, val_data['spans'].values):
  s = [0] * len(text)
  for k in toxic_offsets:
    s[k] = 1
  target_spans.append(s)

In [None]:
class BiLSTM(nn.Module):
  """Encoder followed by a bidirectional LSTM and a per-token linear classifier.

  Parameters
  ----------
  bert : module called as `bert(inputs, masks)`; element 0 of its output is
      used as the per-token representation.
  input_dim : size of that per-token representation.
  num_labels : number of output classes per token.
  hidden_dim : LSTM hidden size per direction.
  lstm_layers : number of stacked LSTM layers.
  dropout : dropout passed to `nn.LSTM`.
  """
  def __init__(self, bert, input_dim, num_labels, hidden_dim, lstm_layers, dropout):
    super().__init__()
    self.input_dim = input_dim
    self.num_labels = num_labels
    self.hidden_dim = hidden_dim
    self.bert = bert
    self.bilstm = nn.LSTM(input_dim, hidden_dim, lstm_layers, bidirectional=True, dropout=dropout, batch_first=True)
    # Forward and backward states are concatenated, hence hidden_dim*2.
    self.fc = nn.Linear(hidden_dim*2, num_labels)

  def forward(self, inputs, masks, labels=None, bilstm=False):
    """Return scores of shape (batch, sequence length of `inputs`, num_labels).

    `labels` and `bilstm` are accepted for call compatibility and not used.
    """
    outputs = self.bert(inputs, masks)
    seq_out = outputs[0]

    # Number of real tokens per row; pack_padded_sequence wants these on the CPU.
    lengths = masks.sum(dim=1).long().cpu()
    packed = nn.utils.rnn.pack_padded_sequence(seq_out, lengths, batch_first=True, enforce_sorted=False)
    x, _ = self.bilstm(packed)
    # Pad back to the width of the input batch instead of a fixed 300, so the
    # output lines up with the labels for any padding length.
    x, _ = nn.utils.rnn.pad_packed_sequence(x, total_length=inputs.size(1), batch_first=True)
    x = self.fc(x)
    return(x)

In [None]:
class BertCRFModel(nn.Module):
  """BERT encoder with an optional BiLSTM, a linear layer and a CRF on top.

  Parameters
  ----------
  bert_model : name or path handed to `BertModel.from_pretrained`.
  num_labels : number of tags scored by the linear layer and the CRF.
  bilstm : when truthy, a bidirectional LSTM is inserted between BERT and the
      linear layer.
  """
  def __init__(self, bert_model, num_labels, bilstm):
    super().__init__()
    self.bert_model = bert_model
    self.num_labels = num_labels
    self.bert = BertModel.from_pretrained(self.bert_model, output_attentions=False, output_hidden_states=False)
    self.crf = CRF(self.num_labels, batch_first=True)
    self.dropout = nn.Dropout(0.7)
    if(bilstm):
      self.bilstm = nn.LSTM(self.bert.config.hidden_size, self.num_labels, bidirectional=True, num_layers=1, batch_first=True)
      # Both LSTM directions are concatenated, hence num_labels*2 inputs.
      self.fc = nn.Linear(self.num_labels*2, self.num_labels)
    else:
      self.fc = nn.Linear(self.bert.config.hidden_size, self.num_labels)

  def forward(self, inputs, masks, labels=None, bilstm=False):
    """Return decoded tag sequences, preceded by the CRF loss when `labels` is given.

    Returns `preds` if `labels` is None, otherwise `(loss, preds)`. `preds` is
    whatever `self.crf.decode` returns for the masked positions.
    """
    outputs = self.bert(inputs, masks)
    seq_out = outputs[0]

    # The layer sizes were fixed at construction time, so a model built with
    # the LSTM always has to go through it; the `bilstm` argument is still
    # honoured for callers that pass it.
    if(bilstm or hasattr(self, 'bilstm')):
      # Number of real tokens per row; pack_padded_sequence wants these on the CPU.
      lengths = masks.sum(dim=1).long().cpu()
      packed = nn.utils.rnn.pack_padded_sequence(seq_out, lengths, batch_first=True, enforce_sorted=False)
      x, _ = self.bilstm(packed)
      # Pad back to the width of the input batch rather than a fixed 500.
      x, _ = nn.utils.rnn.pad_packed_sequence(x, total_length=inputs.size(1), batch_first=True)
      x = self.fc(x)
    else:
      # Dropout has to run before the projection to have any effect; it is the
      # identity in eval mode, so inference results are unchanged.
      x = self.fc(self.dropout(seq_out))

    masks = masks.type(torch.uint8)
    preds = self.crf.decode(x, mask=masks)
    if(labels is not None):
      loss = -self.crf(F.log_softmax(x, dim=2), labels, mask=masks, reduction='mean')
      return loss, preds
    return preds

In [None]:
# Load the saved checkpoint onto the CPU first (map_location); SpanAvgF1 moves
# the model to its compute device afterwards.
# NOTE(review): presumably the file holds a whole pickled model, in which case
# the model classes defined above must exist before this cell runs -- confirm.
model = torch.load('/content/drive/MyDrive/semi_sup-5.pt', map_location=torch.device('cpu'))
# Cased BERT vocabulary; do_lower_case=False keeps the input casing.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case = False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [None]:
# Build the evaluator on the validation split, padding/truncating to 500 tokens.
metric = SpanAvgF1(model, tokenizer, X, Y, spans, target_spans, 500)
# flag 1: the model is called with labels= and logits are read from outputs[1].
metric.evaluate(1)
# Mean per-example character-level F1; displayed as the cell output.
metric.f1score()

  average, "true nor predicted", 'F-score is', len(true_sum)


0.6882755826103494

In [None]:
# Spot check for example 4: predicted tags, stored target labels and tokens.
metric.pred_tags[4], val_data['target_final'][4], val_data['token_final'][4]

([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 ['Good',
  'points',
  'A',
  'dumb',
  'crude',
  'guy',
  'in',
  'a',
  'dumb',
  'crude',
  'sport',
  'That',
  "'",
  's',
  'what',
  'Trump',
  'has',
  'done',
  'to',
  'politics',
  'I'])

In [None]:
# Save tokens next to the model's predicted tags.
# Note that the 'target_final' column of this file holds predictions, not gold labels.
df = pd.DataFrame(list(zip(val_data['token_final'], metric.pred_tags)), 
               columns =['token_final', 'target_final'])
df.to_csv('/content/drive/My Drive/ToxicSpan_CS669V/processed/val_semi_new.csv')

In [None]:
#Generating csv file to compare ground truth and predicted spans
text, truth_text, pred_text = [], [], []

for i in range(len(val_data['text'])):
  tokens = val_data['token_final'][i]
  # The stored labels carry one extra entry at each end (presumably the
  # placeholders for the two special tokens -- TODO confirm); drop them so the
  # positions line up with the tokens.
  gold_tags = val_data['target_final'][i][1:-1]

  text.append(val_data['text'][i])
  # Tokens whose gold label is 1.
  truth_text.append([tokens[pos] for pos, tag in enumerate(gold_tags) if tag == 1])
  # Tokens whose predicted tag is 1.
  pred_text.append([tokens[pos] for pos, tag in enumerate(metric.pred_tags[i]) if tag == 1])

In [None]:
# One row per validation text: the raw text, the tokens labelled 1 in the
# ground truth and the tokens the model tagged as 1.
df = pd.DataFrame(list(zip(text, truth_text, pred_text)), 
               columns =['Text', 'Ground Truth', 'Predicted'])
df.to_csv('/content/drive/My Drive/ToxicSpan_CS669V/processed/semi_sup_it3.csv')