<a href="https://colab.research.google.com/github/architb1703/Toxic_Span/blob/archit/Span_Avg_F1_Metric.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers==2.6.0
!pip install seqeval
!pip install urllib3 --upgrade

Collecting transformers==2.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/4c/a0/32e3a4501ef480f7ea01aac329a716132f32f7911ef1c2fac228acc57ca7/transformers-2.6.0-py3-none-any.whl (540kB)
[K     |▋                               | 10kB 16.9MB/s eta 0:00:01[K     |█▏                              | 20kB 18.5MB/s eta 0:00:01[K     |█▉                              | 30kB 13.9MB/s eta 0:00:01[K     |██▍                             | 40kB 13.4MB/s eta 0:00:01[K     |███                             | 51kB 11.3MB/s eta 0:00:01[K     |███▋                            | 61kB 9.6MB/s eta 0:00:01[K     |████▎                           | 71kB 10.6MB/s eta 0:00:01[K     |████▉                           | 81kB 11.7MB/s eta 0:00:01[K     |█████▌                          | 92kB 11.1MB/s eta 0:00:01[K     |██████                          | 102kB 10.8MB/s eta 0:00:01[K     |██████▋                         | 112kB 10.8MB/s eta 0:00:01[K     |███████▎                    

In [2]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm, trange

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchtext import data
import transformers
from transformers import BertTokenizer, BertConfig, BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup

from keras.preprocessing.sequence import pad_sequences
from seqeval.metrics import accuracy_score
from sklearn.metrics import f1_score, confusion_matrix

torch.manual_seed(42)
np.random.seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



In [5]:
#Load data
train_path = '/content/drive/My Drive/ToxicSpan_CS669V/BERT_Preprocess/train.pkl'
val_path = '/content/drive/My Drive/ToxicSpan_CS669V/BERT_Preprocess/val.pkl'

with open(train_path, 'rb') as f:
  train_data = pickle.load(f)
  f.close()

with open(val_path, 'rb') as f:
  val_data = pickle.load(f)
  f.close()

In [6]:
#Class to take in tokenizer and model along with input data and calculate the span average f1 score
class SpanAvgF1():
  def __init__(self, model, tokenizer, X, y, spans, target_spans):
    self.model = model
    self.model.cuda()
    self.tokenizer = tokenizer
    self.X = X
    self.y = y
    self.spans = spans
    self.target_spans = target_spans
    self.prepare_data()

    self.X = torch.tensor(self.X)
    self.y = torch.tensor(self.y)
    self.attention_mask = torch.tensor(self.attention_mask)

    data = TensorDataset(self.X, self.attention_mask, self.y)
    self.dataloader = DataLoader(data, batch_size=16, shuffle=False)

    self.evaluate()
  
  def tokenize_data(self, x, y, s):
    sentence = []
    labels = []
    spans = []
    for i in range(len(x)):
      word = x[i]
      label = y[i]
      tokenized_word = self.tokenizer.tokenize(word)
      sentence.extend(tokenized_word)
      labels.extend([label for k in range(len(tokenized_word))])
      curr = s[i][0]
      spans.append([curr, curr+len(tokenized_word[0])])
      curr += len(tokenized_word[0])
      for j in range(len(tokenized_word)-1):
        spans.append([curr, curr+len(tokenized_word[j+1])-2])
        curr += len(tokenized_word[j+1])-2
      spans[-1][-1] = s[i][1]
    return(sentence, labels, spans)

  def get_attention_mask(self, x):
    return([[(i!=0) for i in text] for text in x])

  def prepare_data(self):
    for i in range(len(self.X)):
      self.X[i], self.y[i], self.spans[i] = self.tokenize_data(self.X[i], self.y[i], self.spans[i])
    self.X = pad_sequences([tokenizer.convert_tokens_to_ids(text) for text in self.X], maxlen = 500, dtype='long', value=0.0, truncating='post', padding = 'post')
    self.y = pad_sequences(self.y, maxlen=500, value=2, dtype='long', truncating='post', padding='post')
    self.attention_mask = self.get_attention_mask(self.X)
  
  def evaluate(self):
    predictions , true_labels = [], []
    for batch in self.dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_id, b_input_mask, b_labels = batch
      self.model.eval()
      with torch.no_grad():
        outputs = self.model(b_input_id, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      
      logits = outputs[1].detach().cpu().numpy()
      predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
      true_labels.extend(b_labels)

    self.pred_tags = [[p_i for p_i, l_i in zip(p, l) if l_i != 2] for p, l in zip(predictions, true_labels)]

  def f1score(self):
    self.f1list = []
    self.predicted_spans = []
    f1 = 0
    for i in range(len(self.spans)):
      s = [0 for k in range(self.spans[i][-1][1])]
      prev = 0
      for j in range(len(self.spans[i])):
        for k in range(self.spans[i][j][0], self.spans[i][j][1]):
          s[k] = self.pred_tags[i][j]
        if(prev==1 and self.pred_tags[i][j]==1):
          for k in range(self.spans[i][j-1][1], self.spans[i][j][0]):
            s[k] = 1
        prev = self.pred_tags[i][j]
      self.f1list.append(f1_score(self.target_spans[i], s))
      f1 += f1_score(self.target_spans[i], s, zero_division=1)
      self.predicted_spans.append(s)
    return(f1/len(self.X))

In [None]:
X = val_data['token_final'].values
Y = val_data['target_final'].values
spans = val_data['span_final'].values
target_spans = []
for i in range(len(X)):
  s = [0 for j in range(spans[i][-1][-1])]
  for k in val_data['spans'][i]:
    s[k] = 1
  target_spans.append(s)

In [None]:
model = torch.load('/content/drive/My Drive/bert_base_cased_best.pt', map_location=torch.device('cpu'))
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case = False)

In [None]:
metric = SpanAvgF1(model, tokenizer, X, Y, spans, target_spans)
metric.f1score()

  average, "true nor predicted", 'F-score is', len(true_sum)


0.6571616805131474

In [None]:
val_data['token_final']

0      [Fuck, ##ing, Left, ##ist, He, ##bes, ,, alway...
1      [Because, plants, are, D, ##AN, ##GE, ##RO, ##...
2      [Their, is, so, much, additional, garbage, tag...
3      [Are, there, really, enough, red, neck, idiot,...
4      [Good, points, A, dumb, crude, guy, in, a, dum...
                             ...                        
789    [It, never, passes, comment, review, but, I, g...
790    [I, was, n, ', t, there, and, support, it, %, ...
791    [funny, how, these, churches, want, to, protec...
792    [Typical, lying, protest, ##or, They, ex, ##ag...
793    [He, trans, ##cend, ##ed, the, ', civil, ##iti...
Name: token_final, Length: 794, dtype: object

In [None]:
#Generating csv file to compare ground truth and predicted spans
text = []
truth_text = []
pred_text = []

for i in range(len(val_data['text'])):
  text.append(val_data['text'][i])
  arr = []
  for j,k in enumerate(val_data['target_final'][i]):
    if(k == 1):
      arr.append(val_data['token_final'][i][j])
  truth_text.append(arr)
  arr = []
  x = tokenizer.convert_ids_to_tokens(metric.X[i])
  for j,k in enumerate(metric.pred_tags[i]):
    if(k == 1):
      arr.append(x[j])
  pred_text.append(arr)

In [None]:
df = pd.DataFrame(list(zip(text, truth_text, pred_text)), 
               columns =['Text', 'Ground Truth', 'Predicted'])
df.to_csv('/content/drive/My Drive/ToxicSpan_CS669V/processed/bert_base_cased_best_answer.csv')