<a href="https://colab.research.google.com/github/alexandrosXe/context_toxicity/blob/master/Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Bert Model for Toxicity Detection**

# Install pkbar

In [50]:
!pip install pkbar



# Setup. Download the  Transformers library by Hugging Face:

In [0]:
!pip install -qq transformers

In [0]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import pkbar
from sklearn.metrics import *
import torch.autograd
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import WeightedRandomSampler
%matplotlib inline

In [0]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
#tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Early Stopping

In [0]:
class EarlyStopping:
  def __init__(self, patience=7, verbose=False, delta=0):
    self.patience = patience
    self.verbose = verbose
    self.counter = 0
    self.best_score = None
    self.early_stop = False
    self.val_loss_min = np.Inf
    self.delta = delta

  def __call__(self, val_loss, model):

    score = val_loss

    if self.best_score is None:
      self.best_score = score
      self.save_checkpoint(val_loss, model)
    elif score <= self.best_score + self.delta:
      self.counter += 1
      print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
      if self.counter >= self.patience:
        self.early_stop = True
    else:
      self.best_score = score
      self.save_checkpoint(val_loss, model)
      self.counter = 0

  def save_checkpoint(self, val_loss, model):
    if self.verbose:
      print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
      torch.save(model.state_dict(), 'checkpoint.pt')
      self.val_loss_min = val_loss

# Create a PyTorch dataset

In [0]:
class Toxic_Detection_Dataset(Dataset):
  def __init__(self, comments, targets, tokenizer, max_len):
    self.comments = comments
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.comments)

    
  def __getitem__(self, item):
    comment = str(self.comments[item])
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      comment,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return {
      'comment_text': comment,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

# **Bert MLP**

In [0]:
class BERT_MLP(nn.Module):
  def __init__(self,
                 trainable_layers=3,
                 max_seq_length=128,
                 show_summary=False,
                 label_list=[0, 1],
                 patience=3,
                 seed=42,
                 epochs=100,
                 save_predictions=False,
                 batch_size=32,
                 DATA_COLUMN="text",
                 LABEL_COLUMN="label",
                 DATA2_COLUMN=None,
                 lr=2e-05,
                 session=None,
                 loss=nn.BCELoss()
                 ):
    super(BERT_MLP, self).__init__()
    self.name = f'{"OOC1" if not DATA2_COLUMN else "OOC2"}-b{batch_size}.e{epochs}.len{max_seq_length}.bert'
    self.tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.lr = lr
    self.batch_size = batch_size
    self.DATA_COLUMN=DATA_COLUMN
    self.DATA2_COLUMN=DATA2_COLUMN
    self.LABEL_COLUMN=LABEL_COLUMN
    self.trainable_layers = trainable_layers
    self.max_seq_length = max_seq_length
    self.show_summary = show_summary
    self.label_list = label_list
    self.patience=patience
    self.save_predictions = save_predictions
    self.epochs = epochs
    self.loss = loss

    #Layers
    self.bert = BertModel.from_pretrained('bert-base-cased') # (PRE_TRAINED_MODEL_NAME)
    self.dense = nn.Linear(self.bert.config.hidden_size,128)
    #self.denseBn = nn.BatchNorm1d(128)
    self.tanh=nn.Tanh()
    self.output = nn.Linear(128,1)
    self.sigmoid=nn.Sigmoid()

    #if possible run in GPU else in CPU
    if torch.cuda.is_available():
      self.device = torch.device("cuda:0")  # you can continue going on here, like cuda:1 cuda:2....etc. 
      print("Running on the GPU")
    else:
      self.device = torch.device("cpu")
      print("Running on the CPU")

  def forward(self, input_ids, attention_mask):
    # Get output from bert decoder 
    cont_reps, pooled_output = self.bert(input_ids=input_ids,attention_mask=attention_mask)
    #cls_rep = cont_reps[:, 0]
    # feed it to dense NN 
    #output = self.dense(cls_rep)
    output = self.dense(pooled_output)
    #output=self.densebn(output)
    output = self.tanh(output)
    output = self.output(output)
    output = self.sigmoid(output)
    #print(output)
    return output.squeeze()

  def fit(self, train, val,optimizer,bert_weights=None): #pretrained_embeddings
    #Counter class imbalance by setting output layer bias to log(T/N)
    pos = sum(train.label)
    neg = len(train.label)-pos
    bias = np.log(pos/neg)
    print ("BIAS:", bias)
    bias=torch.tensor(bias)
    with torch.no_grad():
      self.output.bias=torch.nn.Parameter(bias.to(torch.float)) # set bias of last dense layer log(T/N)
    self.output.bias.requires_grad_(False)
    early_stopping = EarlyStopping(patience=self.patience, verbose=True)     #EARLYY STOPING
    self.to(self.device)
    i=1 #for progress bar
    for epoch in range(self.epochs):
      print('\nEpoch: %d/%d' % (epoch + 1, self.epochs))
      kbar = pkbar.Kbar(target=self.epochs, width=10)
      kbar_val=pkbar.Kbar(target=self.epochs, width=10)
      epoch_loss,epoch_Auc,epoch_accuracy=self.trainin(train,optimizer)
      kbar.update(i, values=[("loss",epoch_loss), ("accuracy",epoch_accuracy),("AUC_score",epoch_Auc)])  #("precision",epoch_precision),("recall",epoch_recall),("F1",epoch_F1)])
      # val_loss,val_AUC_score,val_accuracy,val_recall,val_precision,val_F1=self.evaluate(val)
      val_loss,val_AUC_score,val_accuracy=self.evaluate(val)
      print("\n Val auc score in epoch ",epoch+1, ":",val_AUC_score)
      early_stopping(val_AUC_score,self)   
      if early_stopping.early_stop: #check for early stopping
        print("Early stopping")
        print("_________________________________________________-")
        break
      kbar_val.update(i,values=[("val_loss",val_loss), ("val_accuracy",val_accuracy),("val_AUC_score",val_AUC_score)])#,("val_precision",val_precision),("val_recall",val_recall),("val_F1",val_F1)])
      i+=1

  def trainin(self,train,optimizer):
    #Create Data Loader for mini batch Training

    #UnderSampling
    majority_weight = 1/(6000-59)
    minority_weight = 1/59
    sample_weights = np.array([majority_weight, minority_weight])
    weights = sample_weights[train.label]
    sampler = WeightedRandomSampler(weights=weights, num_samples=len(weights), replacement=True)
    train_ds = Toxic_Detection_Dataset(comments=train.text.to_numpy(),targets=train.label.to_numpy(),tokenizer=self.tokenizer,max_len=self.max_seq_length)
    # train_dl=DataLoader(train_ds,batch_size=self.batch_size,shuffle=True)
    train_dl=DataLoader(train_ds,batch_size=self.batch_size,sampler=sampler)
    epoch_loss=0
    correct_predictions=0
    Y=torch.empty(0) #create empty torch to append predictions
    self.train()
    for d in train_dl:
      #Get Bert inputs 
      input_ids = d["input_ids"].to(self.device)
      attention_mask = d["attention_mask"].to(self.device)
      targets = d["targets"].to(self.device)
      outputs = self(input_ids=input_ids,attention_mask=attention_mask)
      weight = torch.tensor([0.1, 0.9]).to(self.device)
      weight_ = weight[targets.data.view(-1).long()].view_as(targets)
      criterion = nn.BCELoss(reduction='none')  #(reduce=False)
      loss = criterion(outputs, targets.to(torch.float))
      loss_class_weighted = loss * weight_
      loss_class_weighted = loss_class_weighted.mean()
      loss=loss_class_weighted
      #loss = self.loss(outputs, targets.to(torch.float))
      epoch_loss+=loss.item()
      #losses.append(loss.item())
      Y=torch.cat((Y,outputs.cpu()),0)
      loss.backward()
      nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)
      optimizer.step()
      optimizer.zero_grad()
    with torch.no_grad():
      predictions=Y
      rounded=torch.round(predictions).cpu().detach().numpy()
      predictions=predictions.cpu().detach().numpy()
      epoch_Auc=roc_auc_score(train.label.to_numpy(),predictions)
      epoch_Accuracy=accuracy_score(train.label.to_numpy(),rounded)
    return epoch_loss,epoch_Auc,epoch_Accuracy
  
  #Validation
  def evaluate(self,val):
    val_ds = Toxic_Detection_Dataset(comments=val.text.to_numpy(),targets=val.label.to_numpy(),tokenizer=self.tokenizer,max_len=self.max_seq_length)
    val_dl=DataLoader(val_ds,batch_size=self.batch_size,shuffle=True)
    val_loss=0
    correct_predictions=0
    Y=torch.empty(0) #create empty torch to append predictions
    self.eval()
    with torch.no_grad():   # compute validation loss
      for d in val_dl:
        input_ids = d["input_ids"].to(self.device)
        attention_mask = d["attention_mask"].to(self.device)
        targets = d["targets"].to(self.device)
        outputs = self(input_ids=input_ids,attention_mask=attention_mask)
        weight = torch.tensor([0.1, 0.9]).to(self.device)
        weight_ = weight[targets.data.view(-1).long()].view_as(targets)
        criterion = nn.BCELoss(reduction='none')  #(reduce=False)
        loss = criterion(outputs, targets.to(torch.float))
        loss_class_weighted = loss * weight_
        loss_class_weighted = loss_class_weighted.mean()
        loss=loss_class_weighted
        Y=torch.cat((Y,outputs.cpu()),0)
        #loss = self.loss(outputs, targets.to(torch.float))
        val_loss+=loss.item()
    predictions=Y
    rounded=torch.round(predictions).cpu().detach().numpy()
    predictions=predictions.cpu().detach().numpy()
    val_Auc=roc_auc_score(val.label.to_numpy(),predictions)
    val_Accuracy=accuracy_score(val.label.to_numpy(),rounded)
    return val_loss,val_Auc,val_Accuracy

  #predict on test data
  def predict(self,test):
    test_ds = Toxic_Detection_Dataset(comments=test.text.to_numpy(),targets=test.label.to_numpy(),tokenizer=self.tokenizer,max_len=self.max_seq_length)
    test_dl=DataLoader(test_ds,batch_size=self.batch_size,shuffle=True)
    Y=torch.empty(0) #create empty torch to append predictions
    self.eval()
    with torch.no_grad():   # compute validation loss
      for d in test_dl:
        input_ids = d["input_ids"].to(self.device)
        attention_mask = d["attention_mask"].to(self.device)
        targets = d["targets"].to(self.device)
        outputs = self(input_ids=input_ids,attention_mask=attention_mask)
        Y=torch.cat((Y,outputs.cpu()),0)
    predictions=Y
    return predictions 

  def Unfreeze_Last_K_Layers(self,k=3):
    ct = 0
    #Bert layers
    for child in self.bert.children():
      ct += 1
      if (ct <= (12-k)):
        #Bert's layer's parameters
        for param in child.parameters():
          param.requires_grad = False



#Load Data

In [0]:
dataN=pd.read_csv("dataset/oc.csv",header=0)
dataC=pd.read_csv("dataset/wc.csv",header=0)

# 5-fold MC Validation

In [0]:
def MC_Validation(dataset,k=5):
  avgscore=0
  for i in range(k):
    model= BERT_MLP(epochs=20)

    optimizer=optim.Adam(model.parameters(), lr=model.lr)

    # train @N models
    X_train, X_test = train_test_split(dataset, test_size=0.4,random_state=i)
    #X_train, X_val = train_test_split(X_train,test_size=0.25,random_state=i) # 0.25 x 0.8 = 0.2
    XC_train,X_test=train_test_split(dataC, test_size=0.2,random_state=i) # test dataset with C to train and test
    XC_train,X_val= train_test_split(XC_train,test_size=0.25,random_state=i) # 0.25 x 0.8 = 0.2

    # train @C models
    # X_train, X_test = train_test_split(dataset, test_size=0.4,random_state=i)
    # #X_train, X_val = train_test_split(X_train,test_size=0.25,random_state=i) # 0.25 x 0.8 = 0.2
    # X_val,X_test=train_test_split(X_test, test_size=0.5,random_state=i) # test dataset with C to train and test

    # Perform MC Validation
    model.fit(X_train,X_val,optimizer)  #bert_weights="bert_weights.h5"
    preds=model.predict(X_test)#X_test.text.to_numpy())
    preds=preds.cpu().detach().numpy()
    gold=X_test.label
    print("\n__________________________________\n") 
    score = roc_auc_score(gold, preds)
    print("AUC score in ",i+1," fold ",score)
    print("\n__________________________________") #to see results
    avgscore+=score
  avgscore/=k
  return avgscore

In [0]:
print("Average AUC score over 5fold MC validation is ",MC_Validation(dataset=dataN,k=5)) 

# Stratified MC validation 

In [82]:
#Make stratified split in @N dataset for train data

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.4, random_state=0)
X=dataN.text.to_numpy()
y=dataN.label.to_numpy()
train=list(sss.split(X,y))

#Make stratified split in @C dataset for val and test data
sss_Val = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=3)
X_val=dataC.text.to_numpy()
y_val=dataC.label.to_numpy()
val_and_test=list(sss_Val.split(X_val,y_val))
train_index,test_index = val_and_test[0]
X_train, X_val_test = X_val[train_index], X_val[test_index]
y_train, y_val_test = y_val[train_index], y_val[test_index]

# Now make stratified split in 40% of C dataset for val(20%) and test(20%)
sss_Val= StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)
val_and_test=list( sss_Val.split(X_val_test,y_val_test))
avgscore=0
for i in range(5):
  train_index, _ = train[i]
  val_index , test_index = val_and_test[i]
  X_train,y_train=X[train_index],y[train_index]
  X_val,y_val=X_val_test[val_index],y_val_test[val_index]
  X_test,y_test=X_val_test[test_index],y_val_test[test_index]
  
  # Create X_train , X_val and X_test Dataframes
  X_train=pd.DataFrame({'text': X_train,'label': y_train})
  X_val=pd.DataFrame({'text': X_val,'label': y_val})
  X_test=pd.DataFrame({'text': X_test,'label': y_test})
  #print(X_train.loc[X_train['label']==1].shape)
  # print(X_val.shape)
  # print(X_test.shape)
  model= BERT_MLP(epochs=20)
  model.Unfreeze_Last_K_Layers(k=3)
  optimizer=optim.Adam(model.parameters(), lr=model.lr)
  model.fit(X_train,X_val,optimizer)  #bert_weights="bert_weights.h5"
  preds=model.predict(X_test)
  preds=preds.cpu().detach().numpy()
  gold=X_test.label.to_numpy()
  print("\n__________________________________\n") 
  score = roc_auc_score(gold, preds)
  print("AUC score in ",i+1," fold ",score)
  print("\n__________________________________") #to see results
  avgscore+=score
result=avgscore/5
print("Average AUC score over 5fold MC validation is ",result) 

Running on the GPU
BIAS: -5.138316273042602

Epoch: 1/20
 1/20 [..........] - ETA: 28:12 - loss: 135.7944 - accuracy: 0.9797 - AUC_score: 0.4942
 Val auc score in epoch  1 : 0.5787478849407783
Validation loss decreased (inf --> 0.578748).  Saving model ...
 1/20 [..........] - ETA: 37:42 - val_loss: 4.9042 - val_accuracy: 0.3925 - val_AUC_score: 0.5787
Epoch: 2/20
 2/20 [>.........] - ETA: 13:18 - loss: 41.3832 - accuracy: 0.0280 - AUC_score: 0.5108
 Val auc score in epoch  2 : 0.542258883248731
EarlyStopping counter: 1 out of 3
 2/20 [>.........] - ETA: 17:37 - val_loss: 7.8768 - val_accuracy: 0.0150 - val_AUC_score: 0.5423
Epoch: 3/20
 3/20 [>.........] - ETA: 8:21 - loss: 34.8376 - accuracy: 0.0058 - AUC_score: 0.5010
 Val auc score in epoch  3 : 0.46881556683587144
EarlyStopping counter: 2 out of 3
 3/20 [>.........] - ETA: 11:04 - val_loss: 9.5162 - val_accuracy: 0.0150 - val_AUC_score: 0.4688
Epoch: 4/20
 4/20 [=>........] - ETA: 5:54 - loss: 33.1883 - accuracy: 0.0058 - AUC_scor

KeyboardInterrupt: ignored

In [0]:
#Get toxic comments and give them always to train set
toxics=dataN.loc[dataN['label']==1]
Non_toxics=dataN[dataN.label != 1]
Non_toxics=dataN[0:59]
#print(Non_toxics.shape)
# Non_toxics=dataN.loc[dataN['label']==0]
# Non_toxics=Non_toxics[0:100]
# #print(Non_toxics.label.head(100))
frames = [toxics,Non_toxics]
data = pd.concat(frames)
print(data.shape)


(118, 4)


In [0]:
avgscore=0
for i in range(5):
  model=BERT_MLP(epochs=20)
  model.Unfreeze_Last_K_Layers(k=3)
  optimizer=optim.Adam(model.parameters(), lr=model.lr)
  X_train, X_val = train_test_split(data, test_size=0.2,random_state=i)
  model.fit(data,X_val,optimizer)  #bert_weights="bert_weights.h5"
  preds=model.predict(data)
  preds=preds.cpu().detach().numpy()
  #preds=torch.round(preds).cpu().detach().numpy()
  gold=data.label.to_numpy()
  print("\n__________________________________\n") 
  score = roc_auc_score(gold, preds)
  print("AUC score in ",i+1," fold ",score)
  print("\n__________________________________") #to see results
  avgscore+=score
result=avgscore/5
print("Average AUC score over 5fold MC validation is ",result) 