# Prepare Env

In [None]:
# Install the third-party packages for this notebook. No versions are pinned,
# so each run pulls whatever release is current at that time.
!pip install tensorflow_addons
!pip install transformers
!pip install sentencepiece
!pip install torchvision

# Prepare Workspace

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertForSequenceClassification

import pandas as pd
import numpy as np
from tqdm import trange
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
# Name of the pretrained tokenizer to load.
# Alternatives that were tried: 'albert-large-v2', 'albert-base-v2'
tokenizer_model_name = 'bert-base-uncased'

# Root folder of the project; every other path is derived from it.
WORKSPACE = 'drive/MyDrive/HushUp/OffensiveTextClassifier'

# Directory of the model checkpoint that gets fine-tuned.
TEXT_MODEL_DIR = "{}/Models/BERT_5".format(WORKSPACE)

# CSV file holding the texts and their labels.
METADATA_FILE_PATH = '{}/compiled_data.csv'.format(WORKSPACE)

In [None]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

# Show which accelerator is in use. The device name is only queried when CUDA
# is available, because get_device_name() raises on a CPU-only machine.
torch.cuda.get_device_name(0) if device.type == "cuda" else "CPU (no CUDA device available)"

# Data Preprocessing

In [None]:
# Read the full dataset from the CSV configured above and report (rows, columns).
df = pd.read_csv(filepath_or_buffer=METADATA_FILE_PATH)
print(df.shape)

In [None]:
# Inputs are the raw texts, targets are their labels.
X = df['Text']
y = df['Label']

# First split: hold out 20% of the data, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.2,
    random_state=7,
    shuffle=True,
    stratify=y,
)

# Second split: cut the held-out part in half to get validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=7,
    shuffle=True,
    stratify=y_temp,
)

In [None]:
# Print the shape of the inputs and of the targets for every split.
for heading, features, targets in (
    ("Train shapes", X_train, y_train),
    ("Test shapes", X_test, y_test),
    ("Val shapes", X_val, y_val),
):
    print(heading)
    print(features.shape)
    print(targets.shape)

In [None]:
def load_data(sentences, labels=None, max_len=128, batch_size=4):
    """Tokenize `sentences` and wrap them in a sequential DataLoader.

    Args:
        sentences: Iterable of texts; every item is converted with `str()`.
        labels: Optional iterable of labels convertible with `int()`. When it
            is `None` or empty, the batches contain only
            `(input_ids, attention_mask)`; otherwise they contain
            `(input_ids, attention_mask, labels)`.
        max_len: Length every sequence is padded or truncated to, special
            tokens included.
        batch_size: Number of samples per batch.

    Returns:
        A `DataLoader` that yields the samples in their original order.
    """
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_name) # AlbertTokenizer.from_pretrained(tokenizer_model_name)

    texts = [str(sentence) for sentence in sentences]

    # Let the tokenizer add [CLS]/[SEP], pad and truncate in one step. Adding
    # the special tokens by hand and truncating afterwards would cut off the
    # closing [SEP] of every text longer than `max_len` tokens.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )

    tensors = [encoded["input_ids"], encoded["attention_mask"]]

    # `len()` works for lists, arrays and Series alike.
    if labels is not None and len(labels) > 0:
        tensors.append(torch.tensor([int(label) for label in labels]))

    prediction_data = TensorDataset(*tensors)
    prediction_sampler = SequentialSampler(prediction_data)
    return DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

# Fine Tuning

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose arg-max matches `labels`.

    `preds` holds one row of class scores per sample; `labels` is an array of
    the true class ids and is flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    n_correct = (predicted == expected).sum()
    return n_correct / expected.shape[0]

In [None]:
def fine_tune(X_train, y_train, X_val, y_val, epochs=4, lr=2e-5,
              max_grad_norm=1.0, num_warmup_steps=200):
  """Fine-tune the classifier stored in TEXT_MODEL_DIR and validate it each epoch.

  Args:
    X_train, y_train: Training texts and labels, passed to `load_data`.
    X_val, y_val: Validation texts and labels, passed to `load_data`.
    epochs: Number of passes over the training data.
    lr: Peak learning rate of the AdamW optimizer.
    max_grad_norm: Gradients are clipped to this norm before every step.
    num_warmup_steps: Length of the linear warm-up of the learning rate.

  Returns:
    `(model, train_loss_set, val_loss_set)`: the fine-tuned model plus the
    per-batch training and validation losses collected over all epochs.
  """
  model = BertForSequenceClassification.from_pretrained(TEXT_MODEL_DIR) # AlbertForSequenceClassification.from_pretrained(TEXT_MODEL_DIR)
  # Use the notebook-wide `device` so the function also runs without CUDA.
  model.to(device)

  # NOTE(review): load_data builds its loaders with a SequentialSampler, so
  # the training batches are visited in the same order in every epoch.
  train_dataloader = load_data(X_train, y_train)
  validation_dataloader = load_data(X_val, y_val)

  # Biases and LayerNorm weights are excluded from weight decay. 'gamma' and
  # 'beta' are kept for models that still use the legacy parameter names.
  param_optimizer = list(model.named_parameters())
  no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
  # The optimizer reads the per-group key 'weight_decay'; any other key name
  # is ignored and the group silently falls back to the default of 0.
  optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
       'weight_decay': 0.01},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
       'weight_decay': 0.0}
  ]

  num_training_steps = epochs * len(train_dataloader)

  optimizer = AdamW(optimizer_grouped_parameters, lr=lr, correct_bias=False)
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

  # Store our loss for plotting
  train_loss_set = []
  val_loss_set = []

  # trange is a tqdm wrapper around the normal python range
  for _ in trange(epochs, desc="Epoch"):
    # ---- Training ----
    model.train()
    tr_loss = 0

    for step, batch in enumerate(train_dataloader):
      # Move the batch to the target device and unpack it
      b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

      # Clear out the gradients (by default they accumulate)
      optimizer.zero_grad()

      # Forward pass; passing the labels makes the model return the loss
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      loss = outputs.loss
      batch_loss = loss.item()
      train_loss_set.append(batch_loss)

      # Backward pass, gradient clipping, then optimizer and scheduler step
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
      optimizer.step()
      scheduler.step()

      tr_loss += batch_loss

    print("Train loss: {}".format(tr_loss/len(train_dataloader)))

    # ---- Validation ----
    model.eval()
    eval_loss, eval_accuracy = 0, 0

    for batch in validation_dataloader:
      b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

      # No gradients are needed for validation
      with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        logits = outputs.logits
        batch_loss = outputs.loss.item()
        val_loss_set.append(batch_loss)

      # Move logits and labels to CPU
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()

      eval_accuracy += flat_accuracy(logits, label_ids)
      eval_loss += batch_loss

    # Both validation metrics are reported once per epoch.
    print("Validation loss: {}".format(eval_loss/len(validation_dataloader)))
    print("Validation Accuracy: {}".format(eval_accuracy/len(validation_dataloader)))

  return (model, train_loss_set, val_loss_set)

In [None]:
# Run the fine-tuning and keep the model together with both loss histories.
model, train_loss_set, val_loss_set = fine_tune(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

In [None]:
# Persist the fine-tuned weights and config next to the other models.
fine_tuned_model_dir = WORKSPACE + "/Models/BERT_fine_tune_5"
model.save_pretrained(fine_tuned_model_dir)

In [None]:
# Plot the per-batch losses recorded during fine-tuning. The two lists have
# one entry per processed batch, so they generally differ in length.
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(train_loss_set, 'b-', label="Training loss")
plt.plot(val_loss_set, 'g-', label="Validation loss")
# Without a legend the `label=` arguments above are never displayed.
plt.legend()
plt.show()

# Evaluate

In [None]:
def get_predictions(model, sentences):
  """Return the predicted class id for every text in `sentences`.

  The model is switched to evaluation mode, the texts are batched with
  `load_data` (no labels), and the arg-max of the logits of each batch is
  collected into one flat list.
  """
  model.eval()
  unlabeled_loader = load_data(sentences)

  predicted_ids = []
  # Gradients are not needed for inference.
  with torch.no_grad():
    for input_ids, input_mask in unlabeled_loader:
      input_ids = input_ids.to(device)
      input_mask = input_mask.to(device)

      result = model(input_ids, token_type_ids=None, attention_mask=input_mask, return_dict=True)

      # Bring the logits back to the CPU as a numpy array
      batch_logits = result.logits.detach().cpu().numpy()
      predicted_ids.extend(np.argmax(batch_logits, axis=1))

  return predicted_ids

In [None]:
# Predict a label for every test text and print how many predictions came back.
predictions = get_predictions(model=model, sentences=X_test)
print(len(predictions))

In [None]:
# Per-class precision / recall / F1 on the test split; undefined ratios count as 0.
report = classification_report(
    y_true=y_test,
    y_pred=predictions,
    zero_division=0,
)
print(report)

In [None]:
# Confusion matrix of the test split with the class order fixed to [0, 1].
cm = confusion_matrix(y_true=y_test, y_pred=predictions, labels=[0,1])
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["non-offensive","offensive"],
)
cm_display.plot(values_format='')

In [None]:
# Collect the original rows of every misclassified test sample.
# `predictions` is in the same order as `y_test`, so an element-wise comparison
# gives a boolean mask over the test split.
is_incorrect = y_test.to_numpy() != np.asarray(predictions)

# y_test keeps the index labels of `df`, so the rows are looked up by label
# (`.loc`) in a single selection instead of being appended one at a time.
incorrect_predictions_df = df.loc[y_test.index[is_incorrect]]

print(incorrect_predictions_df.shape)