# Prepare Env

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package,
# i.e. a Colab runtime). The call has side effects only: it makes the Drive
# contents reachable through the local file system.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install torch torchvision
!pip install transformers

# Prepare Workspace

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

from tqdm import trange
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Show which accelerator is in use. get_device_name(0) raises when no CUDA
# device exists, so only query it when CUDA is actually available.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

'Tesla K80'

In [None]:
# Project folder (relative path inside the mounted Drive) and the files/models used below.
WORKSPACE = "drive/MyDrive/HushUp/OffensiveTextClassifier"
# CSV with the labelled texts.
DATA_FILE = f"{WORKSPACE}/text_data.csv"
# Name (or path) of the pretrained checkpoint handed to from_pretrained().
MODEL_DIR = "bert-base-uncased"

# Data Preprocessing

In [None]:
# Read the labelled dataset and show its dimensions plus five random rows.
df = pd.read_csv(DATA_FILE)
print(df.shape, df.sample(5), sep="\n")

In [None]:
# Build the model inputs: every text is wrapped in the [CLS] ... [SEP] markers
# (str() guards against non-string cells in the 'Text' column).
sentences = ["[CLS] " + str(text) + " [SEP]" for text in np.array(df['Text'])]

# Class labels as plain Python ints, in the same order as `sentences`.
labels = list(map(int, np.array(df['Label'])))

In [None]:
# Load the lower-casing BERT tokenizer and split every sentence into tokens.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = list(map(tokenizer.tokenize, sentences))

# Sanity check: show the tokens of the first sentence.
print("Tokenize the first sentence:", tokenized_texts[0], sep="\n")

Tokenize the first sentence:
['[CLS]', 'do', 'jews', 'wear', 'those', 'hats', 'to', 'cover', 'their', 'horns', '?', '[SEP]']


In [None]:
# Maximum sequence length in tokens, including the [CLS] and [SEP] markers.
MAX_LEN = 128


def _truncate_keep_last(tokens, max_len):
    """Cut `tokens` to at most `max_len` items while keeping the final token.

    The sentences were built with a trailing "[SEP]" marker, so the last token
    of every sequence is the separator. Plain post-truncation would drop it for
    long inputs; here the tail is cut instead and the last token is re-attached.

    Args:
        tokens: list of token strings for one sentence.
        max_len: maximum number of tokens to keep (>= 1).

    Returns:
        `tokens` unchanged when it already fits, otherwise a new list of
        exactly `max_len` tokens ending with the original last token.
    """
    if len(tokens) <= max_len:
        return tokens
    return tokens[:max_len - 1] + tokens[-1:]


# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(_truncate_keep_last(tokens, MAX_LEN))
             for tokens in tokenized_texts]

# Pad every sequence with trailing zeros up to MAX_LEN. Sequences are already
# at most MAX_LEN long, so truncating="post" is now only a safety net.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print(input_ids[0])

[  101  2079  5181  4929  2216 16717  2000  3104  2037 11569  1029   102
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]


In [None]:
# Attention masks: 1.0 where a real token is present (id > 0), 0.0 on the
# zero padding. Computed in one vectorised step on the padded id matrix and
# converted back to a list of lists of Python floats.
attention_masks = (input_ids > 0).astype(float).tolist()

In [None]:
# Split inputs, labels and attention masks TOGETHER so the three stay aligned by
# construction (train_test_split applies the same shuffle to every array passed
# in one call) instead of relying on repeating the same random_state in separate calls.

# 80% train / 20% held out ...
(train_inputs, temp_inputs,
 train_labels, temp_labels,
 train_masks, temp_masks) = train_test_split(input_ids, labels, attention_masks,
                                             random_state=2018, test_size=0.2)

# ... and the held-out part is halved into test and validation sets.
(test_inputs, validation_inputs,
 test_labels, validation_labels,
 test_masks, validation_masks) = train_test_split(temp_inputs, temp_labels, temp_masks,
                                                  random_state=2018, test_size=0.5)

In [None]:
# Print the class counts of each split: (unique labels, number of occurrences).
print("Dataset distribution")
for split_title, split_labels in (("train\n", train_labels),
                                  ("validation\n", validation_labels),
                                  ("test\n", test_labels)):
  print(split_title, np.unique(split_labels, return_counts=True))

In [None]:
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Train Model

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
model.cuda()

In [None]:
# Select a batch size for training
batch_size = 4

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:
param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these substrings are excluded from weight decay
no_decay = ['bias', 'LayerNorm.weight']
# Optimizer parameter groups. The per-group key must be 'weight_decay': that is
# the option AdamW reads. An unknown key such as 'weight_decay_rate' is ignored,
# which silently leaves weight decay at the optimizer default.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# Parameters:
lr =  5e-6
max_grad_norm = 1.0  # gradients are clipped to this norm in the training loop

# Number of training epochs (the BERT authors recommend between 2 and 4; 6 is used here)
epochs = 6

# One scheduler step is taken per training batch
num_training_steps = epochs * len(train_dataloader)
num_warmup_steps = 2000
if num_warmup_steps >= num_training_steps:
    # With a linear warmup schedule the learning rate only reaches `lr` after
    # num_warmup_steps; if that is not shorter than the whole run, training
    # never gets there. Fall back to warming up over the first 10% of the steps.
    num_warmup_steps = num_training_steps // 10
    print("num_warmup_steps exceeded num_training_steps ({}); using {} warmup steps instead".format(
        num_training_steps, num_warmup_steps))

In [None]:
# AdamW over the two parameter groups, with the base learning rate `lr`.
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=lr,
    correct_bias=False,
)

# Learning-rate schedule: linear warmup for num_warmup_steps, then linear decay
# until num_training_steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
# Per-batch losses collected over all epochs, used for plotting afterwards
train_loss_set = []
val_loss_set = []

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  # ---------------- Training ----------------
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Sum of the batch losses of this epoch
  tr_loss = 0

  # Train the data for one epoch
  for batch in train_dataloader:
    # Move the batch to the selected device and unpack it
    b_input_ids, b_input_mask, b_labels = tuple(tensor.to(device) for tensor in batch)
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass; passing labels makes the model return the loss as well
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs.loss
    train_loss_set.append(loss.item())
    # Backward pass
    loss.backward()
    # Clip the gradient norm before the update (clipping is not done by the optimizer itself)
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    # Update the parameters, then advance the learning-rate schedule by one step
    optimizer.step()
    scheduler.step()

    tr_loss += loss.item()

  print("Train loss: {}".format(tr_loss/len(train_dataloader)))

  # ---------------- Validation ----------------
  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Sum of batch losses, number of correct predictions and number of examples seen
  eval_loss = 0
  eval_correct, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    b_input_ids, b_input_mask, b_labels = tuple(tensor.to(device) for tensor in batch)
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      logits = outputs.logits
      loss = outputs.loss
    val_loss_set.append(loss.item())
    eval_loss += loss.item()

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Accumulate correct predictions weighted by batch size, so a smaller last
    # batch does not get the same weight as a full one (averaging the per-batch
    # accuracies would bias the result).
    eval_correct += flat_accuracy(logits, label_ids) * len(label_ids)
    nb_eval_examples += len(label_ids)

  print("Validation loss: {}".format(eval_loss/len(validation_dataloader)))
  print("Validation Accuracy: {}".format(eval_correct/nb_eval_examples))

In [None]:
# Persist the fine-tuned model weights and config inside the workspace folder.
output_dir = WORKSPACE + "/Models/BERT_5"
model.save_pretrained(output_dir)

# Evaluate Model

In [None]:
# Plot the per-batch losses recorded during training. Both series are drawn
# against their own batch index; the labels and legend tell the curves apart.
plt.figure(figsize=(15,8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(train_loss_set, 'b-', label="train")
plt.plot(val_loss_set, 'g-', label="validation")
plt.legend()
plt.show()

In [None]:
# Turn the held-out test split into tensors ...
prediction_inputs, prediction_masks, prediction_labels = (
    torch.tensor(split) for split in (test_inputs, test_masks, test_labels)
)

# ... and iterate over it in stored order, batch_size examples at a time.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(
    prediction_data,
    batch_size=batch_size,
    sampler=prediction_sampler,
)

In [None]:
# Prediction on test set
# Put model in evaluation mode and make sure it lives on the selected device
# (model.cuda() would fail on a CPU-only runtime).
model.eval()
model.to(device)

# Per-batch logits and true labels, kept for the metrics computed afterwards
predictions , true_labels = [], []

# Number of correct predictions and number of examples seen
test_correct, nb_test_examples = 0, 0

# Predict
for batch in prediction_dataloader:
  # Move the batch to the selected device and unpack it
  b_input_ids, b_input_mask, b_labels = tuple(tensor.to(device) for tensor in batch)
  # Telling the model not to compute or store gradients, saving memory and speeding up prediction
  with torch.no_grad():
    # Forward pass without labels: only the logits are needed
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs.logits

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

  # Weight each batch by its size so a smaller last batch does not skew the
  # overall accuracy (a plain mean of per-batch accuracies would).
  test_correct += flat_accuracy(logits, label_ids) * len(label_ids)
  nb_test_examples += len(label_ids)

test_accuracy = test_correct / nb_test_examples
print("Testing Accuracy: {}".format(test_accuracy))

Testing Accuracy: 0.8269230769230769


Matthews Scores

In [None]:
# Matthews correlation coefficient (MCC): first per test batch, then on the whole test set
from sklearn.metrics import matthews_corrcoef

# One MCC value per batch, comparing the true labels with the arg-max of the logits
matthews_set = [
    matthews_corrcoef(batch_labels, np.argmax(batch_logits, axis=1).flatten())
    for batch_labels, batch_logits in zip(true_labels, predictions)
]

# Stack all batches to get one predicted class and one true label per test example
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()
flat_true_labels = [label for batch_labels in true_labels for label in batch_labels]
print("Matthew Score: ",matthews_corrcoef(flat_true_labels, flat_predictions))

In [None]:
# Per-class precision / recall / F1 on the test set
# (zero_division=0: metrics with an empty denominator are reported as 0).
report = classification_report(
    y_true=flat_true_labels,
    y_pred=flat_predictions,
    zero_division=0,
)
print(report)

In [None]:
# Confusion matrix of the test predictions, rows/columns ordered as label 0 then label 1
cm = confusion_matrix(y_true=flat_true_labels, y_pred=flat_predictions, labels=[0,1])
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["non-offensive","offensive"],
)
cm_display.plot(values_format='')

In [None]:
# Inspect misclassified test examples (false positives / false negatives).
# A separate frame is used so the dataset frame `df` loaded earlier is not overwritten.
results_df = pd.DataFrame()
results_df['predictions'] = flat_predictions
results_df['correct_labels'] = flat_true_labels
print(results_df.groupby(results_df['predictions']).count())
print(results_df.groupby(results_df['correct_labels']).count())
incorrect_predictions = results_df[results_df['predictions'] != results_df['correct_labels']]

# Row positions (within the test split) of the wrong predictions, by predicted class
fp = incorrect_predictions[incorrect_predictions['predictions'] == 1].index.values
fn = incorrect_predictions[incorrect_predictions['predictions'] == 0].index.values

print("#false positives: " + str(len(fp)))
print("#false negatives " + str(len(fn)))

# Show the texts of the false negatives. `fn` holds positions in the TEST split
# (predictions were made in sequential order over test_inputs), so the texts are
# recovered from the test token ids; indexing the full `sentences` list with
# these positions would display unrelated sentences. The decoded text is the
# tokenizer's reconstruction (lower-cased, special tokens and padding removed).
[tokenizer.decode(test_inputs[i].tolist(), skip_special_tokens=True) for i in fn]