In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive/', force_remount=True)
# %cd gdrive/MyDrive/BT4222 Project Group/Codes

Mounted at /content/gdrive/
/content/gdrive/MyDrive/BT4222 Project Group/Codes


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
from sklearn.metrics import accuracy_score
import random
import pickle
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [None]:
# Row positions of the training subset: 50000 distinct positions drawn from range(110248).
# A dedicated seeded generator makes the subset identical on every kernel restart
# without altering the global `random` state.
# NOTE(review): 110248 is presumably the row count of X_train.csv — confirm.
rdn_index = random.Random(42).sample(range(110248), 50000)

In [None]:
def _read_column(csv_path, column):
  '''Read one CSV file and return only the requested column.'''
  return pd.read_csv(csv_path)[column]

# Training posts and labels are narrowed to the sampled row positions in
# `rdn_index`; the test posts and labels are used in full.
train_posts = _read_column('./Data/Final Data - Test train/X_train.csv', 'processed_str').iloc[rdn_index]
train_labels = _read_column('./Data/Final Data - Test train/y_train.csv', 'class').iloc[rdn_index]
test_posts = _read_column('./Data/Final Data - Test train/X_test.csv', 'processed_str')
test_labels = _read_column('./Data/Final Data - Test train/y_test.csv', 'class')

In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint, with
# lower-casing enabled. preprocessing() below uses it to:
#   - Add special tokens
#   - Pad and make sentences same length
#   - Create attention mask

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def preprocessing(input_text, tokenizer, max_length = 155):
  '''
  Encode a single text into fixed-length model inputs.

  Args:
    - input_text: the text to encode
    - tokenizer: tokenizer object exposing `encode_plus` (a BertTokenizer in this notebook)
    - max_length: length every encoded sequence is padded or truncated to (default 155)

  Returns:
    - input_ids: tensor of token ids ("pt" tensors are requested from the tokenizer)
    - attention_mask: tensor of 0/1 flags specifying which tokens should be considered by model
  '''
  encoding_dict = tokenizer.encode_plus(
      input_text,
      add_special_tokens = True,
      max_length = max_length,
      # `pad_to_max_length` is deprecated; this is its explicit replacement
      padding = 'max_length',
      # Ask for truncation explicitly instead of relying on the tokenizer's
      # warned-about fallback when only `max_length` is given
      truncation = True,
      return_attention_mask = True,
      return_tensors = "pt"
  )

  return encoding_dict['input_ids'], encoding_dict['attention_mask']

In [None]:
# Encode every training post, then join the per-post results along dim 0 so the
# whole split is held in one ids tensor and one attention-mask tensor.
train_encoded = [preprocessing(post, tokenizer) for post in train_posts.values]
train_idx = torch.cat([ids for ids, _ in train_encoded], dim = 0)
train_a_m = torch.cat([mask for _, mask in train_encoded], dim = 0)
train_lab = torch.tensor(train_labels.values)

# Same treatment for the held-out posts, which serve as the validation set.
val_encoded = [preprocessing(post, tokenizer) for post in test_posts.values]
val_idx = torch.cat([ids for ids, _ in val_encoded], dim = 0)
val_a_m = torch.cat([mask for _, mask in val_encoded], dim = 0)
val_lab = torch.tensor(test_labels.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Number of examples per batch, shared by both loaders
BATCH_SIZE = 200

# Bundle ids, attention masks and labels so each example is indexed as one triple
train_set = TensorDataset(train_idx, train_a_m, train_lab)
val_set = TensorDataset(val_idx, val_a_m, val_lab)

# Training batches are drawn in random order; validation batches follow dataset order
train_dataloader = DataLoader(train_set, batch_size = BATCH_SIZE, sampler = RandomSampler(train_set))
val_dataloader = DataLoader(val_set, batch_size = BATCH_SIZE, sampler = SequentialSampler(val_set))

In [None]:
def b_tp(preds, labels):
  '''Count True Positives (TP): positions where the prediction is 1 and equals the label.'''
  return sum(guess == truth and guess == 1 for guess, truth in zip(preds, labels))

def b_fp(preds, labels):
  '''Returns False Positives (FP): count of predictions of class 1 that do not match the label (actual class 0 for binary labels)'''
  # Distinct loop names: the original comprehension re-used `preds`/`labels`, shadowing the parameters
  return sum(guess != truth and guess == 1 for guess, truth in zip(preds, labels))

def b_tn(preds, labels):
  '''Count True Negatives (TN): positions where the prediction is 0 and equals the label.'''
  return sum(guess == truth and guess == 0 for guess, truth in zip(preds, labels))

def b_fn(preds, labels):
  '''Returns False Negatives (FN): count of predictions of class 0 that do not match the label (actual class 1 for binary labels)'''
  # Distinct loop names: the original comprehension re-used `preds`/`labels`, shadowing the parameters
  return sum(guess != truth and guess == 0 for guess, truth in zip(preds, labels))

def b_metrics(preds, labels):
  '''
  Turn per-class scores and true labels into four classification metrics.

  `preds` holds one row of class scores per example (the predicted class is the
  argmax along axis 1); `labels` is an array of true classes.

  Returns the tuple (accuracy, precision, recall, specificity), where
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
  A ratio whose denominator is 0 is returned as the string 'nan'.
  '''
  def _ratio(numerator, denominator):
    # Undefined ratios are signalled with the string 'nan', which callers check for
    return numerator / denominator if denominator > 0 else 'nan'

  predicted = np.argmax(preds, axis = 1).flatten()
  actual = labels.flatten()

  # Confusion-matrix cells
  tp, tn = b_tp(predicted, actual), b_tn(predicted, actual)
  fp, fn = b_fp(predicted, actual), b_fn(predicted, actual)

  return (
      (tp + tn) / len(actual),
      _ratio(tp, tp + fp),
      _ratio(tp, tp + fn),
      _ratio(tn, tn + fp),
  )

In [None]:
# BERT with a sequence-classification head for 2 labels, loaded from the
# 'bert-base-uncased' checkpoint; attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels = 2, output_attentions = False, output_hidden_states = False
)

# AdamW over all model parameters.
# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), eps = 1e-08, lr = 5e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Every batch below is moved to `device`, so the model's weights must live there
# too; otherwise a CUDA run fails with a device-mismatch error.
# NOTE(review): the optimizer was built before this move; Module.to() is expected
# to keep the same Parameter objects, so its references should stay valid — confirm.
model.to(device)

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

for epoch in range(epochs):

    print(f'Epoch: {epoch + 1}')
    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Running totals for the mean training loss of this epoch
    tr_loss = 0
    nb_tr_steps = 0

    print(' -- Training')
    # Wrap the dataloader itself (not enumerate(...)) so tqdm knows the number
    # of batches and can show a real progress bar
    for batch in tqdm(train_dataloader):
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        # Forward pass (passing labels makes the model return the loss)
        train_output = model(b_input_ids,
                             token_type_ids = None,
                             attention_mask = b_input_mask,
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_steps += 1

    # ========== Validation ==========

    print(' -- Validation')

    # Set model to evaluation mode
    model.eval()

    # Collect the raw outputs of every batch; the metrics are computed once over
    # the pooled predictions. Averaging per-batch precision/recall/specificity
    # (and dropping batches where they are undefined) does not equal the metric
    # on the full validation set.
    val_logits = []
    val_label_ids = []

    for batch in tqdm(val_dataloader):
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids,
                                token_type_ids = None,
                                attention_mask = b_input_mask)
        val_logits.append(eval_output.logits.detach().cpu().numpy())
        val_label_ids.append(b_labels.to('cpu').numpy())

    # Calculate validation metrics; b_metrics returns the string 'nan' for a
    # ratio whose denominator is 0
    val_accuracy, val_precision, val_recall, val_specificity = b_metrics(
        np.concatenate(val_logits, axis = 0),
        np.concatenate(val_label_ids, axis = 0)
    )

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(val_accuracy))
    print('\t - Validation Precision: {:.4f}'.format(val_precision) if not isinstance(val_precision, str) else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(val_recall) if not isinstance(val_recall, str) else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(val_specificity) if not isinstance(val_specificity, str) else '\t - Validation Specificity: NaN')

# save model (pickle is already imported in the notebook's import cell)
file_name = "../Models/bert.pkl"
# The context manager closes the file even if pickling fails part-way
with open(file_name, "wb") as model_file:
    pickle.dump(model, model_file)

Epoch: 1
 -- Training


250it [2:05:24, 30.10s/it]


 -- Validation


100%|██████████| 100/100 [15:47<00:00,  9.48s/it]



	 - Train loss: 0.3197
	 - Validation Accuracy: 0.9070
	 - Validation Precision: 0.8602
	 - Validation Recall: 0.9204
	 - Validation Specificity: 0.9011

Epoch: 2
 -- Training


250it [2:05:04, 30.02s/it]


 -- Validation


100%|██████████| 100/100 [15:55<00:00,  9.56s/it]



	 - Train loss: 0.1686
	 - Validation Accuracy: 0.9175
	 - Validation Precision: 0.8773
	 - Validation Recall: 0.9230
	 - Validation Specificity: 0.9163



In [None]:
# load
# NOTE: unpickling can execute arbitrary code — only load a model file that
# this notebook (or another trusted source) produced.
file_name = "../Models/bert.pkl"
# The context manager closes the file handle once the model has been read
with open(file_name, "rb") as model_file:
  bert_loaded = pickle.load(model_file)

In [None]:
# Encode every test post, then join the per-post results along dim 0 so the
# whole test split is held in one ids tensor and one attention-mask tensor.
test_encoded = [preprocessing(post, tokenizer) for post in test_posts.values]
test_idx = torch.cat([ids for ids, _ in test_encoded], dim = 0)
test_a_m = torch.cat([mask for _, mask in test_encoded], dim = 0)
test_lab = torch.tensor(test_labels.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Test Results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The inputs are sent to `device` below, so the loaded model has to be there as
# well; eval() puts it in evaluation mode explicitly rather than relying on the
# mode it was pickled in.
bert_loaded.to(device)
bert_loaded.eval()

# Score the test set in chunks: one forward pass over every test row at once
# has to hold the activations for the whole set in memory.
eval_batch_size = 200
batch_logits = []

with torch.no_grad():
  for start in range(0, len(test_idx), eval_batch_size):
    stop = start + eval_batch_size
    output = bert_loaded(test_idx[start:stop].to(device),
                         token_type_ids = None,
                         attention_mask = test_a_m[start:stop].to(device))
    batch_logits.append(output.logits.cpu().numpy())

# Predicted class = index of the largest logit in each row
preds = np.argmax(np.concatenate(batch_logits, axis = 0), axis = 1).flatten()

0.9155

In [None]:
# Score the test predictions with each metric; insertion order fixes the column order
metric_fns = {
    'Accuracy': accuracy_score,
    'F1-score': f1_score,
    'Precision': precision_score,
    'Recall': recall_score,
}

# One-row table: the model name followed by one single-element column per metric
data = {'Model': ['BERT']}
data.update({metric_name: [metric_fn(test_lab, preds)] for metric_name, metric_fn in metric_fns.items()})

df = pd.DataFrame(data)

# Append the row to the shared results CSV, writing neither a header nor the index
df.to_csv('../Model_Evaluation.csv', index=False, header=False, mode='a')

In [None]:
# Display the model comparison table stored in Model_Evaluation.csv.
# NOTE(review): the BERT row is appended to '../Model_Evaluation.csv' earlier in
# this notebook, while this cell reads './Model_Evaluation.csv' — confirm both
# paths refer to the same file.
pd.read_csv('./Model_Evaluation.csv')

Unnamed: 0,Model,Accuracy,F1-score,Precision,Recall
0,LSTM,0.90222,0.9021,0.880611,0.868645
1,XGBoost,0.912271,0.912105,0.896376,0.877981
2,LightGBM,0.908388,0.90827,0.888795,0.876225
3,SVM,0.910602,0.910176,0.906007,0.86162
4,Logistic Reg,0.910239,0.909736,0.908859,0.857275
5,Random Forest,0.90193,0.901466,0.894277,0.850712
6,CNN,0.913432,0.913176,0.902367,0.874006
7,BERT,0.9155,0.893911,0.874693,0.913992
