# COMP34711 Natural Language Processing – Task 2: Text classification

## Setup

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Optional (Google Colab only): mount Google Drive to read the datasets from it
# from google.colab import drive
# drive.mount('/content/drive')

# Folder that holds the datasets; the saved models and prediction CSVs are written here too.
# Make sure that your code uses the following relative path to a folder with the dataset(s): ./data/
# Colab alternative: folder_path = '/content/drive/My Drive/NLP Coursework'
folder_path = './data'

In [None]:
# Labelled training split; preview the first rows
training_data = pd.read_csv(f"{folder_path}/Training-dataset.csv")
training_data.head()

Unnamed: 0,ID,title,plot_synopsis,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,8f5203de-b2f8-4c0c-b0c1-835ba92422e9,Si wang ta,"After a recent amount of challenges, Billy Lo ...",0,0,0,0,1,1,0,0,1
1,6416fe15-6f8a-41d4-8a78-3e8f120781c7,Shattered Vengeance,"In the crime-ridden city of Tremont, renowned ...",0,0,0,0,1,1,1,0,1
2,4979fe9a-0518-41cc-b85f-f364c91053ca,L'esorciccio,Lankester Merrin is a veteran Catholic priest ...,0,1,0,0,0,0,0,0,0
3,b672850b-a1d9-44ed-9cff-025ee8b61e6f,Serendipity Through Seasons,"""Serendipity Through Seasons"" is a heartwarmin...",0,0,0,0,0,0,1,0,0
4,b4d8e8cc-a53e-48f8-be6a-6432b928a56d,The Liability,"Young and naive 19-year-old slacker, Adam (Jac...",0,0,1,0,0,0,0,0,0


In [None]:
# Labelled validation split; preview the first rows
validation_data = pd.read_csv(f"{folder_path}/Task-2-validation-dataset.csv")
validation_data.head()

Unnamed: 0,ID,title,plot_synopsis,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,cf32cb00-172d-40f2-a3c1-936e8a0d89d7,Shattered Hearts,"In the enchanting city of Verona, Italy, renow...",0,0,0,0,1,0,1,0,0
1,df7e125e-2d59-40e4-a126-9397e3a0ef21,Point Blank,Walker works with his friend Mal Reese to stea...,0,1,1,0,1,1,0,0,1
2,49bc73f3-9179-41cd-9774-905c7a3ac91b,Le charme discret de la bourgeoisie,The film consists of several thematically link...,1,0,1,0,0,0,0,0,0
3,0ed4822b-87af-44bc-a677-7f7abfdaccf3,A Gentleman's Dignity,A Gentleman's Dignity is about the careers and...,0,0,0,0,0,0,1,0,0
4,0b1b0fa4-43bc-41ba-9598-b3401894b96d,Carmen: A Hip Hopera,"Carmen Brown (Beyoncé) is a seductive, aspirin...",0,0,0,0,1,0,0,0,0


In [None]:
# Test split used to produce the final prediction files
testing_data = pd.read_csv(f"{folder_path}/Task-2-test-dataset1.csv")

## Method a) Developing a “traditional” classification method (SVM)

#### Importing Libraries

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

import pickle # To save SVM model

### Data Preparation / Processing

In [None]:
def prepare_data_SVM(df):
  '''Return a new frame where title and synopsis are merged into one 'sentences' column.

  The input frame is left untouched. 'title', 'plot_synopsis' and (when present)
  'synopsis_source' are removed from the result; all other columns are kept.
  '''
  # Title and synopsis are joined so the classifier sees both as a single text
  merged_text = df['title'] + ". " + df['plot_synopsis']
  unused_columns = ['title', 'plot_synopsis', 'synopsis_source']
  # errors='ignore' lets frames without some of these columns pass through
  return df.assign(sentences=merged_text).drop(columns=unused_columns, errors='ignore')

In [None]:
# Build the SVM-ready frames (text merged into 'sentences') for both labelled splits
train_data, val_data = (prepare_data_SVM(split) for split in (training_data, validation_data))

In [None]:
# Target label columns; predictions are stacked in this order
labels = ['comedy','cult','flashback','historical','murder','revenge','romantic','scifi','violence']

# One target vector per label, for the train and validation splits
y_train = [train_data[label].values for label in labels]
y_val = [val_data[label].values for label in labels]

Here, `TfidfVectorizer` is equivalent to `CountVectorizer` followed by `TfidfTransformer`.

`CountVectorizer` converts all characters to lowercase and tokenizes the text. It also has an option to remove stopwords. However, performance decreased when I removed stopwords, so I keep them.

In [None]:
# Raw text of each split
X_train, X_val = train_data['sentences'], val_data['sentences']

# TF-IDF features: case folding and tokenization are done by the vectorizer.
# It is fitted on the training text only and then applied to the validation text.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)

### Hyperparameter Tuning

The paper "Efficiency of SVM classifier with Word2Vec and Doc2Vec models" by Maria Mihaela TRUȘCĂ (2019) concludes that Word2Vec CBOW/Skip-gram embeddings perform better (on all metrics: recall, precision, F1) for SVM classification.

I tried to train the SVM model with Word2Vec Embeddings, but it takes too long to run.

Therefore, I will be using the standard TF-IDF Transformer instead.

So, I tuned using the following parameters:
- kernel: ['linear', 'rbf', 'poly', 'sigmoid']
- C (SVM regularization parameter): [2e-02, 2e-01, 1, 2, 2e+1] followed by [0.2, 0.5, 1, 2] and followed by [1,2,3,4,5]
- loss: ["squared_hinge", "hinge"]

I also tried to compare training using SVC and Linear SVC; both have similar performance (F1 score, precision, recall) given similar C hyperparameter. However, the latter trains almost 10x faster compared to the former.

Hence, results show C = 3 and loss = "squared_hinge" using Linear SVC gives the best result and the fastest training time.

### Training Model

In [None]:
# Define hyperparameters
# C_hp: SVM regularization parameter C (the tuning notes report C = 3 as the best value)
C_hp = 3

In [None]:
# Model Training: one independent binary LinearSVC per label
for i in tqdm(range(len(y_train))):
  # Instantiate model with the tuned C_hp (previously a repeated literal, so
  # editing C_hp had no effect on training)
  svm_model = LinearSVC(C=C_hp, random_state=42, loss="squared_hinge")

  # Train the model on the i-th label's targets
  svm_model.fit(X_train_vectorized, y_train[i])

  # Save the model so the evaluation cells can reload it by label index
  with open(f'{folder_path}/svm_model{i}.pkl','wb') as f:
    pickle.dump(svm_model,f)

100%|██████████| 9/9 [00:15<00:00,  1.75s/it]


### Evaluation

#### On validation data

In [None]:
y_predictions = []

for i in tqdm(range(len(y_train))):
  # Load model
  with open(f'{folder_path}/svm_model{i}.pkl', 'rb') as f:
    svm_model = pickle.load(f)

  # Make predictions on the validation set
  y_pred = svm_model.predict(X_val_vectorized)
  y_predictions.append(y_pred)

# Combine predicted labels
y = np.stack( y_predictions, axis=0 )
y = np.transpose(y)

# Evaluate metrics
precision, recall, f1, _ = precision_recall_fscore_support(y_true=val_data[labels].values, y_pred=y, average='macro')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}\n")

# Save results to file
with open(folder_path + "/10967103-Task2-method-a-validation.csv", "w") as f:
  for i in range(len(val_data)):
    doc_pred = ",".join(map(str, y[i]))
    f.write(f"{val_data['ID'][i]},{doc_pred}\n")

# Run evaluation script
results = !./data/task2_eval_script_student_version.py ./data/10967103-Task2-method-a-validation.csv ./data/Task-2-validation-dataset.csv
print(*results,sep='\n')

100%|██████████| 9/9 [00:00<00:00, 112.30it/s]


Precision: 0.5514504848297701
Recall: 0.3460045614234803
F1: 0.40340621396952386

/bin/bash: line 1: ./data/task2_eval_script_student_version.py: Permission denied


#### On test data

In [None]:
# Processing test data
test_data = prepare_data_SVM(testing_data)

# Raw text of the test documents
X_test = test_data['sentences']

# Reuse the vectorizer already fitted on the training text. The pickled models
# were trained on that feature space, so re-fitting a new vectorizer here is
# redundant and could silently mismatch the models if X_train changed in between.
X_test_vectorized = vectorizer.transform(X_test)

y_predictions = []

for i in tqdm(range(len(y_train))):
  # Load the model trained for the i-th label.
  # NOTE: pickle.load can execute arbitrary code; only load files written by the training cell.
  with open(f'{folder_path}/svm_model{i}.pkl', 'rb') as f:
    svm_model = pickle.load(f)

  # Make predictions on the test set
  y_pred = svm_model.predict(X_test_vectorized)
  y_predictions.append(y_pred)

# Combine predicted labels: one row per document, one column per label
y = np.stack( y_predictions, axis=0 )
y = np.transpose(y)

# Save results to file: one "<ID>,<label flags>" row per test document
with open(folder_path + "/10967103-Task2-method-a.csv", "w") as f:
  for doc_id, doc_labels in zip(test_data['ID'], y):
    doc_pred = ",".join(map(str, doc_labels))
    f.write(f"{doc_id},{doc_pred}\n")

100%|██████████| 9/9 [00:00<00:00, 439.19it/s]


## Method c) Fine-tuning a pre-trained model (either T5, BART, BERT or RoBERTa) for the classification task.

#### Install Libraries

In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     

In [None]:
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer

### Data Preparation / Processing

Little preprocessing is done on the sentences themselves here, as the pre-trained model's tokenizer takes care of it.

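I think extra preprocessing (stemming or lemmatization) is unlikely to improve our predictions, because BERT and RoBERTa split text into subword units (WordPiece for BERT, byte-level BPE for RoBERTa) to keep the vocabulary small. A word like "running" can be split into pieces such as run + ##ing (WordPiece notation), so the stem is still visible to the model. It is therefore better not to convert "running" into "run" beforehand, because that would decrease the amount of information.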

Other minor preparation for the dataset itself is done here.

In [None]:
def prepare_data_BERT(df):
  '''Combines input texts/sentences and labels into its own column

  Returns a copy of `df` (the input is not modified) in which:
    - 'sentences' holds "<title>. <plot_synopsis>"
    - 'labels' holds one list of floats per row, in the order of `labels` below;
      it is only added when the frame carries label columns, so unlabelled
      data is accepted as well
    - 'title', 'plot_synopsis', 'synopsis_source' and the individual label
      columns are removed

  Raises KeyError when only some of the label columns are present.
  '''
  new_df = df.copy()
  labels = ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']

  # Combine 'title' and 'plot_synopsis' column to get more context
  new_df['sentences'] = new_df['title'] + ". " + new_df['plot_synopsis']
  # Combine all labels into 1 array.
  # Every label is checked (not just 'comedy'), so a partially labelled frame
  # fails loudly instead of silently losing its labels.
  # astype(float) replaces the deprecated element-wise applymap(float).
  if any(label in new_df.columns for label in labels):
    new_df['labels'] = new_df[labels].astype(float).values.tolist()
  # Drop useless features/columns
  new_df.drop(columns=['title', 'plot_synopsis', 'synopsis_source' ] + labels, inplace=True, errors='ignore')

  return new_df

In [None]:
# Reduce both labelled splits to the model inputs (sentences + labels) ...
train_df, val_df = prepare_data_BERT(training_data), prepare_data_BERT(validation_data)

# ... and wrap each frame as a Dataset
train_dataset, val_dataset = Dataset.from_pandas(train_df), Dataset.from_pandas(val_df)

In [None]:
# Target labels for training: every column that is not an identifier or a text field
labels = [column for column in training_data.columns if column not in ('ID', 'title', 'plot_synopsis')]
# Lookups between label position and label name, in both directions
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}
labels

['comedy',
 'cult',
 'flashback',
 'historical',
 'murder',
 'revenge',
 'romantic',
 'scifi',
 'violence']

In [None]:
# Our first hyperparameter defined (discussed in the next section):
# name of the pre-trained checkpoint loaded by the tokenizer and the model
MODEL_NAME = "roberta-large"

In [None]:
# Instantiate tokenizer from the same checkpoint name as the model (MODEL_NAME)
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

In [None]:
def tokenize_truncated(examples, tokenizer, max_length=512):
  ''' Truncate text if more than max length, used for validation data

  examples: batch with a 'sentences' entry holding the texts to encode
  tokenizer: callable tokenizer applied to those texts
  max_length: length every sequence is padded or truncated to
              (defaults to 512, the value that used to be hard-coded)
  Returns the tokenizer's output unchanged.
  '''
  return tokenizer(examples['sentences'], padding='max_length', max_length=max_length, truncation=True, return_tensors='pt')

def tokenize_split_when_truncated(examples, tokenizer, max_length=512):
  ''' Split overflowing sentences to keep all text, used for training data

  examples: batch (column name -> list of values) with a 'sentences' entry
  tokenizer: callable tokenizer; it is asked to return overflowing tokens, and
             its output must contain 'overflow_to_sample_mapping'
  max_length: length every chunk is padded or truncated to
              (defaults to 512, the value that used to be hard-coded)
  Returns the tokenizer's output with every original column repeated once per
  produced chunk, so each chunk carries the values (e.g. labels) of its source row.
  '''
  sentences = examples['sentences']
  result = tokenizer(sentences, padding='max_length', max_length=max_length, truncation=True, return_overflowing_tokens=True, return_tensors='pt')

  # For each produced chunk, the position of the input row it came from
  examples_map = result.pop('overflow_to_sample_mapping')
  # Re-align the original columns with the (possibly longer) list of chunks
  for key, values in examples.items():
    result[key] = [values[i] for i in examples_map]
  return result

def preprocess_dataset(examples, training=1):
  '''Encode a batch with the notebook's tokenizer.

  A truthy `training` keeps overflowing text as extra samples; otherwise the
  text is simply truncated.
  '''
  encode = tokenize_split_when_truncated if training else tokenize_truncated
  return encode(examples, tokenizer)

In [None]:
# Tokenize / encode dataset

# Training split: text that overflows the length limit becomes additional
# samples carrying the same labels, so as much training text as possible is used
encoded_train_dataset = train_dataset.map(lambda batch: preprocess_dataset(batch, training=1), batched=True)

# Validation split (the test split is handled the same way later):
# text beyond the length limit is discarded
encoded_val_dataset = val_dataset.map(lambda batch: preprocess_dataset(batch, training=0), batched=True)

Map:   0%|          | 0/8257 [00:00<?, ? examples/s]

Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

In [None]:
# Bundle the encoded splits under named keys so they travel together
dataset = DatasetDict({
  "train": encoded_train_dataset,
  "val": encoded_val_dataset,
})
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'sentences', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 21504
    })
    val: Dataset({
        features: ['ID', 'sentences', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1188
    })
})

### Hyperparameter Tuning

Doing hyperparameter tuning on the following parameters :
 - pre-trained model: BERT and RoBERTa (large)
 - threshold = [0.3, 0.4, 0.5] followed by [0.3, 0.35, 0.4]
 - learning_rate = [1e-05, 2e-05, 5e-05]

The results show that threshold = 0.4 and learning_rate = 1e-05 on the 'roberta-large' pre-trained model give the best F1 (macro) score.

### Model Fine-tuning

In [None]:
# Prefer the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Define Hyperparameters
LEARNING_RATE = 1e-05
NUM_EPOCHS = 3
# NOTE(review): the training arguments pass BATCH_SIZE//3 as the per-device batch size
# together with gradient_accumulation_steps=4 — confirm the intended effective batch size
BATCH_SIZE = 12
# Probability cut-off at or above which a label is predicted.
# NOTE(review): the tuning notes name 0.4 as the best threshold while 0.35 is used here — confirm
THRESHOLD = 0.35
SEED = 42

In [None]:
def multi_label_metrics(predictions, labels, threshold=THRESHOLD):
  '''Score multi-label logits against the true labels.

  predictions: logits of shape (batch_size, num_labels)
  labels: true label indicators, handed to the sklearn scorers unchanged
  threshold: probability at or above which a label counts as predicted
  Returns a dict with macro-averaged 'f1', 'precision' and 'recall', plus 'accuracy'.
  '''
  # Logits -> probabilities
  probs = torch.sigmoid(torch.Tensor(predictions))

  # 1.0 where the probability reaches the threshold, 0.0 everywhere else
  y_pred = (probs >= threshold).numpy().astype(float)

  precision, recall, f1, _ = precision_recall_fscore_support(y_true=labels, y_pred=y_pred, average='macro')
  return {'f1': f1,
      'precision': precision,
      'recall': recall,
      'accuracy': accuracy_score(labels, y_pred)}

def compute_metrics(p):
  '''Metric callback: score `p.predictions` against `p.label_ids`.'''
  logits, gold = p.predictions, p.label_ids
  return multi_label_metrics(predictions=logits, labels=gold)

In [None]:
# Define model
def model_init():
  '''Load the MODEL_NAME checkpoint with a multi-label classification head and move it to `device`.'''
  model = RobertaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
  )
  return model.to(device)

# Define training arguments
# The model is evaluated and saved after every epoch; the checkpoint with the
# highest macro F1 is reloaded when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` — confirm
# against the installed version before upgrading.
training_args = TrainingArguments("test_trainer",
                                  per_device_train_batch_size=BATCH_SIZE//3,
                                  per_device_eval_batch_size=BATCH_SIZE//3,
                                  num_train_epochs=NUM_EPOCHS,
                                  learning_rate=LEARNING_RATE,
                                  weight_decay=0.02,
                                  optim="adamw_torch",
                                  save_strategy='epoch',
                                  evaluation_strategy="epoch",
                                  load_best_model_at_end=True, save_total_limit=1,
                                  metric_for_best_model='f1', greater_is_better=True,
                                  gradient_accumulation_steps=4,
                                  eval_accumulation_steps=4,
                                  fp16=torch.cuda.is_available(),  # mixed precision only on GPU; off automatically on CPU
                                  seed=SEED,
                                  )
# Instantiate trainer
trainer = Trainer(model_init=model_init,
                  args=training_args,
                  train_dataset=dataset['train'],
                  eval_dataset=dataset['val'],
                  compute_metrics=compute_metrics)

# Train or fine tune model
trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.3968,0.34982,0.507175,0.622658,0.510972,0.234007
2,0.3342,0.348042,0.536636,0.576597,0.531102,0.244949
3,0.2871,0.35241,0.554547,0.575568,0.554584,0.253367


TrainOutput(global_step=4032, training_loss=0.3458984884477797, metrics={'train_runtime': 2675.3829, 'train_samples_per_second': 24.113, 'train_steps_per_second': 1.507, 'total_flos': 6.012217805871514e+16, 'train_loss': 0.3458984884477797, 'epoch': 3.0})

### Evaluation


#### On validation set

In [None]:
# Evaluate fine-tuned model on (target) dataset

logits = trainer.predict(dataset['val']).predictions

sigmoid = torch.nn.Sigmoid()
probs = sigmoid(torch.Tensor(logits))

y_pred = np.zeros(probs.shape)
y_pred[np.where(probs >= THRESHOLD)] = 1

with open (folder_path + "/10967103-Task2-method-c-validation.csv", "w") as f:
  for i in range(len(y_pred)):
    pred = ",".join([str(int(e)) for e in y_pred[i]])
    f.write(f"{validation_data['ID'][i]},{pred}\n")

results = !python3 ./data/task2_eval_script_student_version.py ./data/10967103-Task2-method-c-validation.csv ./data/Task-2-validation-dataset.csv
print(results)

['Class level: ', 'Class  1 precision: 0.4706 recall: 0.4114', 'Class  2 precision: 0.5379 recall: 0.5749', 'Class  3 precision: 0.5242 recall: 0.4796', 'Class  4 precision: 0.6000 recall: 0.2500', 'Class  5 precision: 0.6973 recall: 0.8761', 'Class  6 precision: 0.4885 recall: 0.4473', 'Class  7 precision: 0.6531 recall: 0.6621', 'Class  8 precision: 0.5714 recall: 0.5161', 'Class  9 precision: 0.6373 recall: 0.7738', '----------------------------', 'Movie (document) level: ', 'Precision: 0.6210', 'Recall: 0.6769']


#### On testing set

In [None]:
# Build the model inputs for the test split and wrap them as a Dataset
test_df = prepare_data_BERT(testing_data)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize with plain truncation: text beyond the length limit is discarded
encoded_test_dataset = test_dataset.map(lambda batch: preprocess_dataset(batch, training=0), batched=True)

# Run the fine-tuned model on the encoded test split
logits = trainer.predict(encoded_test_dataset).predictions

# Logits -> probabilities -> 0/1 decisions at THRESHOLD
probs = torch.sigmoid(torch.Tensor(logits))
y_pred = (probs >= THRESHOLD).numpy().astype(float)

# Generating results for test dataset: one "<ID>,<label flags>" row per document
with open (folder_path + "/10967103-Task2-method-c.csv", "w") as f:
  for doc_id, row in zip(testing_data['ID'], y_pred):
    flags = ",".join(str(int(flag)) for flag in row)
    f.write(f"{doc_id},{flags}\n")

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]