# **q26752aa-Task2**

## Setup Environment
---

In [1]:
from IPython.display import clear_output

# All of these except 'datasets' are already pre-installed with google colab
!pip install numpy
!pip install pandas
!pip install nltk
!pip install scikit-learn
!pip install datasets
!pip install 'transformers[torch]'

clear_output() # Clear output to hide visual clutter

In [2]:
# Main libraries
import numpy as np
import pandas as pd
import nltk
import torch
import time

# Paths for input and output csv files
USERNAME = "10895316"
INPUT_FOLDER = "data"
OUTPUT_FOLDER = "data"
TRAINING_PATH = f"{INPUT_FOLDER}/Training-dataset.csv"
VALIDATION_PATH = f"{INPUT_FOLDER}/Task-2-validation-dataset.csv"
TEST_PATH = f"{INPUT_FOLDER}/Task-2-test-dataset1.csv"

CLASSES = ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']

In [3]:
# Function which calculates the time measured from a start time
def measure_time(start_time):
    end_time = time.time()
    return end_time - start_time

## Load Data
---
For training, I do include the movie **title** because most of the time they are a prominent clue to the genre of a movie.
<br>
<br>
For example (movie titles):
* **The Violent Kind** - A big giveaway that this movie is likely 'violence' themed.



* **Whodunit in the Wild West** - Whodunit should be a very rare word token, however it clearly represents a movie that may be comedic.

In [4]:
# Load the .csv training dataset as pandas dataframe
training_set = pd.read_csv(TRAINING_PATH, engine='python', encoding='utf-8')

# Load the .csv validation dataset as pandas dataframe
validation_set = pd.read_csv(VALIDATION_PATH, engine='python', encoding='utf-8')

# Load the .csv test dataset as pandas dataframe
test_set = pd.read_csv(TEST_PATH, engine='python', encoding='utf-8')

In [5]:
# Select features (text) and labels (class columns)
X_train = training_set['title'] + ' ' + training_set['plot_synopsis']
y_train = training_set.loc[:, CLASSES]  # Assuming the class columns start from index 3, adjust accordingly

X_val = validation_set['title'] + ' ' + validation_set['plot_synopsis']
y_val = validation_set.loc[:, CLASSES]

X_test = test_set['title'] + ' ' + test_set['plot_synopsis']

## Method A - **SVM** Model
---

In [6]:
# Hyperparameters
MAX_FEATURES = 1000
KERNEL = 'linear'
CLASS_WEIGHT = None
ANALYZER = 'word'
NGRAM_RANGE = (1,1)

In [7]:
# Function which fits and transforms input data into a matrix
def prepare_data_svm(X, vectorizer):
    return vectorizer.fit_transform(X)

Pre-process and prepare data for the *SVM* model.

In [8]:
# Import scikit-learn tf*idf vectorizer and stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

start_time = time.time()

# Download stopwords
nltk.download('stopwords')

# Define vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=MAX_FEATURES,
    stop_words=stopwords.words('english'),
    analyzer=ANALYZER,
    ngram_range=NGRAM_RANGE
)
# (punctuation is completely ignored and always treated as a token separator)

# Fit and transform all data into matrices
tfidf_train = prepare_data_svm(X_train, tfidf_vectorizer)
tfidf_valid = prepare_data_svm(X_val, tfidf_vectorizer)
tfidf_test = prepare_data_svm(X_test, tfidf_vectorizer)

print(f"Time elapsed to pre-process data: {measure_time(start_time)} seconds")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AAlsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Time elapsed to pre-process data: 6.6480560302734375 seconds


Train the *SVM* model on the pre-processed vectorised training data.

In [9]:
# Import scikit-learn SVM multiclass libraries
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

start_time = time.time()

# Train a SVM model with OneVsRest strategy
svm_classifier = OneVsRestClassifier(
    SVC(
        kernel=KERNEL,
        random_state=42,
        class_weight=CLASS_WEIGHT,
        verbose=True,
        probability=True
        ),
    n_jobs=1
    )

svm_classifier.fit(tfidf_train, y_train)

print(f"\nTime elapsed to train SVM model: {measure_time(start_time)} seconds")

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]
Time elapsed to train SVM model: 1538.1262187957764 seconds


## Method C - **RoBERTa** Model
---

In [None]:
# Hyperparameters
MAX_LENGTH = 512
BATCH_SIZE = 16
NUM_EPOCHS = 3
MODEL_NAME = "roberta-base"
LEARNING_RATE = 2e-5

Define a function that converts a *Pandas DataFrame* into a *Dataset* object containing the data encodings, formatted in a way accepted by the *Transformers Trainer* class.

In [None]:
# Import huggingface dataset structure
from datasets import Dataset

# Function that converts dataframe into a dataset and then encodes the data
def prepare_data_bert(dataframe, tokenizer, test=False):
    def tokenize_and_encode(examples):
        # return tokenizer(examples["plot_synopsis"], truncation=True)
        return tokenizer(examples["plot_synopsis"], truncation=True, padding='max_length', max_length=MAX_LENGTH)


    # Create panda dataframes from dataset objects
    dataset = Dataset.from_pandas(dataframe.iloc[:, 2:])

    # If test data then do not add class labels
    if test:
        encodings = dataset.map(tokenize_and_encode, batched=True, remove_columns=["plot_synopsis"])
        encodings.set_format("torch")
        return encodings

    # Create new labels column by reading genre class columns
    cols = [c for c in dataset.column_names if c != "plot_synopsis"]
    cols = dataset.column_names
    dataset = dataset.map(lambda x : {"labels": [x[c] for c in cols if c != "plot_synopsis"]})

    # Tokenize input data
    encodings = dataset.map(tokenize_and_encode, batched=True, remove_columns=cols)

    # Convert labels to torch floats
    encodings.set_format("torch")
    encodings = (encodings.map(lambda x : {"float_labels": x["labels"].to(torch.float)},
                remove_columns=["labels"]).rename_column("float_labels", "labels"))

    return encodings


Load model and tokenizer, then encode data.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load model and tokenizer using AutoTokenizer and AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, problem_type="multi_label_classification", num_labels=len(CLASSES))
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, problem_type="multi_label_classification")

# Tokenize and encode input data
train_encodings = prepare_data_bert(training_set, tokenizer)
val_encodings = prepare_data_bert(validation_set, tokenizer)
test_encodings = prepare_data_bert(test_set, tokenizer, test=True)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/8257 [00:00<?, ? examples/s]

Map:   0%|          | 0/8257 [00:00<?, ? examples/s]

Map:   0%|          | 0/8257 [00:00<?, ? examples/s]

Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Define the training arguments and trainer which will fine-tune the pre-trained model,

In [None]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    output_dir="./fine-tuned-model",
    num_train_epochs=NUM_EPOCHS,
    evaluation_strategy="epoch",
    learning_rate=LEARNING_RATE,
    )

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
    eval_dataset=val_encodings,
    tokenizer=tokenizer,
)

Fine-tune the model on the training data.

In [None]:
start_time = time.time()

# Fine-tune the model
trainer.train()

print(f"Time elapsed to fine-tune {MODEL_NAME} model: {measure_time(start_time)} seconds")

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.4187,0.382817
2,0.3714,0.366035
3,0.3462,0.361032


Time elapsed to fine-tune roberta-base model: 2473.7947068214417 seconds


## Evaluation Functions
---

Helper functions to streamline prediction and evaluation process:
*   **apply_thresholds** - Apply thresholding on a probability prediction array to get a binary prediction array.
*   **assign_highest_probability** - Documents which were not classified to any labels, because of a high threshold, will be forced to set the highest probability label as their class. As a result, every document will have at least one label assigned.
*   **get_predictions** - Get the binary predictions and/or probability predictions from the model, and apply two above helper functions to prepare them.
*   **evaluate** - Parent function which utilises all above functions to select the model, get the predictions and output it to a .csv file.



In [11]:
from scipy.special import expit

# Apply thresholding to a probability prediction array
def apply_thresholds(probability_array, thresholds):
    binary_predictions = (probability_array > np.array(thresholds)).astype(int)
    return binary_predictions

# From coursework specification: "Each document has at least one label assigned to it"
# This function will look for documents with no classes assigned and select the highest probability class
def assign_highest_probability(binary_array, probability_array):
    # Find rows with no 1s in the binary array
    unclassed_rows = np.where(binary_array.sum(axis=1) == 0)[0]

    # Iterate through unclassed rows and assign a single 1 to the column with the highest probability
    for row in unclassed_rows:
        max_prob_column = np.argmax(probability_array[row, :])
        binary_array[row, max_prob_column] = 1

    return binary_array

# Function which gets binary predictions from model on a given dataset
def get_predictions(method, model, data):
    # Predict from correct model, and apply helper functions to predicted outputs.
    if method == "a":
        # Get binary and probability predictions from SVM model
        predictions_bin = model.predict(data)
        predictions_prob = model.predict_proba(data)

    else:
        # Convert predicted logits to predicted probabilities
        logits = model.predict(data).predictions
        predictions_prob = expit(logits)

        # Apply thresholding on predicted probabilities to get binary predictions
        thresholds = [0.4 for _ in range(len(CLASSES))]
        predictions_bin = apply_thresholds(predictions_prob, thresholds)

    # Ensure every document has at least one label assigned
    predictions_bin = assign_highest_probability(predictions_bin, predictions_prob)
    return predictions_bin

# Function which runs predictions using the trained model, and outputs the results to a .csv file
def evaluate(method, dataset, encoding, test=False):
    start_time = time.time()

    # Get binary label prediction from correct model
    if method == "a":
        predictions = get_predictions(method=method, model=svm_classifier, data=encoding)
    else:
        predictions = get_predictions(method=method, model=trainer, data=encoding)

    # Create new dataframe from predictions
    output = pd.DataFrame(predictions, columns=CLASSES)

    # Copy and insert the ID column before the output
    id_column = dataset.iloc[:, 0].copy()
    output = pd.concat([id_column, output], axis=1)

    # Create path name and .csv file
    if test:
        output_path = f"{OUTPUT_FOLDER}/{USERNAME}-Task2-method-{method}.csv"
    else:
        output_path = f"{OUTPUT_FOLDER}/{USERNAME}-Task2-method-{method}-validation.csv"
    output.to_csv(output_path, columns=['ID'] + CLASSES, header=False, index=False)

    print(f"Time elapsed to evaluate model: {measure_time(start_time)} seconds")
    return output_path

## Evaluation on Validation Set
---

Evaluation and precision/recall result for 'Method A - **SVM** model' on the validation set.

In [13]:
# Run evaluation script
output_path_a = evaluate(method="a", dataset=validation_set, encoding=tfidf_valid)

Time elapsed to evaluate model: 72.66827702522278 seconds
Class level: 
Class  1 precision: 0.0000 recall: 0.0000
Class  2 precision: 0.2667 recall: 0.0486
Class  3 precision: 0.2941 recall: 0.0680
Class  4 precision: 0.0000 recall: 0.0000
Class  5 precision: 0.5512 recall: 0.6024
Class  6 precision: 0.4091 recall: 0.0380
Class  7 precision: 0.5112 recall: 0.3138
Class  8 precision: 0.0000 recall: 0.0000
Class  9 precision: 0.4464 recall: 0.3571
----------------------------
Movie (document) level: 
Precision: 0.4815
Recall: 0.3012


Evaluation and precision/recall result for 'Method C - Fine-tuned **RoBERTa** model' on the validation set.

In [None]:
# Run evaluation script
output_path_c = evaluate(method="c", dataset=validation_set, encoding=val_encodings)

Time elapsed to evaluate model: 40.10171604156494 seconds
Class level: 
Class  1 precision: 0.5610 recall: 0.2629
Class  2 precision: 0.5477 recall: 0.4413
Class  3 precision: 0.5738 recall: 0.4626
Class  4 precision: 0.5000 recall: 0.0417
Class  5 precision: 0.6997 recall: 0.8021
Class  6 precision: 0.4481 recall: 0.2911
Class  7 precision: 0.6493 recall: 0.6448
Class  8 precision: 1.0000 recall: 0.0968
Class  9 precision: 0.6084 recall: 0.7619
----------------------------
Movie (document) level: 
Precision: 0.6295
Recall: 0.6138


## Evaluation on Test Set
---

Evaluation and precision/recall result for 'Method A - **SVM** model' on the test set.

In [14]:
# Run evaluation script
evaluate(method="a", dataset=test_set, encoding=tfidf_test, test=True)

Time elapsed to evaluate model: 74.67058372497559 seconds


'data/10895316-Task2-method-a.csv'

Evaluation and precision/recall result for 'Method C - **Fine-tuned Bert** model' on the test set.

In [None]:
# Run evaluation script
evaluate(method="c", dataset=test_set, encoding=test_encodings, test=True)

Time elapsed to evaluate model: 39.461236238479614 seconds


'data/10895316-Task2-method-c.csv'