# **q26752aa-Task1**

## Setup Environment
---

In [None]:
from IPython.display import clear_output

# All of these are already pre-installed with google colab but I left this just in case
!pip install numpy
!pip install pandas
!pip install nltk
!pip install scikit-learn
!pip install gensim
!pip install 'transformers[torch]'

clear_output() # Clear output to hide visual clutter

In [None]:
# Main libraries
import numpy as np
import pandas as pd
import nltk
import torch
import time
from sklearn.metrics.pairwise import cosine_similarity

# Paths for input and output csv files
USERNAME = "10895316"  # prefix of the result file names written by evaluate()
INPUT_FOLDER = "data"  # folder the training / validation / test csv files are read from
OUTPUT_FOLDER = "data"  # folder evaluate() writes its result csv files to
TRAINING_PATH = f"{INPUT_FOLDER}/Training-dataset.csv"
VALIDATION_PATH = f"{INPUT_FOLDER}/Task-1-validation-dataset.csv"
TEST_PATH = f"{INPUT_FOLDER}/Task-1-test-dataset1.csv"

In [None]:
def measure_time(start_time):
    """Return the number of seconds elapsed since ``start_time``.

    ``start_time`` is a timestamp previously obtained from ``time.time()``.
    """
    return time.time() - start_time

## Load Data
---
For training, I do not include the movie **title**, because it can obscure the context of the first few words of the synopsis. Additionally, titles tend to carry little information about what a word means, and can even be misleading.
<br>
<br>
For example (movie titles):
* **Orca** - A single word title which provides zero information.



* **Blood and Chocolate** - Semantically, these words are not similar.

In [None]:
# The training file has a header row; keep only the 'plot_synopsis' column (as a one-column dataframe)
training_set = pd.read_csv(TRAINING_PATH, engine='python', encoding='utf-8')[['plot_synopsis']]

# The validation and test files are read with header=None, so their columns are labelled 0, 1, 2
validation_set = pd.read_csv(VALIDATION_PATH, engine='python', encoding='utf-8', header=None)
test_set = pd.read_csv(TEST_PATH, engine='python', encoding='utf-8', header=None)

# Preview the first rows of the training synopses
training_set.head()

Unnamed: 0,plot_synopsis
0,"After a recent amount of challenges, Billy Lo ..."
1,"In the crime-ridden city of Tremont, renowned ..."
2,Lankester Merrin is a veteran Catholic priest ...
3,"""Serendipity Through Seasons"" is a heartwarmin..."
4,"Young and naive 19-year-old slacker, Adam (Jac..."


## Method B - **word2vec** model
---

In [None]:
# Hyperparameters
STEMMER = False  # if True, training tokens and query words are Porter-stemmed
BIGRAM = False  # if True, a gensim Phrases model is applied to the sentences before training
VECTOR_SIZE = 200  # passed to Word2Vec as vector_size
WINDOW_SIZE = 10  # passed to Word2Vec as window
MIN_COUNT = 1  # passed to Word2Vec as min_count
EPOCHS = 10  # passed to Word2Vec as epochs
ALPHA = 0.025  # passed to Word2Vec as alpha
MIN_ALPHA = 0.00005  # passed to Word2Vec as min_alpha
SHRINK_WINDOWS = False  # passed to Word2Vec as shrink_windows

Import tokenizer and stopwords, then download them.

In [None]:
# Import tokenizer and stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the stopword list and the tokenizer data that word_tokenize depends on.
# Without the punkt data, word_tokenize raises a LookupError on a fresh environment.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')  # newer NLTK releases look for 'punkt_tab' instead of 'punkt'

Pre-process and prepare training data as tokens for the *word2vec* model.

In [None]:
start_time = time.time()

# Resources for pre-processing the text
# NOTE(review): stop_words is built here but preprocess_text never filters on it,
# so stopwords remain in the training tokens - confirm this is intended
stop_words = set(stopwords.words('english'))
stemmer = nltk.PorterStemmer()  # only applied when STEMMER is True

def preprocess_text(text):
    """Lower-case and tokenize ``text``, keeping only purely alphabetic tokens.

    Tokens for which ``str.isalpha()`` is false (punctuation, numbers, tokens
    containing non-letters) are dropped. When STEMMER is enabled, the remaining
    tokens are stemmed.

    Returns the list of kept tokens.
    """
    alphabetic_tokens = [tok for tok in word_tokenize(text.lower()) if tok.isalpha()]
    if not STEMMER:
        return alphabetic_tokens
    return [stemmer.stem(tok) for tok in alphabetic_tokens]

# Tokenize every synopsis; the result is a one-column dataframe holding one token list per row
training_tokens = training_set['plot_synopsis'].apply(preprocess_text).to_frame()

print(f"Time elapsed to pre-process text: {measure_time(start_time)} seconds")
training_tokens.head()

Time elapsed to pre-process text: 42.40755796432495 seconds


Unnamed: 0,plot_synopsis
0,"[after, a, recent, amount, of, challenges, bil..."
1,"[in, the, city, of, tremont, renowned, investi..."
2,"[lankester, merrin, is, a, veteran, catholic, ..."
3,"[serendipity, through, seasons, is, a, heartwa..."
4,"[young, and, naive, slacker, adam, jack, lives..."


Train the *word2vec* model on the pre-processed tokenized training data.

In [None]:
# Import word2vec model from Gensim
from gensim.models import Word2Vec, KeyedVectors, Phrases

# Time the whole cell: list conversion, optional bigram step and training
start_time = time.time()

# Convert the token column into a plain list with one token list per synopsis
sentences = training_tokens['plot_synopsis'].tolist()

# Train a bigram detector for multiword phrases
# (only when BIGRAM is True; sentences is then replaced by its transformed version)
if BIGRAM:
    bigram_transformer = Phrases(sentences)
    sentences = bigram_transformer[sentences]

# Train Word2Vec model with the hyperparameters defined in the cell above
# NOTE(review): no seed or workers argument is passed, so repeated runs may not
# produce identical vectors - confirm whether exact reproducibility matters here
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=VECTOR_SIZE,
    window=WINDOW_SIZE,
    min_count=MIN_COUNT,
    epochs=EPOCHS,
    alpha=ALPHA,
    min_alpha=MIN_ALPHA,
    shrink_windows=SHRINK_WINDOWS,
    )

# Store and use only the KeyedVectors instance to reduce memory.
# From this line on, w2v_model refers to the KeyedVectors object rather than the
# full Word2Vec model; later cells use its key_to_index mapping and item lookup.
w2v_model = w2v_model.wv

print(f"Time elapsed to train word2vec model: {measure_time(start_time)} seconds")

Time elapsed to train word2vec model: 45.6935453414917 seconds


Define the function which will calculate the cosine similarity between two input terms using the *word2vec* model and its generated embeddings.

In [None]:
def calculate_cosine_similarity_word2vec(term1, term2):
    """Cosine similarity between two terms using the trained word2vec vectors.

    Each term is lower-cased and split on whitespace; when STEMMER is enabled
    every word is stemmed. Words that are not in the model vocabulary are
    ignored, and a term is represented by the mean of its remaining word vectors.

    Returns 0.5 when either term has no in-vocabulary word, otherwise the
    cosine similarity of the two mean vectors.
    """
    vocabulary = w2v_model.key_to_index

    def mean_vector(term):
        # Mean of the vectors of the term's in-vocabulary words, or None if there are none
        words = term.lower().split()
        if STEMMER:
            words = [stemmer.stem(word) for word in words]
        known_words = [word for word in words if word in vocabulary]
        if not known_words:
            return None
        return np.mean([w2v_model[word] for word in known_words], axis=0)

    vector1 = mean_vector(term1)
    vector2 = mean_vector(term2)

    # A term without any known word cannot be embedded, so return the fixed fallback score
    if vector1 is None or vector2 is None:
        return 0.5

    return cosine_similarity([vector1], [vector2])[0, 0]

## Method C - **BERT** model
---

In [None]:
# Hyperparameters
MODEL_NAME = "bert-large-uncased-whole-word-masking"  # checkpoint name given to AutoTokenizer / AutoModel
TRUNCATE = True  # passed to the tokenizer as truncation
MAX_LENGTH = 200  # passed to the tokenizer as max_length

In [None]:
# Import libraries from PyTorch and Hugging Face
from transformers import AutoTokenizer, AutoModel

# Load pre-trained model and tokenizer.
# Both are loaded from the same MODEL_NAME checkpoint so the tokenizer matches the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


Define the function which will calculate the cosine similarity between two input terms using the pre-trained model and its generated embeddings.

In [None]:
# Function which calculates cosine similarity using the pretrained model for two inputs terms
def calculate_cosine_similarity_bert(term1, term2):
    """Cosine similarity between two terms using mean-pooled BERT embeddings.

    Each term is lower-cased, tokenized and passed through the pre-trained
    model. The term embedding is the mean of the last-hidden-state vectors of
    its real tokens only: positions whose attention mask is 0 are excluded.
    The previous version padded every term to MAX_LENGTH and averaged over all
    positions, so the padding positions were mixed into the embedding.

    Args:
        term1: first term (string).
        term2: second term (string).

    Returns:
        The cosine similarity of the two term embeddings.
    """
    def embed(term):
        # A single sequence needs no padding; skipping it avoids running the
        # model over MAX_LENGTH positions for a term of only a few tokens.
        encoded = tokenizer(term.lower(), return_tensors="pt", truncation=TRUNCATE, max_length=MAX_LENGTH)

        with torch.no_grad():
            # Obtain embeddings for the tokenised term in the last hidden state
            hidden_states = bert_model(**encoded)['last_hidden_state']

        # Attention-mask-weighted mean over the token axis, so that only real
        # tokens contribute even if padded input is ever passed in.
        mask = encoded['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
        token_sum = (hidden_states * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        return (token_sum / token_count).squeeze(0).numpy()

    # Return cosine similarity
    return cosine_similarity([embed(term1)], [embed(term2)])[0, 0]

## Evaluation Functions
---

In [None]:
def evaluate(method, dataset, test=False):
    """Score every term pair in ``dataset`` and write the results to a csv file.

    Args:
        method: "b" to use the word2vec similarity function, "c" to use the
            BERT similarity function.
        dataset: dataframe whose column 0 holds the pair ID and whose columns
            1 and 2 hold the two terms to compare.
        test: if True the file is named as a test-set result, otherwise the
            name carries a "-validation" suffix.

    Returns:
        The path of the csv file that was written (ID, similarity; no header).

    Raises:
        ValueError: if ``method`` is neither "b" nor "c". Previously any other
            value silently ran the slow BERT path and produced a mislabelled file.
    """
    # Validate first so a typo does not trigger a long BERT run
    if method not in ("b", "c"):
        raise ValueError(f"Unknown method {method!r}: expected 'b' (word2vec) or 'c' (BERT)")

    # Evaluation for task 1
    start_time = time.time()

    # Choose the model
    if method == "b":
        similarity_function = calculate_cosine_similarity_word2vec
    else:
        similarity_function = calculate_cosine_similarity_bert

    # Calculate the cosine similarity for every word pair.
    # float() keeps the column as float64, matching what was written before.
    similarities = [
        float(similarity_function(term1, term2))
        for term1, term2 in zip(dataset[1], dataset[2])
    ]

    # Create new dataframe from ID and attach the similarities in one step
    output = pd.DataFrame(dataset[0])
    output['similarity'] = similarities

    # Create path name and .csv file
    if test:
        output_path = f"{OUTPUT_FOLDER}/{USERNAME}-Task1-method-{method}.csv"
    else:
        output_path = f"{OUTPUT_FOLDER}/{USERNAME}-Task1-method-{method}-validation.csv"
    output.to_csv(output_path, header=False, index=False)

    print(f"Time elapsed to evaluate model: {measure_time(start_time)} seconds")
    return output_path

## Evaluation on Validation Set
---

Evaluation and accuracy result for 'Method B - **Word2vec** model' on the validation set.

In [None]:
# Calculate cosine similarities for validation set and run marking script
output_path_b = evaluate(method="b", dataset=validation_set)
!python task1_eval_script_student_version.py {output_path_b} {VALIDATION_PATH}

Time elapsed to evaluate model: 0.09208345413208008 seconds
Accuracy: 0.6555555555555556


Evaluation and accuracy result for 'Method C - Pre-trained **BERT** model' on the validation set.

In [None]:
# Calculate cosine similarities for validation set and run marking script
output_path_c = evaluate(method="c", dataset=validation_set)
!python task1_eval_script_student_version.py {output_path_c} {VALIDATION_PATH}

Time elapsed to evaluate model: 554.2399432659149 seconds
Accuracy: 0.6222222222222222


## Evaluation on Test Set
---

Similarity predictions for 'Method B - **Word2vec** model' on the test set. The marking script is not run here, so no accuracy is reported.

In [None]:
# Calculate cosine similarities for test set
evaluate(method="b", dataset=test_set, test=True)

Time elapsed to evaluate model: 0.06305837631225586 seconds


'data/10895316-Task1-method-b.csv'

Similarity predictions for 'Method C - Pre-trained **BERT** model' on the test set. The marking script is not run here, so no accuracy is reported.

In [None]:
# Calculate cosine similarities for test set
evaluate(method="c", dataset=test_set, test=True)

Time elapsed to evaluate model: 375.5004234313965 seconds


'data/10895316-Task1-method-c.csv'