# COMP34711 Natural Language Processing – Task 1: Distributional semantics

## Setup

In [8]:
import numpy as np
import pandas as pd
import subprocess

from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# from google.colab import drive
# drive.mount('/content/drive')

# Specify the folder path
# Make sure that your code uses the following relative path to a folder with the dataset(s): ./data/
# folder_path = '/content/drive/My Drive/NLP Coursework'
folder_path = "./data"

In [10]:
# Load the training corpus and keep only the identifier, title and synopsis columns
training_data = pd.read_csv(f"{folder_path}/Training-dataset.csv")[['ID', 'title', 'plot_synopsis']]
training_data

Unnamed: 0,ID,title,plot_synopsis
0,8f5203de-b2f8-4c0c-b0c1-835ba92422e9,Si wang ta,"After a recent amount of challenges, Billy Lo ..."
1,6416fe15-6f8a-41d4-8a78-3e8f120781c7,Shattered Vengeance,"In the crime-ridden city of Tremont, renowned ..."
2,4979fe9a-0518-41cc-b85f-f364c91053ca,L'esorciccio,Lankester Merrin is a veteran Catholic priest ...
3,b672850b-a1d9-44ed-9cff-025ee8b61e6f,Serendipity Through Seasons,"""Serendipity Through Seasons"" is a heartwarmin..."
4,b4d8e8cc-a53e-48f8-be6a-6432b928a56d,The Liability,"Young and naive 19-year-old slacker, Adam (Jac..."
...,...,...,...
8252,bc66257b-19bf-4ba2-b287-77b54ee0a205,I Went Down,After serving an eight month sentence for brea...
8253,9f8abad3-5a67-4fb1-a6d2-9fdff2d6c6f8,Scooby-Doo! Stage Fright,The Mystery Inc. crew head to Chicago for a ta...
8254,23837cc8-1503-4265-b793-bf60306d1a5c,Another Life,"Through its run, Another Life revolved around ..."
8255,30a2f2c2-64c8-4d29-a7c4-ae1c14eb51b0,The Ward,At the North Bend Psychiatric Hospital in 1966...


In [11]:
# The file has no header row, so its columns are addressed by position later on
validation_data = pd.read_csv(f"{folder_path}/Task-1-validation-dataset.csv", header=None)

In [12]:
# The file has no header row, so its columns are addressed by position later on
testing_data = pd.read_csv(f"{folder_path}/Task-1-test-dataset1.csv", header=None)

In [13]:
# Helper function to get accuracy based on each method
def eval_script_get_accuracy(filename):
  '''Run the provided evaluation script on a results file and return the accuracy it prints.

  Args:
    filename: Name of the results file; it is appended to `folder_path` with a "/".

  Returns:
    The last whitespace-separated token of the script's standard output, as a string.

  Raises:
    RuntimeError: If the script exits with a non-zero status or prints nothing.
  '''
  # Pass the arguments as a list (no shell) so that paths containing spaces or
  # shell metacharacters reach the script intact.
  command = [
    "python3",
    f"{folder_path}/task1_eval_script_student_version.py",
    f"{folder_path}/{filename}",
    f"{folder_path}/Task-1-validation-dataset.csv",
  ]
  results = subprocess.run(command, capture_output=True, text=True)

  tokens = results.stdout.split()
  if results.returncode != 0 or not tokens:
    # Surface the script's own error text instead of failing later on an empty output
    raise RuntimeError(
      f"Evaluation script failed (exit code {results.returncode}): {results.stderr.strip()}"
    )
  return tokens[-1]

## Method b) a dense static representation (word2vec)

In [14]:
import gensim
import gensim.downloader
from gensim.models import Word2Vec

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from gensim.models.callbacks import CallbackAny2Vec

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Preprocessing training data

I experimented with the following preprocessing steps on the training data:
- Case folding
- Tokenization
- Removing stop words
- Lemmatization

However, after experimenting with the Word2Vec model using various combinations of preprocessing steps, I found that lemmatizing the text results in lower accuracy, which is why I do not include this step in my preprocessing.

I decided not to do any stemming because it truncates words so aggressively that the original word is not retained, which conflicts with the purpose of this task: obtaining a representation of the original word.

In [15]:
stop_words = set(stopwords.words('english'))

def preprocess_text(document):
  '''Turn a raw document into a list of lowercase, stop-word-free tokens.

  Steps: case folding, replacing every character that is neither alphanumeric
  nor an apostrophe with a space, NLTK word tokenization, and stop-word removal.
  Lemmatization was tried and left out because it lowered accuracy.
  '''
  # Case-fold, then blank out punctuation (apostrophes are kept)
  cleaned_chars = []
  for ch in document.lower():
    cleaned_chars.append(ch if (ch == "'" or ch.isalnum()) else ' ')

  tokens = nltk.word_tokenize(''.join(cleaned_chars))

  # Drop stop words
  return [token for token in tokens if token.lower() not in stop_words]

In [16]:
# One token list per plot synopsis, in the same order as the training data
processed_sentences = [preprocess_text(synopsis) for synopsis in training_data['plot_synopsis']]

### Hyperparameter tuning / selection



The paper *Improving word embedding quality with innovative automated approaches to hyperparameters* by Daniel W. A. Buchan and David T. Jones (2019) states:

 "We observed that the vector size ranging from 50 to 300 affects the classification accuracy by 2% and we found the best value as 250."

 and,

 "..**smaller window size** tends to learn more about functional and synonymic features that can lead to better performance in **similarity** measurements.."

Hence, I performed hyperparameter selection using grid search over the following parameters to find the best combination:
- vector_size = [50, 100, 150, 200, 250, 300]
- min_count = [1,2,3,8]
- window = [2,3,4,8]
- hierarchical softmax or negative sampling
- subsampling = [1e-03, 1e-05]
- Preprocessed or raw (unpreprocessed) data
- CBOW or skip-gram

The results of my hyperparameter tuning show that the best hyperparameters are:
- vector_size = 300
- min_count = 2
- window = 3 (roughly the same performance as window = 4)
- hierarchical softmax
- subsampling = 1e-03
- skip-gram

Also, after numerous trials, I found that the best number of epochs to train this model is around 3–4; I chose 3 for the sake of training speed.

### Model Training

In [17]:
# Define hyperparameters
NUM_EPOCHS = 3  # passed as epochs= to model.train
VECTOR_SIZE = 300  # passed as vector_size=; also selects the glove-wiki-gigaword-<size> download
MIN_COUNT = 2  # passed as min_count=
WINDOW = 3  # passed as window=
HS = 1  # passed as hs= (hierarchical softmax, per the tuning notes above)
SKIPGRAM = 1  # passed as sg= (skip-gram rather than CBOW, per the tuning notes above)
NEGATIVE = 0  # passed as negative= (negative sampling not used)
SUBSAMPLING = 1e-03  # passed as sample=

In [18]:
def compute_similarity(model, word1, word2):
  '''Cosine similarity between two words/phrases under the given model's embeddings.

  A phrase is embedded as the sum of the vectors of its whitespace-separated
  words. Words are case-folded before lookup because the training text is
  lowercased during preprocessing; out-of-vocabulary words contribute a zero vector.

  Args:
    model: Object exposing `wv` with `key_to_index`, `vector_size` and item lookup.
    word1: First word or multi-word phrase.
    word2: Second word or multi-word phrase.

  Returns:
    The cosine similarity as a Python float, or 0.0 when either phrase has no
    usable embedding (empty phrase, or every word out of vocabulary).
  '''
  vectors = model.wv

  def embed_phrase(phrase):
    # Start from a zero vector so empty or fully-OOV phrases keep the right shape
    total = np.zeros(vectors.vector_size, dtype=np.float64)
    for token in phrase.lower().split():
      if token in vectors.key_to_index:
        total += vectors[token]
    return total

  embedding1 = embed_phrase(word1)
  embedding2 = embed_phrase(word2)

  norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
  if norm_product == 0.0:
    # Cosine is undefined for a zero vector; report "no similarity"
    return 0.0
  return float(np.dot(embedding1, embedding2) / norm_product)

def save_similarities_results_w2v(model, target_data, filename):
  '''Score every pair in `target_data` and write "<id>, <similarity>" lines to a file.

  Column 0 of `target_data` is read as the pair id and columns 1 and 2 as the
  two terms to compare. The output file is `filename` appended to `folder_path`.
  '''
  pair_ids = target_data.iloc[:, 0]
  first_terms = target_data.iloc[:, 1]
  second_terms = target_data.iloc[:, 2]

  # Compute every similarity before the output file is opened
  output_lines = [
    f"{pair_id}, {compute_similarity(model, term_a, term_b)}\n"
    for pair_id, term_a, term_b in zip(pair_ids, first_terms, second_terms)
  ]

  with open(f'{folder_path}/{filename}', 'w') as out_file:
    out_file.writelines(output_lines)

In [19]:
class CallbackWithHistory(CallbackAny2Vec):
  '''Callback that scores the model on the validation set after each epoch, printing and recording the accuracy'''

  def __init__(self):
    # history holds parallel lists: epoch index and the accuracy measured after it
    self.history = {'epoch': [], 'accuracy': []}
    self.epoch = 0

  def on_epoch_end(self, model):
    results_file = "results-task1B.txt"
    save_similarities_results_w2v(model, validation_data, results_file)
    epoch_accuracy = float(eval_script_get_accuracy(results_file))
    print(f'Accuracy after epoch {self.epoch}: {epoch_accuracy}')

    for key, value in (('epoch', self.epoch), ('accuracy', epoch_accuracy)):
      self.history[key].append(value)
    self.epoch += 1

In [33]:
## Instantiate and train model
## Trains Word2Vec on the preprocessed (non-lemmatized) synopses, with vectors initialised from GloVe

# Instantiate model; the callback evaluates on the validation set after every epoch
callback_instance = CallbackWithHistory()
model = Word2Vec(vector_size=VECTOR_SIZE, window=WINDOW, min_count=MIN_COUNT, sg=SKIPGRAM, hs=HS, negative=NEGATIVE, sample=SUBSAMPLING, seed=89)
model.build_vocab(processed_sentences)
# Capture corpus_count now, before the second build_vocab call below runs over the GloVe word list.
# NOTE(review): model_new_vocab is not referenced again in the visible cells - confirm it is still needed.
model_new_vocab, total_examples = list(model.wv.key_to_index.keys()), model.corpus_count

# Extend the vocabulary with GloVe's word list so words unseen in the synopses can still be looked up
# NOTE(review): every GloVe word occurs exactly once in this single pseudo-sentence; with MIN_COUNT = 2
# the update may discard all of them, leaving the vocabulary unchanged - TODO confirm by comparing
# len(model.wv) before and after this call.
glove_path = f"glove-wiki-gigaword-{VECTOR_SIZE}"
pretrained_model = gensim.downloader.load(glove_path)
model.build_vocab([list(pretrained_model.key_to_index.keys())], update=True)

# Initialise Word2Vec vectors from the GloVe file for words present in both vocabularies
# (return_path=True yields the downloaded file's path instead of loading it again)
# NOTE(review): lockf=1.0 presumably leaves the imported vectors trainable - confirm against the gensim docs.
pretrained_embeddings = gensim.downloader.load(glove_path, return_path=True)
model.wv.vectors_lockf = np.ones(len(model.wv))
model.wv.intersect_word2vec_format(pretrained_embeddings, binary=False, lockf=1.0)

# Training on the synopses only; total_examples is the corpus size captured above
model.train(processed_sentences, total_examples=total_examples, callbacks=[callback_instance], epochs=NUM_EPOCHS)

Accuracy after epoch 0: 0.625925925925926
Accuracy after epoch 1: 0.625925925925926
Accuracy after epoch 2: 0.6407407407407407


(11562660, 11880564)

In [21]:
# Persist the trained Word2Vec model next to the datasets
model.save(f"{folder_path}/task1_w2v_best.model")

### Evaluation

In [22]:
# Restore the Word2Vec model saved in the previous step
model = Word2Vec.load(f"{folder_path}/task1_w2v_best.model")

On Validation set

In [34]:
# Evaluating on validation set
# (no leading slash: both helpers already append the name to folder_path with a "/")
val_filename = '10967103-Task1-method-b-validation.csv'
save_similarities_results_w2v(model, validation_data, val_filename)
eval_script_get_accuracy(val_filename)

'0.6407407407407407'

On testing set

In [35]:
# Generating results on testing set
# (no leading slash: the helper already appends the name to folder_path with a "/")
test_filename = "10967103-Task1-method-b.csv"
save_similarities_results_w2v(model, testing_data, test_filename)

## Method c) a pre-trained contextual representation (either BERT or RoBERTa)

### Install libraries

In [25]:
!pip install transformers

import torch
from transformers import AutoTokenizer, AutoModel



### Trying out different pre-trained models from Hugging Face

I have tried out various pre-trained models from Hugging Face, such as:
- 'bert-base-uncased'
- 'bert-base-cased'
- 'bert-large-uncased'
- 'roberta-base'
- 'roberta-large'
- 'roberta-large-mnli'
- 'bert-large-uncased-whole-word-masking'
- 'bert-large-cased-whole-word-masking'
- 'princeton-nlp/sup-simcse-roberta-large'
- 'thatdramebaazguy/movie-roberta-base'
- 'planeB/roberta_movie_w_title'
- 'thatdramebaazguy/movie-roberta-MITmovie'

The results show that *princeton-nlp/sup-simcse-roberta-large* model gives the best result / accuracy.

You can refer to this model in more detail here: https://github.com/princeton-nlp/SimCSE

Or in this paper: https://arxiv.org/abs/2104.08821

### Loading Pretrained Model Contextual Representation

In [26]:
def save_similarities_pretrained_model(tokenizer, model, filename, target_data=validation_data):
  '''Score every pair in `target_data` with a pre-trained encoder and write the results to a file.

  Each term is tokenized on its own and embedded with the model's `pooler_output`;
  the pair's score is the cosine similarity of the two embeddings.

  Args:
    tokenizer: Callable tokenizer returning PyTorch tensors for the model.
    model: Encoder whose output exposes `pooler_output`.
    filename: Name of the output file; it is appended to `folder_path` with a "/".
    target_data: Table with the pair id in column 0 and the two terms in
      columns 1 and 2. Defaults to the `validation_data` object that existed
      when this function was defined.
  '''
  def embed(text):
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # Only the pooled output is used, so per-layer hidden states are not requested
    return model(**encoded, return_dict=True).pooler_output

  computed_similarities = []

  # Inference only: no gradients are needed
  with torch.no_grad():
    for row in range(len(target_data)):
      pair_id = target_data.iloc[row, 0]
      word1 = target_data.iloc[row, 1]
      word2 = target_data.iloc[row, 2]

      similarity = cosine_similarity(embed(word1), embed(word2)).item()
      computed_similarities.append((pair_id, similarity))

  with open(f'{folder_path}/{filename}', 'w') as fp:
    for pair_id, similarity in computed_similarities:
      fp.write(f"{pair_id}, {similarity}\n")

In [27]:
# The tokenizer and the encoder are loaded from the same Hugging Face checkpoint
checkpoint = 'princeton-nlp/sup-simcse-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

### Evaluation


In [28]:
# Score the validation pairs with the pre-trained encoder, then run the evaluation script on the output
save_similarities_pretrained_model(
  tokenizer,
  model,
  filename="10967103-Task1-method-c-validation.csv",
  target_data=validation_data,
)
eval_script_get_accuracy("10967103-Task1-method-c-validation.csv")

'0.7333333333333334'

In [29]:
# Write the similarity scores for the test pairs
save_similarities_pretrained_model(
  tokenizer,
  model,
  filename="10967103-Task1-method-c.csv",
  target_data=testing_data,
)