In [15]:
import pandas as pd
import numpy as np
import unicodedata
import re
import string
import time

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix

from gensim.models import Word2Vec, KeyedVectors

# Fetch the NLTK resources this notebook relies on. The 'stopwords', 'words'
# and 'names' corpora are read directly during preprocessing.
# NOTE(review): 'punkt' and 'wordnet' are presumably the data behind
# word_tokenize and WordNetLemmatizer respectively — confirm against NLTK docs.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
nltk.download('names')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [16]:
# Relative locations of the CSV inputs: the training corpus, then the
# validation pairs and the test pairs that both models are evaluated on.
training_dataset_path = "./data/Training-dataset.csv"
validation_dataset_path = "./data/Task-1-validation-dataset.csv"
test_dataset_path = "./data/Task-1-test-dataset1.csv"

Text pre-processing for both models:

In [17]:
# Initialise the lemmatizer and the word sets needed for preprocessing.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Known English words with the stop words taken out.
english_vocab = set(words.words()) - stop_words
# Lower-cased first names, stored as a set: `token in names` is evaluated for
# every token of every synopsis (and for every query word), so a list here
# would turn each of those checks into a linear scan over the whole corpus.
names = {name.lower() for name in nltk.corpus.names.words()}

def remove_accents(data):
  """ Strips accents from text.

      The text is decomposed with NFKD normalisation, then only ASCII letters
      and spaces are kept, so combining marks (and every other character)
      are dropped.
  """
  decomposed = unicodedata.normalize('NFKD', data)

  kept_chars = []
  for char in decomposed:
    if char == " " or char in string.ascii_letters:
      kept_chars.append(char)

  return ''.join(kept_chars)

def process_text(text):
  """ Turns a raw text into a list of normalised tokens.

      Steps: strip punctuation, tokenize, keep alphabetic tokens only,
      lower-case and de-accent them, then map each token to one of:
        - '<NAME>' if it is a known first name,
        - its lemma if it is a known (non stop word) English word,
        - '<UKN>' if it is neither of those and not a stop word.
      Stop words that reach the last step are dropped.
  """
  # Dashes become spaces (so hyphenated words split in two); every other
  # punctuation character is deleted.
  punctuation_without_dash = string.punctuation.replace('-', '')
  pattern = re.compile('[%s]' % re.escape(punctuation_without_dash))
  cleaned_text = pattern.sub('', text.replace('-', ' '))

  processed = []
  for raw_token in word_tokenize(cleaned_text):
    # Anything that is not purely alphabetic is discarded.
    if not raw_token.isalpha():
      continue

    token = remove_accents(raw_token.strip().lower())

    if token in names:
      processed.append('<NAME>')
    elif token in english_vocab:
      processed.append(lemmatizer.lemmatize(token))
    elif token not in stop_words:
      processed.append('<UKN>')

  return processed

In [18]:
# Read only the 'title' and 'plot_synopsis' columns of the training CSV
df = pd.read_csv(training_dataset_path, usecols=['title', 'plot_synopsis'])

# Swap every raw synopsis string for the token list produced by process_text
df = df.assign(plot_synopsis=df['plot_synopsis'].apply(process_text))

**Model 1: a) A sparse representation with tf.idf**

In [19]:
def _tfidf_columns_for_word(word, vectorizer, analyzer):
  """ Returns the tf.idf column indices that represent one query word.

      The word gets the same treatment as the training text (lower-cased,
      lemmatized, names masked as '<NAME>'). If it has no column, the
      '<UKN>' placeholder is tried instead. An empty list is returned when
      neither can be found, so the caller never indexes with a sentinel.
  """
  vocabulary = vectorizer.vocabulary_

  word = lemmatizer.lemmatize(word.lower())
  if word in names:
    word = '<NAME>'

  for candidate in (word, '<UKN>'):
    if candidate in vocabulary:
      return [vocabulary[candidate]]

    # The vectorizer may have normalised tokens while fitting (lower-casing,
    # stripping the angle brackets of the placeholders), so also look up the
    # features its own analyzer produces for this candidate.
    columns = [vocabulary[feature] for feature in analyzer(candidate) if feature in vocabulary]
    if columns:
      return columns

  return []


def _phrase_vector_model1(phrase, vectorizer, tf_idf, analyzer):
  """ Sums the tf.idf columns of every word in the phrase into one (1, n_rows) vector. """
  total = coo_matrix((1, tf_idf.shape[0]), dtype=float)

  for word in phrase.split():
    for column in _tfidf_columns_for_word(word, vectorizer, analyzer):
      # A word's vector is its column of the tf.idf matrix, laid out as a row.
      total = total + tf_idf[:, column].reshape(1, -1)

  return total


def calculate_similarity_model1(phrase1, phrase2, vectorizer, tf_idf):
  """ Calculates the cosine similarity between 2 phrases.

      Each phrase is split on whitespace, the tf.idf column vectors of its
      words are summed, and the similarity between the two summed vectors
      is returned.

      Words without a column fall back to the '<UKN>' placeholder; if that is
      missing too, the word contributes nothing. (Previously the lookup
      returned -1 in that case, which silently selected the last column of
      the matrix.)

      Args:
        phrase1, phrase2: the phrases to compare (strings).
        vectorizer: the fitted vectorizer; its `vocabulary_` maps a term to a
          column of `tf_idf`.
        tf_idf: the sparse matrix returned by fitting `vectorizer`.

      Returns:
        The cosine similarity as a float.
  """
  analyzer = vectorizer.build_analyzer()

  vector1 = _phrase_vector_model1(phrase1, vectorizer, tf_idf, analyzer)
  vector2 = _phrase_vector_model1(phrase2, vectorizer, tf_idf, analyzer)

  # Return the similarity
  return cosine_similarity(vector1, vector2)[0, 0]

In [20]:
def evaluate_model1(vectorizer, tf_idf, input_path, output_path, validation=True):
  """ Scores every phrase pair of a validation/test file with model 1.

      The header-less CSV at `input_path` is read, the similarity of each
      (word1, word2) pair is computed with calculate_similarity_model1, and
      the (id, similarity) pairs are written to `output_path` without a
      header or index.
  """

  # With validation=True the file is read with a fourth column, 'similarity',
  # which is overwritten below by the model's own score.
  column_headers = ['id', 'word1', 'word2']
  if validation:
    column_headers.append('similarity')

  evaluation_df = pd.read_csv(input_path, header=None, names=column_headers)

  def score_row(row):
    return calculate_similarity_model1(row['word1'], row['word2'], vectorizer, tf_idf)

  evaluation_df['similarity'] = evaluation_df.apply(score_row, axis=1)

  # Only the id and the predicted similarity are persisted
  evaluation_df[['id', 'similarity']].to_csv(output_path, index=False, header=False)

In [21]:
# Join the tokens (to allow TfidfVectorizer to create a tf.idf matrix)
df['text'] = df['plot_synopsis'].apply(' '.join)

# Fit a tf.idf matrix over the joined synopses with TfidfVectorizer's default settings.
# NOTE(review): no ngram_range is passed, and scikit-learn documents the default
# as unigrams only, so bigrams are not included — pass ngram_range=(1, 2) if intended.
# NOTE(review): the default lower-casing and token pattern presumably turn the
# '<NAME>' / '<UKN>' placeholders into 'name' / 'ukn' features — confirm
# against vectorizer.vocabulary_.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(df['text'])

In [22]:
# Number of distinct terms the vectorizer learned
len(vectorizer.vocabulary_)

25678

Validation dataset:

In [23]:
# Time one full scoring pass of model 1 over the validation pairs
start_time = time.time()
evaluate_model1(
    vectorizer,
    tf_idf,
    validation_dataset_path,
    '10768356-Task1-method-a-validation.csv',
)
end_time = time.time()

# Report the wall-clock duration
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time} seconds")

Time taken: 2.983353853225708 seconds


Test dataset:

In [24]:
# Time one full scoring pass of model 1 over the test pairs
start_time = time.time()
evaluate_model1(
    vectorizer,
    tf_idf,
    test_dataset_path,
    '10768356-Task1-method-a.csv',
    validation=False,
)
end_time = time.time()

# Report the wall-clock duration
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time} seconds")

Time taken: 2.181042194366455 seconds


**Model 2: b) A dense static representation (Word2Vec)**

In [26]:
def _mean_vector_model2(model, phrase):
  """ Returns the mean embedding of the words in a phrase.

      Each word gets the same treatment as the training text (lower-cased,
      lemmatized, names masked as '<NAME>'); words the model does not know
      are replaced by '<UKN>'. A phrase with no usable word yields a zero
      vector.
  """
  keyed_vectors = model.wv
  # Dict lookup; testing against the index_to_key list is a linear scan.
  known_words = keyed_vectors.key_to_index

  vectors = []
  for word in phrase.split():
    word = lemmatizer.lemmatize(word.lower())
    if word in names:
      word = '<NAME>'
    if word not in known_words:
      word = '<UKN>'

    # '<UKN>' itself can be absent (e.g. pruned by the model's minimum
    # count); skip the word instead of raising KeyError.
    if word in known_words:
      vectors.append(keyed_vectors[word])

  if not vectors:
    return np.zeros(model.vector_size)

  # Average over the word vectors only (no zero seed row in the mean).
  return np.mean(vectors, axis=0, dtype=np.float64)


def calculate_similarity_model2(model, phrase1, phrase2):
  """ Calculates the cosine similarity between 2 phrases.

      It uses the mean vector of the words in each phrase to
      calculate the similarity.

      Args:
        model: the trained model; word vectors are read from `model.wv` and
          the embedding size from `model.vector_size`.
        phrase1, phrase2: the phrases to compare (strings, split on whitespace).

      Returns:
        The cosine similarity as a float.
  """
  vector1 = _mean_vector_model2(model, phrase1)
  vector2 = _mean_vector_model2(model, phrase2)

  # Return the similarity
  return cosine_similarity(vector1.reshape(1, -1), vector2.reshape(1, -1))[0, 0]

In [27]:
def evaluate_model2(model, input_path, output_path, validation=True):
  """ Evaluates the model on a validation/test set.

      It does the following:
        1- Reads the input file
        2- Calculate the similarity between the given pairs.
        3- Store the results in the specified path.

      Args:
        model: the trained model handed to calculate_similarity_model2.
        input_path: header-less CSV with the phrase pairs.
        output_path: where the (id, similarity) rows are written.
        validation: when True the file is read with a fourth 'similarity'
          column; when False only 'id', 'word1', 'word2' are expected.
  """

  if validation:
    column_headers = ['id', 'word1', 'word2', 'similarity']
  else:
    column_headers = ['id', 'word1', 'word2']

  # Read the pairs using the headers selected above, so the `validation`
  # flag takes effect (same behaviour as evaluate_model1).
  evaluation_df = pd.read_csv(input_path, header=None, names=column_headers)

  # Calculate the similarity for each pair; this creates the 'similarity'
  # column, or overwrites the one read from a validation file.
  evaluation_df['similarity'] = evaluation_df.apply(lambda row: calculate_similarity_model2(model, row['word1'], row['word2']), axis=1)

  # Store the 'id' and 'similarity' to a csv file
  result_df = evaluation_df[['id', 'similarity']]
  result_df.to_csv(output_path, index=False, header=False)

In [28]:
# Fit Word2Vec on the tokenised synopses.
word2vec_model = Word2Vec(
    sentences=df['plot_synopsis'],
    vector_size=100,  # dimensionality of the word vectors (embedding size)
    window=10,        # maximum distance between the current and predicted word within a sentence
    sg=1,             # sg=1: skip-gram model is used
    epochs=5,         # number of iterations over the dataset during training
)

Validation dataset:

In [29]:
# Time one full scoring pass of model 2 over the validation pairs
start_time = time.time()
evaluate_model2(
    word2vec_model,
    validation_dataset_path,
    '10768356-Task1-method-b-validation.csv',
)
end_time = time.time()

# Report the wall-clock duration
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time} seconds")

Time taken: 0.24012970924377441 seconds


Test dataset:

In [30]:
# Time one full scoring pass of model 2 over the test pairs
start_time = time.time()
evaluate_model2(
    word2vec_model,
    test_dataset_path,
    '10768356-Task1-method-b.csv',
    validation=False,
)
end_time = time.time()

# Report the wall-clock duration
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time} seconds")

Time taken: 0.15571928024291992 seconds
