# Computing similarity between terms

#  1. Set up Environment

## Required Imports

In [1]:
import pandas as pd

import numpy as np

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from gensim.models import Word2Vec

from tqdm import tqdm

from enum import Enum

from random import shuffle

from collections import defaultdict

from time import time

from datetime import timedelta

## Required NLTK Packages

In [2]:
# Sentence tokenizer models (needed by sent_tokenize)
nltk.download('punkt')
# WordNet corpus (needed by WordNetLemmatizer)
nltk.download('wordnet')
# Part-of-speech tagger model (needed by pos_tag)
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\klouc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\klouc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\klouc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Constants

In [3]:
# Directory that holds the dataset CSV files, and the file names inside it
CWD = "./data"
TRAINING_DATA = 'Training-dataset.csv'
VALIDATION_DATA = 'Validation-dataset-terms.csv'
TEST_DATA = 'Test-dataset-terms.csv'

# Column names: the synopsis text in the training data and the column that
# receives the computed similarity scores
PLOT_SYNOPSIS = 'plot_synopsis'
PREDICTION = 'prediction'

# Available term vectorization methods
Method = Enum('Method', ['PPMI', 'WORD2VEC'])
# Datasets whose term pairs can be scored. The declared enum name matches the
# variable it is bound to, so reprs read `Data.TEST` and pickling can resolve it.
Data = Enum('Data', ['VALIDATION', 'TEST'])

# 2. Load in Data

## Training Data

In [4]:
# Read the raw training set and report how many rows it has
train_data = pd.read_csv(f"{CWD}/{TRAINING_DATA}")
print(f'Training movie data count: {len(train_data)}')

# Discard rows with missing values and renumber the remaining rows from 0
train_data = train_data.dropna().reset_index(drop=True)
print(f'Training movie data count after NaN check: {len(train_data)}')

train_data.head(10)

Training movie data count: 8257
Training movie data count after NaN check: 8257


Unnamed: 0,ID,title,plot_synopsis,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,8f5203de-b2f8-4c0c-b0c1-835ba92422e9,Si wang ta,"After a recent amount of challenges, Billy Lo ...",0,0,0,0,1,1,0,0,1
1,6416fe15-6f8a-41d4-8a78-3e8f120781c7,Shattered Vengeance,"In the crime-ridden city of Tremont, renowned ...",0,0,0,0,1,1,1,0,1
2,4979fe9a-0518-41cc-b85f-f364c91053ca,L'esorciccio,Lankester Merrin is a veteran Catholic priest ...,0,1,0,0,0,0,0,0,0
3,b672850b-a1d9-44ed-9cff-025ee8b61e6f,Serendipity Through Seasons,"""Serendipity Through Seasons"" is a heartwarmin...",0,0,0,0,0,0,1,0,0
4,b4d8e8cc-a53e-48f8-be6a-6432b928a56d,The Liability,"Young and naive 19-year-old slacker, Adam (Jac...",0,0,1,0,0,0,0,0,0
5,2fbcdf4a-1c10-4958-a175-985d226f5906,Savage Vengance,Katie Carter (Dallender) is an aspiring model ...,0,0,0,0,1,0,0,0,0
6,60298c01-41d0-4e12-a203-b5a1feb78943,The Snake Pit,Virginia Cunningham (Olivia de Havilland) is a...,0,0,1,0,0,0,0,0,0
7,084f6cc3-e4e2-4f1a-bcbb-b26dbdd2762f,Shadows of Betrayal,In the dark and gritty underbelly of a sprawli...,0,0,0,0,1,0,0,0,1
8,a198118a-564e-42f8-8df2-0cbec828aa2f,Kakushi ken oni no tsume,"The film takes place in Japan in the 1860s, a ...",0,0,0,0,1,1,0,0,0
9,541bbc68-5628-43a3-9f83-49c7900c2e57,Intolerable Cruelty,Donovan Donaly (Geoffrey Rush) a TV soap opera...,1,0,0,0,0,1,1,0,0


## Validation Data

In [5]:
# The validation file has no header row, so columns are labelled 0, 1, 2, ...
val_data = pd.read_csv(f"{CWD}/{VALIDATION_DATA}", header=None)
print(f'Validation movie data count: {len(val_data)}')

# Discard rows with missing values and renumber the remaining rows from 0
val_data = val_data.dropna().reset_index(drop=True)
print(f'Validation movie data after NaN check count: {len(val_data)}')

val_data.head(10)

Validation movie data count: 150
Validation movie data after NaN check count: 150


Unnamed: 0,0,1,2,3
0,1,absorb,learn,5.48
1,2,absorb,withdraw,2.97
2,3,achieve,accomplish,8.57
3,4,achieve,try,4.42
4,6,acquire,get,8.82
5,7,acquire,obtain,8.57
6,8,acquire,find,6.38
7,11,apple,sauce,1.43
8,12,apple,lemon,4.05
9,13,apple,sunshine,0.58


## Test Data

In [6]:
# The test file has no header row, so columns are labelled 0, 1, 2
test_data = pd.read_csv(f"{CWD}/{TEST_DATA}", header=None)
print(f'Test movie data count: {len(test_data)}')

# Discard rows with missing values and renumber the remaining rows from 0
test_data = test_data.dropna().reset_index(drop=True)
print(f'Test movie data after NaN check count: {len(test_data)}')

test_data.head(10)

Test movie data count: 102
Test movie data after NaN check count: 102


Unnamed: 0,0,1,2
0,816,accept,acknowledge
1,957,accept,recommend
2,809,agree,argue
3,911,agree,please
4,242,alcohol,cocktail
5,697,alcohol,wine
6,2066,announcement,news
7,2164,announcement,effort
8,14,bad,terrible
9,51,bad,great


## Extract Plot Synopses

In [7]:
# Keep only the synopsis text column; it is the corpus for both methods
plot_synopses = train_data.loc[:, PLOT_SYNOPSIS]
plot_synopses.head(n=10)

0    After a recent amount of challenges, Billy Lo ...
1    In the crime-ridden city of Tremont, renowned ...
2    Lankester Merrin is a veteran Catholic priest ...
3    "Serendipity Through Seasons" is a heartwarmin...
4    Young and naive 19-year-old slacker, Adam (Jac...
5    Katie Carter (Dallender) is an aspiring model ...
6    Virginia Cunningham (Olivia de Havilland) is a...
7    In the dark and gritty underbelly of a sprawli...
8    The film takes place in Japan in the 1860s, a ...
9    Donovan Donaly (Geoffrey Rush) a TV soap opera...
Name: plot_synopsis, dtype: object

## Helper Functions

In [8]:
def get_cosine_similarity(first_term, second_term, method):
  """
  Get the cosine similarity between two terms (can be multi-worded) by first
  creating vector representations for them using one of the pre-defined methods
  (PPMI or word2vec).

  The vectors come from the module-level `ppmi_vectorizer` or
  `word2vec_vectorizer` object, so the one matching `method` must have been
  created before this function is called.

  Parameters
  ----------
  first_term : str
      The first term, if multi-worded the different words are separated by
      spaces.
  second_term : str
      The second term, if multi-worded the different words are separated by
      spaces.
  method : Method
      The method with which to vectorize the terms, can only be word2vec or
      PPMI.

  Returns
  -------
  float or None
      The cosine similarity between the two vectors that represent the two
      terms, or None (after printing a message) if `method` is not a member
      of Method.
  """
  # Validate requested method. isinstance is used instead of `in Method`
  # because the membership test raises TypeError for non-enum values on
  # Python versions before 3.12.
  if not isinstance(method, Method):
    print("Please choose one of the valid methods (PPMI or WORD2VEC).")
    return None

  # Pick the vectorizer lazily so that only the one actually requested has to
  # exist in the notebook's namespace
  if method is Method.PPMI:
    vectorizer = ppmi_vectorizer
  else:
    vectorizer = word2vec_vectorizer

  # Get vector representations of terms using specified method and reshape
  # them into the (1, n_features) matrix format cosine_similarity expects
  first_term_vector = vectorizer.get_term_vector(first_term).reshape(1, -1)
  second_term_vector = vectorizer.get_term_vector(second_term).reshape(1, -1)

  # Get similarity score and extract value from (1, 1)-sized matrix
  similarity_score = cosine_similarity(first_term_vector,
                                      second_term_vector)

  return similarity_score[0, 0]


def get_comparison_with_csv(method, data):
  """
  Compute the cosine similarities between all term pairs within a csv file.

  The scores are written into a PREDICTION column on the module-level
  `val_data` or `test_data` frame (overwriting the column if it already
  exists), and the time taken is printed.

  Parameters
  ----------
  method : Method
      The method with which to vectorize the terms, can only be word2vec or
      PPMI.
  data : Data
      The type of data to use when looking for which csv to extract term pairs
      from. Can only be from the validation or test data.

  Returns
  -------
  pandas.DataFrame or None
      A new DataFrame holding column 0 of the chosen dataset and the
      PREDICTION column with the cosine similarity of each term pair.
      None (after printing a message) if `method` or `data` is invalid.
  """
  # Validate requested method. isinstance is used instead of `in Method`
  # because the membership test raises TypeError for non-enum values on
  # Python versions before 3.12.
  if not isinstance(method, Method):
    print("Please choose one of the valid methods (PPMI or WORD2VEC).")
    return None

  # Validate requested data file; never fall back silently to a default set
  if not isinstance(data, Data):
    print("Please choose one of the valid datasets (VALIDATION or TEST).")
    return None

  # Set data to compute similarity for
  comparison_data = test_data if data is Data.TEST else val_data

  # Columns 1 and 2 hold the two terms of each pair (files have no header)
  start_time = time()
  comparison_data[PREDICTION] = comparison_data.apply(
      lambda row: get_cosine_similarity(row[1], row[2], method),
      axis=1)
  end_time = time()

  # Calculate elapsed time and print it without the leading hours field
  elapsed_time = str(timedelta(seconds=end_time - start_time))
  print("Time taken to compute similarities for given data: " +
        f"{elapsed_time[elapsed_time.find(':') + 1:]}")

  return comparison_data[[0, PREDICTION]]


# 3. Implement Methods

## METHOD A) Sparse Representation
### PPMI (Bag of Words with Positive Pointwise Mutual Information)

### Implement and initialise PPMI class

In [9]:
class PPMIVectorizer():
  """
  A custom class for PPMI that allows to dynamically generate sparse vector
  representations for terms (including multi-word terms) by going through
  a corpus with a specific context window length hyperparameter.

  Attributes
  ----------
  term_term_count : defaultdict of tuple of str and str to int
      How often two terms appear within each other's context window. Each key
      is the pair of terms in sorted order.
  term_sums : defaultdict of str to int
      The occurrence sums for each term, i.e. how many counted pairs the term
      took part in.
  total_occurrence_sum : int
      A sum of all term occurrences (two per counted pair).
  vocabulary : set of str, or sorted list of str after counting
      The unique words within the preprocessed corpus. Its order defines the
      order of the dimensions of every term vector.
  smoother : float
      Small value to apply smoothing to vectors.

  Methods
  -------
  preprocess_corpus(corpus)
      Preprocess a given corpus and return a list of tokenised sentences.
  count_terms_with_context(corpus, context_window_len)
      Populate PPMI's signature attributes using the context window length.
  get_term_vector(term)
      Dynamically compute the vector representation of a term (can be multi-
      worded).

  """
  def __init__(self):
    """
    Initialize the PPMIVectorizer object.
    """
    self.term_term_count = defaultdict(int)
    self.term_sums = defaultdict(int)
    self.total_occurrence_sum = 0
    self.vocabulary = set()
    self.smoother = 1e-8

  @staticmethod
  def _to_wordnet_pos(penn_tag):
    """
    Convert a Penn Treebank POS tag into the matching WordNet POS letter
    ('a', 'n', 'r' or 'v'); any other tag is treated as a noun.
    """
    initial = penn_tag[0].lower()
    if initial == 'j':
      return 'a'
    if initial in ('n', 'r', 'v'):
      return initial
    return 'n'

  def preprocess_corpus(self, corpus):
    """
    Preprocess a given corpus and return a list of tokenised sentences.

    Every document is split into sentences, each sentence into `\\w+` tokens,
    and every token is lemmatized using its part-of-speech tag. Sentences
    with fewer than two tokens are dropped.

    Parameters
    ----------
    corpus : pandas.Series of str
        Series of strings containing all documents which need to be
        preprocessed.

    Returns
    -------
    list of list of str
        A list of lists of strings that are the lemmatized words from the
        corpus, one inner list per sentence.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()
    preprocessed_sentences = []

    print('Processing texts...')
    for text in tqdm(corpus):
      # NOTE(review): the text is not lower-cased here, so the resulting
      # vocabulary is case-sensitive ("Apple" and "apple" are different
      # terms) - confirm this is intended.
      for sent in sent_tokenize(text):
        # Tokenise words and tag their POS (Penn Treebank schema)
        words_tagged = pos_tag(word_tokenizer.tokenize(sent))
        # Lemmatize each word using its tag converted to the wordnet schema
        words = [lemmatizer.lemmatize(word, self._to_wordnet_pos(tag))
                 for word, tag in words_tagged]
        # If sentence too short no context available, skip
        if len(words) < 2:
          continue

        preprocessed_sentences.append(words)
    return preprocessed_sentences

  def count_terms_with_context(self, corpus, context_window_len=1):
    """
    Populate PPMI's signature attributes using the context window length.

    For every token, each of the next `context_window_len` tokens of the same
    sentence is counted as one co-occurrence. Counts are added on top of any
    counts collected by earlier calls. The vocabulary is stored as a sorted
    list afterwards and its size is printed.

    Parameters
    ----------
    corpus : pandas.Series of str
        Series of strings containing all documents.
    context_window_len : int
        The length of the context window which would indicate if a term falls
        within another term's context.
    """
    preprocessed_sentences = self.preprocess_corpus(corpus)

    # The vocabulary is a sorted list after a previous call; collect into a
    # set again so the method can be called more than once
    vocabulary = set(self.vocabulary)

    for tokenised_sent in preprocessed_sentences:
      for i, curr_word in enumerate(tokenised_sent):
        vocabulary.add(curr_word)
        # Get subsequent tokens within context window
        subsequent_tokens = tokenised_sent[i+1 : i+1+context_window_len]
        for subsequent_token in subsequent_tokens:
          # Sorted key so that (a, b) and (b, a) share one counter
          key = tuple(sorted([subsequent_token, curr_word]))
          self.total_occurrence_sum += 2
          self.term_term_count[key] += 1
          self.term_sums[curr_word] += 1
          self.term_sums[subsequent_token] += 1

    # Sort vocabulary so vector dimensions have a stable order
    self.vocabulary = sorted(vocabulary)
    print(f'Vocabulary count: {len(self.vocabulary)}')

  def get_term_vector(self, term):
    """
    Dynamically compute the vector representation of a term (can be multi-
    worded). To get the multi-word term's vector a mean of all the vectors
    of the words that make up the term is computed.

    Each dimension holds the positive pointwise mutual information between
    the word and one vocabulary entry, plus the smoothing value. Words that
    were never counted get a vector that contains only the smoothing value.

    Parameters
    ----------
    term : str
        The term that needs to be vectorized (can be multi-worded, with the
        words separated by single spaces).

    Returns
    -------
    numpy.ndarray of floats
        A sparse vector representation of a term (can be multi-worded), with
        one entry per vocabulary word.
    """
    total = self.total_occurrence_sum
    word_vectors = []

    # Split term into words in case it's a multi-word term
    for word in term.split(' '):
      # .get() is used for every lookup: indexing a defaultdict would insert
      # a zero entry for each missing key and grow the tables on every call
      term_sum = self.term_sums.get(word, 0)
      word_vector = np.full(len(self.vocabulary), self.smoother, dtype=float)

      for index, dict_term in enumerate(self.vocabulary):
        # Both key orders are summed; for word == dict_term this doubles the
        # count, matching how self-pairs are counted in term_sums
        cooccurrence = self.term_term_count.get((word, dict_term), 0) + \
                       self.term_term_count.get((dict_term, word), 0)
        if cooccurrence == 0:
          continue

        curr_term_sum = self.term_sums.get(dict_term, 0)
        pmi_val = np.log((cooccurrence / total) /
                         ((term_sum / total) * (curr_term_sum / total)))
        # Keep only positive values (the first "P" in PPMI)
        if pmi_val > 0:
          word_vector[index] += pmi_val

      word_vectors.append(word_vector)

    if len(word_vectors) == 1:
      return word_vectors[0]
    # Multi-word term: average the vectors of its words
    return np.mean(np.array(word_vectors), axis=0)

Initialise PPMI class with corpus and context window

In [None]:
# Time how long it takes to collect the co-occurrence statistics
start_time = time()
ppmi_vectorizer = PPMIVectorizer()
ppmi_vectorizer.count_terms_with_context(corpus=plot_synopses, context_window_len=2)
end_time = time()

# Report the elapsed time with the leading hours field stripped off
elapsed_time = str(timedelta(seconds=end_time - start_time))
print(f"Elapsed Time: {elapsed_time.split(':', 1)[1]}")

### Compute Similarities Between Terms

Compute similarities in validation data

In [None]:
# Score every validation term pair with the PPMI vectors
predictions_ppmi_val = get_comparison_with_csv(method=Method.PPMI, data=Data.VALIDATION)

Time taken to compute similarities for given data: 01:08.376483


Compute similarities in test data

In [None]:
# Score every test term pair with the PPMI vectors
predictions_ppmi_test = get_comparison_with_csv(method=Method.PPMI, data=Data.TEST)


Time taken to compute similarities for given data: 00:51.243617


### Save Results

In [None]:
# Write the PPMI predictions as bare CSV rows (no header line, no index column)
predictions_ppmi_val.to_csv('ppmi_predictions_validation.csv', index=False, header=False)
predictions_ppmi_test.to_csv('ppmi_predictions_test.csv', index=False, header=False)

In [None]:
# Run the evaluation script on the saved PPMI validation predictions; the
# second argument is the validation dataset file.
!python3 scripts/term_similarity_eval_script.py ppmi_predictions_validation.csv data/Validation-dataset-terms.csv

## METHOD B) Dense Static Representation
### Word2Vec with Skip-gram

### Implement and Initialise Word2Vec Class

In [None]:
class Word2vecVectorizer():
  """
  A custom class for word2vec that allows to generate dense vector
  representations for terms (including multi-word terms) trained on a corpus.

  Attributes
  ----------
  model : gensim.Word2Vec or None
      A Word2Vec instance that holds all vector representations. None until
      create_model has been called.

  Methods
  -------
  preprocess_corpus(corpus)
      Preprocess a given corpus and return a list of tokenised sentences.
  create_model(corpus, context_window_len, vector_size, sg, epochs)
      Train word2vec on corpus with chosen hyperparameters.
  get_term_vector(term)
      Retrieve the vector representation of a term (can be multi-worded).

  """
  def __init__(self):
    """
    Initialize the Word2vecVectorizer object without a trained model.
    """
    self.model = None

  def preprocess_corpus(self, corpus):
    """
    Preprocess a given corpus and return a list of tokenised sentences.

    Every document is lower-cased, split into sentences, and each sentence
    is split into `\\w+` tokens.

    Parameters
    ----------
    corpus : pandas.Series of str
        Series of strings containing all documents which need to be
        preprocessed.

    Returns
    -------
    list of list of str
        A list of lists of strings that are the tokenised words from the
        corpus, one inner list per sentence.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    tokenized_sentences = []
    for text in tqdm(corpus):
      # Apply lower case, then tokenize sentences
      for sent in sent_tokenize(text.lower()):
        # Tokenise each word
        tokenized_sentences.append(word_tokenizer.tokenize(sent))
    return tokenized_sentences

  def create_model(self,
                   corpus,
                   context_window_len=3,
                   vector_size=200,
                   sg=1,
                   epochs=5):
    """
    Train word2vec on corpus with chosen hyperparameters and store the
    trained model in `self.model`.

    Parameters
    ----------
    corpus : pandas.Series of str
        Series of strings containing all documents which will be used in
        training the word2vec model.
    context_window_len : int
        The length of the context window which would indicate if a term falls
        within another term's context.
    vector_size : int
        The dimensionality of the resulted vector representation of a term.
    sg : int
        Flag that indicates to either use CBOW or Skip-gram to train the
        word2vec model. Use 0 for the former, 1 for the latter.
    epochs : int
        Count of epochs for which the model will be trained.
    """
    # Train Word2Vec model. Only the first four arguments after `sentences`
    # that come from this method's parameters vary; the rest are fixed.
    # min_count=5 is passed through as-is, so rare words may have no vector.
    self.model = Word2Vec(sentences=self.preprocess_corpus(corpus),
                          corpus_file=None,
                          vector_size=vector_size,
                          min_alpha=0.0001,
                          alpha=0.025,
                          window=context_window_len,
                          min_count=5,
                          max_vocab_size=None,
                          sample=0.001,
                          seed=1,
                          workers=3,
                          sg=sg,
                          hs=0,
                          negative=5,
                          ns_exponent=0.75,
                          cbow_mean=1,
                          epochs=epochs,
                          null_word=0,
                          trim_rule=None,
                          sorted_vocab=1,
                          batch_words=10000,
                          compute_loss=False,
                          callbacks=(),
                          comment=None,
                          max_final_vocab=None,
                          shrink_windows=True)

  def get_term_vector(self, term):
    """
    Retrieve the vector representation of a term (can be multi-
    worded). To get the multi-word term's vector a mean of all the vectors
    of the words that make up the term is computed.

    Words that the model has no vector for are represented by a zero vector
    of the model's dimensionality.

    Parameters
    ----------
    term : str
        The term that needs to be vectorized (can be multi-worded, with the
        words separated by single spaces).

    Returns
    -------
    numpy.ndarray of floats
        A dense vector representation of a term (can be multi-worded).
    """
    word_vectors_lookup = self.model.wv
    word_vectors = []

    # Split term into words in case it's a multi-word term
    for word in term.split(' '):
      if word in word_vectors_lookup:
        word_vectors.append(np.array(word_vectors_lookup[word]))
      else:
        # Size the fallback from the trained model rather than a fixed 200,
        # so it stays correct for any vector_size passed to create_model
        word_vectors.append(np.zeros(word_vectors_lookup.vector_size))

    if len(word_vectors) == 1:
      return word_vectors[0]
    # Multi-word term: average the vectors of its words
    return np.mean(np.array(word_vectors), axis=0)

Initialise word2vec class with corpus, context window, vector dimensionality, implementation (CBOW or Skip-gram) and epochs

In [None]:
# Time how long preprocessing plus training of the word2vec model takes
start_time = time()
word2vec_vectorizer = Word2vecVectorizer()
word2vec_vectorizer.create_model(corpus=plot_synopses,
                                 context_window_len=3,
                                 vector_size=200,
                                 sg=1,
                                 epochs=5)
end_time = time()

# Report the elapsed time with the leading hours field stripped off
elapsed_time = str(timedelta(seconds=end_time - start_time))
print(f"Elapsed Time: {elapsed_time.split(':', 1)[1]}")

100%|██████████| 8257/8257 [00:11<00:00, 734.91it/s]


Elapsed Time: 02:28.520247


### Compute Similarities Between Terms

Compute similarities in validation data

In [None]:
# Score every validation term pair with the word2vec vectors
predictions_word2vec_val = get_comparison_with_csv(method=Method.WORD2VEC, data=Data.VALIDATION)

Time taken to compute similarities for given data: 00:00.047226


Compute similarities in test data

In [None]:
# Score every test term pair with the word2vec vectors
predictions_word2vec_test = get_comparison_with_csv(method=Method.WORD2VEC, data=Data.TEST)

Time taken to compute similarities for given data: 00:00.032098


### Save Results

In [None]:
# Write the word2vec predictions as bare CSV rows (no header line, no index column)
predictions_word2vec_val.to_csv('word2vec_predictions_validation.csv', index=False, header=False)
predictions_word2vec_test.to_csv('word2vec_predictions_test.csv', index=False, header=False)

In [None]:
# Run the evaluation script on the saved word2vec validation predictions; the
# second argument is the validation dataset file.
!python3 scripts/term_similarity_eval_script.py word2vec_predictions_validation.csv data/Validation-dataset-terms.csv