In [127]:
import pandas as pd
import nltk
import string
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from collections import defaultdict
from scipy.sparse import coo_array, csr_array
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [128]:
vocab = set()
tokens = []

In [129]:
def get_vocab_tokens(text):
  row_tokens = word_tokenize(text)
  tokens.append(row_tokens)
  vocab.update(row_tokens)

In [130]:
pd.read_csv('data/Training-dataset.csv')['plot_synopsis'].apply(get_vocab_tokens)

0       None
1       None
2       None
3       None
4       None
        ... 
8252    None
8253    None
8254    None
8255    None
8256    None
Name: plot_synopsis, Length: 8257, dtype: object

In [131]:
vocab_list = sorted(vocab)
index_from_word = {word:index for (index, word) in enumerate(vocab_list)}
word_from_index = {index:word for (index, word) in enumerate(vocab_list)}

###Method A

In [132]:
def get_term_context_matrix(tokens, window_size):
  contextfreqs = defaultdict(int)

  # iterate through each token in each document
  for document in tokens:
    for i in range(len(document)):
      # iterate through each token in context (+/- window_size on each size)
      context_start = max(i-window_size, 0)
      context_stop = min(i+window_size+1, len(document))
      for context_i in range(context_start, context_stop):
        if context_i == i: continue # don't include the token of interest in the context
        # incremenet count for that term and context word
        contextfreqs[(index_from_word[document[i]], index_from_word[document[context_i]])] += 1

  # convert key tuples to two lists, one for row, one for column
  keys = np.array(list(contextfreqs.keys())).T.tolist()
  rows = keys[0]
  columns = keys[1]

  # create sparse term-context matrix
  return coo_array((list(contextfreqs.values()), (rows, columns)), shape=(len(vocab), len(vocab))).tocsr()

In [133]:
def calculate_ppmi(term_context, smoothing):
  # sum of every frequency
  smoothed_sum = term_context.sum() + (smoothing * term_context.shape[0] * term_context.shape[1])

  # normalise term-context matrix to get probabilities
  p_term_context = term_context.copy()
  p_term_context.data = (term_context.data + smoothing)/smoothed_sum

  # row and column probability sums with smoothing
  p_terms_sum = (term_context.sum(axis=1) + (smoothing * term_context.shape[1]))/smoothed_sum
  p_context_sum = (term_context.sum(axis=0) + (smoothing * term_context.shape[0]))/smoothed_sum

  # calculate ppmi
  ppmi_term_context = p_term_context.copy()
  ppmi_term_context.data = np.log2((p_term_context/(p_terms_sum * p_context_sum)).data) # calculate pmi values
  ppmi_term_context = ppmi_term_context.maximum(0)  # replace negative pmi values with 0 to get ppmi
  return ppmi_term_context

In [134]:
# calculate ppmi
window_size = 3
add_smoothing = 2

term_context = get_term_context_matrix(tokens, window_size)
ppmi_term_context = calculate_ppmi(term_context, add_smoothing)

In [135]:
def ppmi_vector(word):
  # tokenise in case input is actually multi-word
  word_tokens = [ppmi_term_context[[index_from_word[t]],:] for t in word_tokenize(word) if t in vocab]

  # if word_tokens is empty
  if not word_tokens:
    return None

  # sum the representation vectors
  return np.array(word_tokens).sum(axis=0)

In [136]:
def row_similarity(row, vector_func):
  id = row[0]

  # get the vector representations for each word
  word1 = vector_func(row[1])
  word2 = vector_func(row[2])

  if (word1 is not None) and (word2 is not None):
    similarity = cosine_similarity(word1, word2).item()
  else:
    similarity = 0.5
  return [id, similarity]

In [137]:
def get_results(inputf, outputf, vector_func, run_eval=False):
  # read in validation/test dataset
  input = pd.read_csv(inputf, header=None, usecols=[0, 1, 2])

  # calculate similarity for each word pair
  results = input.apply(row_similarity, axis=1, result_type='expand', vector_func=vector_func)
  results[0] = results[0].astype('int') # change ids back to ints

  # write results to csv
  results.to_csv(outputf, header=False, index=False)

  # run evaluation script
  if run_eval:
    !python data/task1_eval_script_student_version.py {outputf} {inputf}

In [138]:
get_results('data/Task-1-validation-dataset.csv', 'data/10491450-Task1-method-a-validation.csv', ppmi_vector, run_eval=True)

The following simalarity scores may need checking:
(acquire,obtain) similarity score: 0.17012102010353683, gold ranking: 8.57
(acquire,find) similarity score: 0.17115432656301724, gold ranking: 6.38
----------------------------
(apple,sauce) similarity score: 0.01886705895299194, gold ranking: 1.43
(apple,lemon) similarity score: 0.010063297230987296, gold ranking: 4.05
----------------------------
(area,region) similarity score: 0.20092632914782163, gold ranking: 9.47
(area,corner) similarity score: 0.29451624292394335, gold ranking: 2.07
----------------------------
(arm,shoulder) similarity score: 0.41052417978217554, gold ranking: 4.85
(arm,body) similarity score: 0.46262796295190506, gold ranking: 4.05
----------------------------
(arm,shoulder) similarity score: 0.41052417978217554, gold ranking: 4.85
(arm,neck) similarity score: 0.4821267246409147, gold ranking: 1.58
----------------------------
(arm,body) similarity score: 0.46262796295190506, gold ranking: 4.05
(arm,neck) simi

###Method B

In [139]:
w2v_model = Word2Vec(tokens, vector_size=100, window=3, min_count=1, sample=0.005)

In [140]:
def word2vec_vector(word):
  wv = w2v_model.wv

  # tokenise in case input is actually multi-word
  word_tokens = [wv[t].reshape((1, -1)) for t in word_tokenize(word) if wv.has_index_for(t)]

  # if word_tokens is empty
  if not word_tokens:
    return None

  # sum the representation vectors
  return np.array(word_tokens).sum(axis=0)

In [141]:
get_results('data/Task-1-validation-dataset.csv', 'data/10491450-Task1-method-b-validation.csv', word2vec_vector, run_eval=True)

The following simalarity scores may need checking:
(absorb,learn) similarity score: 0.3720565736293793, gold ranking: 5.48
(absorb,withdraw) similarity score: 0.7458263039588928, gold ranking: 2.97
----------------------------
(acquire,get) similarity score: 0.5517366528511047, gold ranking: 8.82
(acquire,obtain) similarity score: 0.8811715245246887, gold ranking: 8.57
----------------------------
(apple,sauce) similarity score: 0.5571714043617249, gold ranking: 1.43
(apple,lemon) similarity score: 0.4903217554092407, gold ranking: 4.05
----------------------------
(apple,lemon) similarity score: 0.4903217554092407, gold ranking: 4.05
(apple,sunshine) similarity score: 0.5502339005470276, gold ranking: 0.58
----------------------------
(arm,shoulder) similarity score: 0.8811830282211304, gold ranking: 4.85
(arm,neck) similarity score: 0.8945770859718323, gold ranking: 1.58
----------------------------
(arm,body) similarity score: 0.7463340163230896, gold ranking: 4.05
(arm,neck) simila

###Results

In [142]:
get_results("data/Task-1-test-dataset2.csv", "data/10491450-Task1-method-a.csv", ppmi_vector)

In [143]:
get_results("data/Task-1-test-dataset2.csv", "data/10491450-Task1-method-b.csv", word2vec_vector)