# TASK 1

### Import Libraries and Download Packages

In [3]:
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import word_tokenize, download
from nltk.util import ngrams
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import pandas as pd
import numpy as np
import time

In [4]:
# Download "stopwords" and "punkt" NLTK packages
download("stopwords")
download("punkt")
download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

### Data Initialisation


In [48]:
# Training data (replace filepath if change is needed)
train_data = pd.read_csv("./data/Training-dataset.csv")

# Testing data (replace filepath if change is needed)
# test_data = pd.read_csv("./data/Task-1-validation-dataset.csv", names=["ID", "Term A", "Term B", "Similarity"])
test_data = pd.read_csv("./data/Task-1-test-dataset1.csv", names=["ID", "Term A", "Term B", "Similarity"])

documents = train_data["plot_synopsis"].to_numpy()

test_data

Unnamed: 0,ID,Term A,Term B,Similarity
0,816,accept,acknowledge,
1,957,accept,recommend,
2,809,agree,argue,
3,911,agree,please,
4,242,alcohol,cocktail,
...,...,...,...,...
97,160,take,leave,
98,14,area,region,
99,16,area,corner,
100,4012,journey,long distance,


### Text Preprocessing Function

In [49]:
def text_preprocessing(document,
                       caseFolding=False,
                       removeStopwords=False,
                       useLemmatizer=False,
                       useStemmer=False):
  # Tokenization
  tokens = word_tokenize(document)

  # Case folding - convert every tokens to lowercase
  if caseFolding:
    tokens = [t.lower() for t in tokens]

  # Stop-words removal
  if removeStopwords:
    stop_words = set(stopwords.words("english"))
    tokens = [t for t in tokens if not t.lower() in stop_words]

  # Lemmatization
  if useLemmatizer:
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]

  # Stemming
  if useStemmer:
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(t) for t in tokens]

  preprocessed_document = " ".join(tokens)

  return preprocessed_document

## a) Bag of Words with tf*idf

### Words Preprocessing

In [50]:
# Documents preprocessing
# Estimated time : 40-90s

preprocessed_docs = [text_preprocessing(d, False, True) for d in documents]

### Model Implementation

In [51]:
# IDF smoothing enabled by default to prevent zero divisions

# Multi-word representation can be done by changing 'ngram_range'
# e.g. Unigram - (1, 1), Bigram - (2, 2), Unigram & Bigram - (1, 2)

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
matrixBoW = vectorizer.fit_transform(preprocessed_docs)
vocabulary = vectorizer.get_feature_names_out()
print("Vocabulary :", vocabulary)
print("Length :", len(vocabulary))

Vocabulary : ['00' '00 00' '00 000' ... '齊天大聖 great' '齐天大圣' '齐天大圣 traditional']
Length : 2659845


### Cosine Similarity Calculation

In [52]:
# Rows : documents
# Columns : words
matrixBoW.shape

(8257, 2659845)

In [53]:
test_data

Unnamed: 0,ID,Term A,Term B,Similarity
0,816,accept,acknowledge,
1,957,accept,recommend,
2,809,agree,argue,
3,911,agree,please,
4,242,alcohol,cocktail,
...,...,...,...,...
97,160,take,leave,
98,14,area,region,
99,16,area,corner,
100,4012,journey,long distance,


Define similarity calculation function

In [54]:
def calculate_similarity_a(termA, termB):

  # Get index of termA and termB
  try:
    termA_idx = list(vocabulary).index(termA)
    termB_idx = list(vocabulary).index(termB)
  except ValueError:
    # Deal with OOV words
    # Returns 0 instead
    return 0.0

  # Get matrices based on retrieved indices
  matrixA = matrixBoW[:, termA_idx].toarray().T
  matrixB = matrixBoW[:, termB_idx].toarray().T

  # Calculate and return cosine similarity between the 2 matrices
  return cosine_similarity(matrixA, matrixB).flatten()[0]

In [55]:
calculate_similarity_a("acquire", "obtain")

0.08163987558068528

In [56]:
result = []
time_taken = []

for data in test_data.values.tolist():

  # Calculate similarity score and measure time taken
  start = time.time()
  score = calculate_similarity_a(data[1], data[2])
  end = time.time()

  # Append result
  result.append(list((data[0], score)))
  time_taken.append(end - start)


print("Total time taken :", sum(time_taken), "seconds")
print("Result :", result)

Total time taken : 90.47821831703186 seconds
Result : [[816, 0.04090188834120051], [957, 0.040580648076480465], [809, 0.0706891708335188], [911, 0.03434293084141965], [242, 0.02720543261839694], [697, 0.008877992814267762], [2066, 0.037018241102301494], [2164, 0.01712031230974136], [14, 0.04257466407112351], [51, 0.0721361123950908], [176, 0.023493726692869792], [402, 0.14245627889785945], [169, 0.05525319447269097], [279, 0.0], [883, 0.06361760627138865], [966, 0.2513404867355678], [633, 0.09558906401769464], [2026, 0.07824850987188792], [2030, 0.0498470409510645], [2154, 0.006594550568486691], [726, 0.014329434986610202], [2008, 0.12072673857956585], [772, 0.02817216425234451], [2081, 0.011849036443350473], [189, 0.17745398903860637], [496, 0.3520646132779678], [24, 0.006296582030368813], [50, 0.015654690383843074], [2018, 0.0048043820445363514], [2019, 0.013766616135586381], [121, 0.0291816999965935], [709, 0.2325787892139486], [813, 0.04470311823695068], [977, 0.13399839714932685],

In [57]:
result_df = pd.DataFrame(result)
result_df.to_csv("10812451-Task1-method-a.csv", index=False, header=False)

## b) Word2Vec

### Words Preprocessing

In [62]:
# Documents preprocessing
# Estimated time : 60s-120s for unigram

# Set n for n-gram based on validation/test data
n = 1

preprocessed_docs = [text_preprocessing(d, False, True, True) for d in documents]
preprocessed_docs = [ngrams(word_tokenize(d), n) for d in preprocessed_docs]
w2v_docs = []

for doc in preprocessed_docs:
  w2v_docs.append([" ".join(token) for token in doc])


### Model Implementation

In [63]:
# Word2Vec CBoW Implementation
# Estimated time : 35s for 1-gram, 115s for 2-gram

model = Word2Vec(w2v_docs, vector_size=128, window=5, min_count=1, sg=0)

In [64]:
len(model.wv)

136350

In [65]:
model.wv.most_similar("establish")

[('concoct', 0.8866039514541626),
 ('renew', 0.8839808106422424),
 ('disrupt', 0.8558264970779419),
 ('procure', 0.8526293039321899),
 ('undermine', 0.8491823673248291),
 ('accomplish', 0.8473575115203857),
 ('bolster', 0.8465397357940674),
 ('recreate', 0.8459774851799011),
 ('adapt', 0.8456436395645142),
 ('reclaim', 0.844632625579834)]

### Cosine Similarity Calculation

Define similarity calculation function

In [66]:
def calculate_similarity_b(termA, termB):
  # Get word vector for terms A and B
  try:
    return model.wv.similarity(termA, termB)
  except KeyError:
    # Deal with OOV words
    # Returns 0 instead
    return 0.0

In [67]:
print(calculate_similarity_b("acquire", "OUT"))
print(calculate_similarity_b("tiger", "animal"))

0.0
0.72729


In [68]:
result = []
time_taken = []

for data in test_data.values.tolist():

  # Calculate similarity score and measure time taken
  start = time.time()
  score = calculate_similarity_b(data[1], data[2])
  end = time.time()

  # Append result
  result.append(list((data[0], score)))
  time_taken.append(end - start)

print("Total time taken :", sum(time_taken), "seconds")
print("Result :", result)

Total time taken : 0.005571126937866211 seconds
Result : [[816, 0.69885874], [957, 0.4796449], [809, 0.6196959], [911, 0.55084145], [242, 0.5916214], [697, 0.7486172], [2066, 0.63884914], [2164, 0.35054162], [14, 0.67152107], [51, 0.6204053], [176, 0.5287957], [402, 0.52434343], [169, 0.6352461], [279, 0.5516621], [883, 0.11013872], [966, 0.14834176], [633, 0.45151824], [2026, 0.6547652], [2030, 0.65808785], [2154, 0.53809583], [726, 0.19105278], [2008, 0.766931], [772, 0.35994518], [2081, 0.6686959], [189, 0.6945845], [496, 0.5743444], [24, 0.77655447], [50, 0.50853026], [2018, 0.8716816], [2019, 0.9149811], [121, 0.5318381], [709, 0.2600261], [813, 0.3967045], [977, 0.5082814], [248, 0.83662295], [264, 0.69416666], [7, 0.657032], [27, 0.52517915], [3, 0.6899107], [5, 0.75078964], [203, 0.7974881], [261, 0.26785976], [791, 0.64052016], [989, 0.4991877], [440, 0.48801857], [593, 0.4399482], [819, 0.0], [932, 0.48589712], [19, 0.79745626], [110, 0.42497993], [823, 0.48987156], [842, 0.6

In [69]:
result_df = pd.DataFrame(result)
result_df.to_csv("10812451-Task1-method-b.csv", index=False, header=False)