In [82]:
# !pip install pandas scikit-learn scipy
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
import gensim.downloader as api
import time
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [60]:
df = pd.read_csv('./data/Training-dataset.csv')
val_df = pd.read_csv('./data/Task-1-validation-dataset.csv', header=None, index_col=None)
test_df = pd.read_csv('./data/Task-1-test-dataset1.csv', header=None, index_col=None)

## OPTION A - TF*IDF

In [61]:
ps = PorterStemmer()
def preprocess(text):
  return ' '.join([ps.stem(word) for word in text.lower().split()])
documents = df['plot_synopsis'].tolist()
# print(documents)

In [62]:
#a) sparse representation (BoW tf*idf)
#TF-IDF vectorizer (countvec + transform the BoW rep)
tfidfvectorizer = TfidfVectorizer(preprocessor=preprocess, analyzer='word') #some stop words are to be compared
tdidf_mtx = tfidfvectorizer.fit_transform(documents)
print(tdidf_mtx.shape)

(8257, 88460)


In [63]:
#embedding for OOV words
w2v_model = api.load("word2vec-google-news-300")

# Validation

In [83]:
#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
# cosine_sim = cosine_similarity(tdidf_mtx, tdidf_mtx)

result_df = pd.DataFrame(columns=['ID', 'Similarity'])
prediction_df = '10864332-Task1-method-a-validation.csv'
# df.apply(cos_sim, result_type='expand', axis=1) #use function instead

for index, row in val_df.iterrows():
  ID, word1, word2 = row[0], row[1], row[2]
  if word1 in tfidfvectorizer.vocabulary_ and word2 in tfidfvectorizer.vocabulary_:
    #get indices of the words
    word1_index = tfidfvectorizer.vocabulary_[word1]
    word2_index = tfidfvectorizer.vocabulary_[word2]
    #get vectors of the words' indices
    vector_word1 = tdidf_mtx[:, word1_index].reshape(1, -1)
    vector_word2 = tdidf_mtx[:, word2_index].reshape(1, -1)

    cosine_similarity_score = cosine_similarity(vector_word1, vector_word2)[0][0]
  else:
    #1 or both words OOV
    vector_word1 = w2v_model[word1] if word1 in w2v_model else np.zeros((300,))
    vector_word2 = w2v_model[word2] if word2 in w2v_model else np.zeros((300,))

    cosine_similarity_score = cosine_similarity(vector_word1.reshape(1, -1), vector_word2.reshape(1, -1))[0][0]

  result_df = pd.concat([result_df, pd.DataFrame([[ID, cosine_similarity_score]], columns=['ID', 'Similarity'])], ignore_index=True)

result_df.to_csv(prediction_df, index=False, header=False)


In [65]:
print(len(tfidfvectorizer.vocabulary_)) #size of representation

88460


In [66]:
!python task1_eval_script_student_version.py 10864332-Task1-method-a-validation.csv ./data/Task-1-validation-dataset.csv

The following simalarity scores may need checking:
(achieve,accomplish) similarity score: 0.0, gold ranking: 8.57
(achieve,try) similarity score: 0.0, gold ranking: 4.42
----------------------------
(acquire,get) similarity score: 0.00128361456672418, gold ranking: 8.82
(acquire,find) similarity score: 0.0023240677055002957, gold ranking: 6.38
----------------------------
(acquire,obtain) similarity score: 0.0, gold ranking: 8.57
(acquire,find) similarity score: 0.0023240677055002957, gold ranking: 6.38
----------------------------
(apple,sauce) similarity score: 0.0, gold ranking: 1.43
(apple,sunshine) similarity score: 0.0, gold ranking: 0.58
----------------------------
(apple,lemon) similarity score: 0.0, gold ranking: 4.05
(apple,sunshine) similarity score: 0.0, gold ranking: 0.58
----------------------------
(arm,shoulder) similarity score: 0.09339897690332263, gold ranking: 4.85
(arm,body) similarity score: 0.11874327787870616, gold ranking: 4.05
----------------------------
(ar

# Test

In [84]:
test_result_df = pd.DataFrame(columns=['ID', 'Similarity'])
prediction_df = '10864332-Task1-method-a.csv'

start = time.time()
for index, row in test_df.iterrows():
  ID, word1, word2 = row[0], row[1], row[2]
  if word1 in tfidfvectorizer.vocabulary_ and word2 in tfidfvectorizer.vocabulary_:

    word1_index = tfidfvectorizer.vocabulary_[word1]
    word2_index = tfidfvectorizer.vocabulary_[word2]

    vector_word1 = tdidf_mtx[:, word1_index].reshape(1, -1)
    vector_word2 = tdidf_mtx[:, word2_index].reshape(1, -1)

    cosine_similarity_score = cosine_similarity(vector_word1, vector_word2)[0][0]
  else:
    #1 or both words OOV
    vector_word1 = w2v_model[word1] if word1 in w2v_model else np.zeros((300,))
    vector_word2 = w2v_model[word2] if word2 in w2v_model else np.zeros((300,))

    cosine_similarity_score = cosine_similarity(vector_word1.reshape(1, -1), vector_word2.reshape(1, -1))[0][0]

  test_result_df = pd.concat([test_result_df, pd.DataFrame([[ID, cosine_similarity_score]], columns=['ID', 'Similarity'])], ignore_index=True)
end = time.time()

print(f"{end-start}s to process test data")
test_result_df.to_csv(prediction_df, index=False, header=False)


3.1268105506896973s to process test data


In [68]:
# !python task1_eval_script_student_version.py 10864332-Task-1-method-a.csv ./data/Task-1-test-dataset1.csv
# DON'T RUN eval_script on tests

## OPTION B - WORD2VEC

In [69]:
#b) dense static representation (word2vec)
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [70]:
# tokenized_documents = [word_tokenize(doc.lower()) for doc in documents] #tokenizes punctuation as well
tokenized_documents = [gensim.utils.simple_preprocess(doc.lower()) for doc in documents] #doesn't tokenize punct & symbols

In [71]:
#model params
model_w2v = Word2Vec(window=5, min_count=2, workers=4, sg=1)
model_w2v.build_vocab(tokenized_documents, progress_per=1000)
model_w2v.train(tokenized_documents, total_examples=model_w2v.corpus_count, epochs=model_w2v.epochs)

(25904042, 34003590)

In [72]:
print(model_w2v.vector_size)

100


# Validation

In [79]:
result_df = pd.DataFrame(columns=['ID', 'Similarity'])
prediction_df = '10864332-Task1-method-b-validation.csv'

for index, row in val_df.iterrows():
  ID, word1, word2 = row[0], row[1], row[2]
  if word1 in model_w2v.wv.key_to_index and word2 in model_w2v.wv.key_to_index:
    cosine_similarity = model_w2v.wv.similarity(word1, word2)
  else:
    vector_word1 = w2v_model[word1] if word1 in w2v_model else np.zeros((300,))
    vector_word2 = w2v_model[word2] if word2 in w2v_model else np.zeros((300,))
  result_df = pd.concat([result_df, pd.DataFrame([[ID, cosine_similarity]], columns=['ID', 'Similarity'])], ignore_index=True)

result_df.to_csv(prediction_df, index=False, header=False)
#consider multi-words

In [80]:
!python task1_eval_script_student_version.py 10864332-Task1-method-b-validation.csv ./data/Task-1-validation-dataset.csv

The following simalarity scores may need checking:
(absorb,learn) similarity score: 0.3099422, gold ranking: 5.48
(absorb,withdraw) similarity score: 0.534273, gold ranking: 2.97
----------------------------
(acquire,get) similarity score: 0.5361132, gold ranking: 8.82
(acquire,obtain) similarity score: 0.7717477, gold ranking: 8.57
----------------------------
(arm,body) similarity score: 0.48774445, gold ranking: 4.05
(arm,vein) similarity score: 0.57491076, gold ranking: 3.65
----------------------------
(arm,body) similarity score: 0.48774445, gold ranking: 4.05
(arm,knee) similarity score: 0.68754923, gold ranking: 2.75
----------------------------
(arm,body) similarity score: 0.48774445, gold ranking: 4.05
(arm,bone) similarity score: 0.540126, gold ranking: 3.83
----------------------------
(arm,body) similarity score: 0.48774445, gold ranking: 4.05
(arm,neck) similarity score: 0.74894786, gold ranking: 1.58
----------------------------
(arm,vein) similarity score: 0.57491076, g

# Test

In [77]:
result_df = pd.DataFrame(columns=['ID', 'Similarity'])
prediction_df = '10864332-Task1-method-b.csv'

start = time.time()

for index, row in test_df.iterrows():
  ID, word1, word2 = row[0], row[1], row[2]
  if word1 in model_w2v.wv.key_to_index and word2 in model_w2v.wv.key_to_index:
    cosine_similarity = model_w2v.wv.similarity(word1, word2)
  else:
    vector_word1 = w2v_model[word1] if word1 in w2v_model else np.zeros((300,))
    vector_word2 = w2v_model[word2] if word2 in w2v_model else np.zeros((300,))
  result_df = pd.concat([result_df, pd.DataFrame([[ID, cosine_similarity]], columns=['ID', 'Similarity'])], ignore_index=True)

end = time.time()
print(f"{end-start}s to process test data")

result_df.to_csv(prediction_df, index=False, header=False)

0.09110665321350098s to process test data


In [76]:
# !python task1_eval_script_student_version.py 10864332-Task1-method-b.csv ./data/Task-1-test-dataset1.csv
# DON'T RUN eval_script on tests