In [None]:
#If any other libraries need installing, please add them here

%pip install -U sentence-transformers
# %pip install -U sklearn
# %pip install -U gensim
# %pip install -U nltk

import pandas as pd
import string
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec, Phrases
from sentence_transformers import SentenceTransformer
import torch

# Fetch the NLTK resources used by the pre-processing cell below:
# tokenizer models, the English stop-word list and the WordNet data.
nltk.download("punkt")
# NOTE(review): recent NLTK releases load the "punkt_tab" resource in word_tokenize
# instead of the pickled "punkt" models; requesting both keeps the notebook runnable
# on old and new NLTK versions — confirm against the pinned NLTK version.
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

In [None]:
# Only the synopsis column is read from the training corpus.
df_train = pd.read_csv("./data/Training-dataset.csv", usecols=['plot_synopsis'])

# The term-pair files are read with explicit names for their first three columns.
pair_columns = ["id", "word1", "word2"]
df_validation = pd.read_csv(
    "./data/Task-1-validation-dataset.csv",
    usecols=[0, 1, 2],
    names=pair_columns,
)
df_test = pd.read_csv(
    "./data/Task-1-test-dataset1.csv",
    usecols=[0, 1, 2],
    names=pair_columns,
)

# Gather term pairs from the datasets to use for the cosine similarity calculations
# (one [word1, word2] row per pair).
validation_term_pairs = df_validation.loc[:, ["word1", "word2"]].values
test_term_pairs = df_test.loc[:, ["word1", "word2"]].values

<h1>Word2Vec</h1>

In [None]:
#Function pre-processes documents via normalisation, tokenisation, punctuation removal, stop word removal and lemmatization

# Translation table that deletes every character in string.punctuation; built once and reused.
_W2V_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache so the stop-word set and lemmatizer are created once,
# not once per document.
_W2V_RESOURCES = {}


def _w2v_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _W2V_RESOURCES:
        _W2V_RESOURCES["stop_words"] = set(nltk.corpus.stopwords.words('english'))
        _W2V_RESOURCES["lemmatizer"] = nltk.wordnet.WordNetLemmatizer()
    return _W2V_RESOURCES["stop_words"], _W2V_RESOURCES["lemmatizer"]


def w2v_preprocess_text(text):
    """Turn one raw document into a list of processed tokens.

    Steps, in order: lower-case the text, delete punctuation characters,
    tokenise with ``nltk.word_tokenize``, drop English stop words, then
    lemmatise the remaining tokens.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            ``None`` or the NaN pandas uses for a missing cell) is treated
            as an empty document.

    Returns:
        list: The processed tokens; an empty list for non-string input.
    """
    # Missing synopses arrive as NaN/None; return no tokens instead of
    # failing on ``.lower()``.
    if not isinstance(text, str):
        return []

    stop_words, lemmatizer = _w2v_resources()

    # Punctuation is removed before tokenising, so e.g. "don't" becomes "dont".
    text = text.lower().translate(_W2V_PUNCTUATION_TABLE)
    tokens = nltk.word_tokenize(text)

    # Stop words are filtered on the un-lemmatised token, then survivors are lemmatised.
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# Run the pre-processing over every synopsis and keep the result next to the raw text.
processed_synopses = df_train['plot_synopsis'].apply(w2v_preprocess_text)
df_train['plot_synopsis_w2v_processed'] = processed_synopses

# List of lists: each inner list is one document's processed tokens.
documents = processed_synopses.tolist()

In [None]:
# Word2Vec hyper-parameters, kept as separate names so each one is easy to tune.
vector_size, window_size = 200, 1
min_count, workers, epochs = 1, 4, 10

# Train the Word2Vec model on the processed documents with the settings above.
w2v_model = Word2Vec(
    sentences=documents,
    vector_size=vector_size,
    window=window_size,
    min_count=min_count,
    workers=workers,
    epochs=epochs,
)

In [None]:
#Calculates the cosine similarity for each term pair using Word2Vec model

def w2v_cosine_similarity(term1, term2, model, oov_similarity=0.5):
    """Score a term pair with a trained Word2Vec model.

    Args:
        term1: First term of the pair.
        term2: Second term of the pair.
        model: Object exposing ``model.wv.key_to_index`` (vocabulary mapping)
            and ``model.wv.similarity(a, b)``.
        oov_similarity: Score returned when either term is missing from the
            model's vocabulary. Defaults to 0.5.

    Returns:
        The value of ``model.wv.similarity(term1, term2)`` when both terms
        are in the vocabulary, otherwise ``oov_similarity``.
    """
    # Membership is tested against the vocabulary mapping directly; this avoids
    # copying the whole vocabulary into a list and scanning it on every call.
    vocabulary = model.wv.key_to_index
    if term1 in vocabulary and term2 in vocabulary:
        return model.wv.similarity(term1, term2)

    # If either word isn't found in the model, fall back to the neutral score.
    return oov_similarity


# Word2Vec similarity for every validation pair, in row order.
w2v_val_similarity = [
    w2v_cosine_similarity(validation_term_pairs[row][0], validation_term_pairs[row][1], w2v_model)
    for row in range(len(df_validation))
]

# Word2Vec similarity for every test pair, in row order.
w2v_test_similarity = [
    w2v_cosine_similarity(test_term_pairs[row][0], test_term_pairs[row][1], w2v_model)
    for row in range(len(df_test))
]


In [None]:
# Write the Word2Vec validation and test scores to CSV as "id,similarity" rows,
# without a header line or index column.

df_w2v_validation = pd.DataFrame(
    dict(id=df_validation['id'], similarity=w2v_val_similarity)
)
df_w2v_validation.to_csv(
    "10560407-Task1-method-b-validation.csv", header=False, index=False
)

df_w2v_test = pd.DataFrame(
    dict(id=df_test['id'], similarity=w2v_test_similarity)
)
df_w2v_test.to_csv(
    "10560407-Task1-method-b.csv", header=False, index=False
)

 <h1>RoBERTa</h1>

In [None]:
# Load the 'sentence-transformers/all-roberta-large-v1' checkpoint; its encode() method
# is used below to embed each term.
# NOTE(review): presumably downloads the weights on first use — confirm network access.
roberta_model = SentenceTransformer('sentence-transformers/all-roberta-large-v1')

In [None]:
#Calculates the cosine similarity for each term pair using RoBERTa model

def roberta_cosine_similarity(word1, word2, model):
    """Return the cosine similarity between the model's embeddings of two terms.

    Each term is encoded separately with ``model.encode`` and reshaped to a
    single-row matrix, since ``cosine_similarity`` works on 2-D inputs; the
    lone entry of the resulting 1x1 matrix is returned.
    """
    first, second = (model.encode(term).reshape(1, -1) for term in (word1, word2))
    return cosine_similarity(first, second)[0][0]

# RoBERTa similarity for every validation pair, in row order.
roberta_val_similarity = [
    roberta_cosine_similarity(validation_term_pairs[row][0], validation_term_pairs[row][1], roberta_model)
    for row in range(len(df_validation))
]

# RoBERTa similarity for every test pair, in row order.
roberta_test_similarity = [
    roberta_cosine_similarity(test_term_pairs[row][0], test_term_pairs[row][1], roberta_model)
    for row in range(len(df_test))
]

In [None]:
#Creates CSV files from the validation and test results
# (rows are "id,similarity", written without a header line or index column)

df_roberta_validation = pd.DataFrame({
    'id': df_validation['id'],
    'similarity': roberta_val_similarity
})

df_roberta_validation.to_csv("10560407-Task1-method-c-validation.csv", index=False, header=False)

# The test results get their own name so the validation frame above is not
# overwritten (mirrors df_w2v_validation / df_w2v_test in the Word2Vec section).
df_roberta_test = pd.DataFrame({
    'id': df_test['id'],
    'similarity': roberta_test_similarity
})

df_roberta_test.to_csv("10560407-Task1-method-c.csv", index=False, header=False)