# Task 1

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# preprocessing
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

# part 1
from sklearn.feature_extraction.text import TfidfVectorizer

# part 2
from gensim.models import Word2Vec

## Import datasets

In [2]:
# Evaluation split selector, read by the loading, reformatting and export cells:
# 'val' loads the validation pairs (which carry an extra 'ranking' column) and
# writes the "-validation" result files; any other value uses the test pairs.
dataset='test'

In [3]:
# training data (includes the plot_synopsis column used to build the corpus)
data = pd.read_csv('data/Training-dataset.csv')

# word pairs to score; the frame is always named `val`, even when it holds the
# test pairs, so the remaining cells work unchanged for either split
val = pd.read_csv(
    'data/Task-1-validation-dataset.csv' if dataset == 'val'
    else 'data/Task-1-test-dataset1.csv'
)

In [4]:
# preview the first rows of the training data
data.head()

Unnamed: 0,ID,title,plot_synopsis,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,8f5203de-b2f8-4c0c-b0c1-835ba92422e9,Si wang ta,"After a recent amount of challenges, Billy Lo ...",0,0,0,0,1,1,0,0,1
1,6416fe15-6f8a-41d4-8a78-3e8f120781c7,Shattered Vengeance,"In the crime-ridden city of Tremont, renowned ...",0,0,0,0,1,1,1,0,1
2,4979fe9a-0518-41cc-b85f-f364c91053ca,L'esorciccio,Lankester Merrin is a veteran Catholic priest ...,0,1,0,0,0,0,0,0,0
3,b672850b-a1d9-44ed-9cff-025ee8b61e6f,Serendipity Through Seasons,"""Serendipity Through Seasons"" is a heartwarmin...",0,0,0,0,0,0,1,0,0
4,b4d8e8cc-a53e-48f8-be6a-6432b928a56d,The Liability,"Young and naive 19-year-old slacker, Adam (Jac...",0,0,1,0,0,0,0,0,0


In [5]:
# reformat val dataframe
# The pair file has no header row, so pandas consumed its first record as the
# column labels. Push that record back in as row 0 and give the columns
# proper names.
# NOTE(review): the recovered first record consists of strings (hence the int
# cast of 'id' in the next cell); reading the CSV with header=None and
# names=... in the loading cell would avoid this round trip entirely.
pair_columns = ['id', 'term1', 'term2', 'ranking'] if dataset == 'val' else ['id', 'term1', 'term2']

# Repair the frame only once: without this guard, re-running the cell would
# insert the already-correct column names as a bogus first data row.
if val.columns.tolist() != pair_columns:
    record = pd.DataFrame([val.columns.tolist()], columns=val.columns)
    val = pd.concat([record, val], ignore_index=True)
    val.columns = pair_columns

val.head()

Unnamed: 0,id,term1,term2
0,816,accept,acknowledge
1,957,accept,recommend
2,809,agree,argue
3,911,agree,please
4,242,alcohol,cocktail


In [6]:
# the first id was recovered from the column labels as a string, so cast the
# whole 'id' column to int
val = val.astype({'id': int})


## Tokenisation and Preprocessing

In [7]:
# Shared NLTK resources, filled on first use so they are built once instead of
# once per processed document.
_TEXT_TOOLS = {}


def _text_tools():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first call."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['lemmatizer']


def processText(text):
    """Normalise a raw text before vectorisation.

    Steps, in order: case folding, whitespace split, English stop-word
    removal, WordNet lemmatization, re-joining with single spaces.

    Parameters
    ----------
    text : str
        Raw text. A missing value (``None`` or a float NaN, as pandas uses
        for empty cells) is treated as empty text.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.

    Notes
    -----
    Tokens are split on whitespace only, so punctuation stays attached to
    words; a token such as ``"the,"`` therefore does not match the stop list.
    """
    # missing synopsis: nothing to process (NaN is the only float with x != x)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    stop_words, lemmatizer = _text_tools()

    # case folding -> stop-word removal -> lemmatization, in a single pass
    words = [
        lemmatizer.lemmatize(word)
        for word in text.lower().split()
        if word not in stop_words
    ]
    return ' '.join(words)

In [8]:
# run every synopsis through the preprocessing function
documents = [processText(synopsis) for synopsis in data['plot_synopsis']]

# flatten the corpus into one token list per sentence: split each processed
# document into sentences, then split each sentence into words
sentences = [
    word_tokenize(sentence)
    for document_sentences in map(sent_tokenize, documents)
    for sentence in document_sentences
]

## Sparse Vector Representation 1: VSM

### compute vector representation

In [9]:
# create the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# fit the vectorizer to the documents and transform them into tf-idf vectors
# (rows = documents, columns = vocabulary terms).
# The matrix is kept sparse: calling .toarray() here would allocate a dense
# documents x vocabulary float array (8257 x 88176, roughly 5.8 GB for the run
# shown below). CSC layout makes the per-term column slices used by the
# similarity cell cheap; slicing, reshape and cosine_similarity all accept
# the sparse matrix.
tfidf_vectors = tfidf_vectorizer.fit_transform(documents).tocsc()

# get the vocab (term -> column index in tfidf_vectors)
vocab = tfidf_vectorizer.vocabulary_

print(f'tfidf vectors shape: {tfidf_vectors.shape}')

tfidf vectors shape: (8257, 88176)


### compute similarity

In [10]:
def _tfidf_term_similarity(term1, term2):
    """Cosine similarity of two terms' TF-IDF columns; 0.0 if either term is not in vocab."""
    first_index, second_index = vocab.get(term1), vocab.get(term2)
    if first_index is None or second_index is None:
        return 0.0

    # a term's vector is its column across all documents, laid out as one row
    first_vector = tfidf_vectors[:, first_index].reshape(1, -1)
    second_vector = tfidf_vectors[:, second_index].reshape(1, -1)
    return float(cosine_similarity(first_vector, second_vector)[0][0])


# score every (term1, term2) pair and store the result as a float column
val['similarity1'] = pd.Series(
    [_tfidf_term_similarity(term1, term2) for term1, term2 in zip(val['term1'], val['term2'])],
    index=val.index,
    dtype=float,
)

In [11]:
# preview the pairs with their TF-IDF based similarity
val.head()

Unnamed: 0,id,term1,term2,similarity1
0,816,accept,acknowledge,0.042112
1,957,accept,recommend,0.041259
2,809,agree,argue,0.06119
3,911,agree,please,0.029395
4,242,alcohol,cocktail,0.030637


## Dense Vector Representation 2: word2vec

### compute vector representation

In [12]:
# train word2vec on the tokenised sentences (gensim's default architecture, CBOW);
# min_count=1 keeps every token in the vocabulary, window=5 is the context size
cbow_model = Word2Vec(sentences=sentences, window=5, min_count=1)

### compute similarity

In [13]:
def _w2v_term_similarity(term1, term2):
    """word2vec similarity of two terms; 0.0 if either term is missing from the model's vocab."""
    known_terms = cbow_model.wv.key_to_index
    if term1 not in known_terms or term2 not in known_terms:
        return 0.0
    return float(cbow_model.wv.similarity(term1, term2))


# score every (term1, term2) pair and store the result as a float column
val['similarity2'] = pd.Series(
    [_w2v_term_similarity(term1, term2) for term1, term2 in zip(val['term1'], val['term2'])],
    index=val.index,
    dtype=float,
)

## Results

In [14]:
# compare both similarity scores side by side for the first ten pairs
val.head(10)

Unnamed: 0,id,term1,term2,similarity1,similarity2
0,816,accept,acknowledge,0.042112,0.70306
1,957,accept,recommend,0.041259,0.585162
2,809,agree,argue,0.06119,0.593694
3,911,agree,please,0.029395,0.396377
4,242,alcohol,cocktail,0.030637,0.506321
5,697,alcohol,wine,0.006492,0.715401
6,2066,announcement,news,0.035266,0.657762
7,2164,announcement,effort,0.010059,0.372791
8,14,bad,terrible,0.033781,0.633458
9,51,bad,great,0.065086,0.422489


In [15]:
# results from first vector representation

# keep only the pair id and its TF-IDF similarity
results1 = val.loc[:, ['id', 'similarity1']].copy()

# written without header or index; the validation run gets its own file name
results1.to_csv(
    'data/10697685-Task1-method-a-validation.csv' if dataset == 'val'
    else 'data/10697685-Task1-method-a.csv',
    index=False,
    header=False,
)

In [16]:
# results from second vector representation

# keep only the pair id and its word2vec similarity
results2 = val.loc[:, ['id', 'similarity2']].copy()

# written without header or index; the validation run gets its own file name
results2.to_csv(
    'data/10697685-Task1-method-b-validation.csv' if dataset == 'val'
    else 'data/10697685-Task1-method-b.csv',
    index=False,
    header=False,
)