# The validation data and the test data for method a (PPMI)

In [19]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import math
import collections
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources used for tokenising ('punkt'), stop-word
# filtering ('stopwords') and lemmatising ('wordnet').
for nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# preprocess text data
def clean_text(input_text):
    """Turn raw text into a list of normalised word tokens.

    The text is lower-cased and tokenised with NLTK's ``word_tokenize``.
    Tokens that are not purely alphabetic, or that appear in NLTK's English
    stop-word list, are dropped; the remaining tokens are lemmatised with
    ``WordNetLemmatizer`` (called without an explicit part of speech).

    Args:
        input_text: The string to preprocess.

    Returns:
        The lemmatised tokens, in their original order.
    """
    words = word_tokenize(input_text.lower())
    english_stop_words = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()
    return [
        wordnet.lemmatize(word)
        for word in words
        if word.isalpha() and word not in english_stop_words
    ]


# Load the training corpus and tokenise every plot synopsis.
training_data_path = './data/Training-dataset.csv'
training_data = pd.read_csv(training_data_path)
training_data['processed_tokens'] = training_data['plot_synopsis'].apply(clean_text)

# Count, for every token, how often each other token occurs within
# `window_size` positions of it: co_occurrence[word][context] -> count.
# The window is symmetric, so every context word is itself a key.
window_size = 1
co_occurrence = collections.defaultdict(collections.Counter)
for plot in training_data['processed_tokens']:
    for i, token in enumerate(plot):
        window_start = max(0, i - window_size)
        window_end = min(len(plot), i + window_size + 1)
        for j in range(window_start, window_end):
            if i != j:
                co_occurrence[token][plot[j]] += 1

# Row totals are needed for every PMI value; computing them once avoids
# re-summing two whole Counters for each (word, context) pair.
marginal_counts = {word: sum(contexts.values()) for word, contexts in co_occurrence.items()}
total_co_occurrences = sum(marginal_counts.values())
vocab_size = len(co_occurrence)
# NOTE(review): this is a dense vocab_size x vocab_size float64 array, so its
# size grows quadratically with the vocabulary; most entries stay zero.
ppmi_matrix = np.zeros((vocab_size, vocab_size))
word_to_index = {word: i for i, word in enumerate(co_occurrence.keys())}
print("Vocabulary size:", vocab_size)

# PPMI(word, context) = max(0, log2(count * total / (row(word) * row(context)))).
for word, contexts in co_occurrence.items():
    row_index = word_to_index[word]
    word_total = marginal_counts[word]
    for context, count in contexts.items():
        pmi = math.log2((count * total_co_occurrences) / (word_total * marginal_counts[context]))
        ppmi_matrix[row_index, word_to_index[context]] = max(pmi, 0)

# To score the validation file instead, enable the three commented lines below
# (it has an additional 'similarity' column) and disable the test-set lines.
#data_path = './data/Task-1-validation-dataset.csv'
#data = pd.read_csv(data_path, header=None)
#data.columns = ['id', 'term1', 'term2', 'similarity']
data_path = './data/Task-1-test-dataset1.csv'
data = pd.read_csv(data_path, header=None)
data.columns = ['id', 'term1', 'term2']

# cosine similarities
# NOTE(review): terms are looked up verbatim, while the vocabulary holds the
# lower-cased, lemmatised output of clean_text; any term not found scores 0.
ppmi_similarities = []
for term1, term2 in zip(data['term1'], data['term2']):
    similarity = 0
    if term1 in word_to_index and term2 in word_to_index:
        vec1 = ppmi_matrix[word_to_index[term1]]
        vec2 = ppmi_matrix[word_to_index[term2]]
        similarity = cosine_similarity([vec1], [vec2])[0][0]
    ppmi_similarities.append(similarity)

# Write "<pair id>,<similarity>" rows with no header and no index column.
ppmi_results_df = pd.DataFrame({'term_pair_id': data['id'],'similarity': ppmi_similarities})
ppmi_results_df.to_csv('10879201-Task1-method-a.csv', index=False, header=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Vocabulary size: 79144


# The validation data and the test data for method b (word2vec)

In [20]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

# Fetch the NLTK resources used for tokenising ('punkt'), stop-word
# filtering ('stopwords') and lemmatising ('wordnet').
for nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# preprocess text data
def clean_text(input_text):
    """Turn raw text into a list of normalised word tokens.

    The text is lower-cased and tokenised with NLTK's ``word_tokenize``.
    Tokens that are not purely alphabetic, or that appear in NLTK's English
    stop-word list, are dropped; the remaining tokens are lemmatised with
    ``WordNetLemmatizer`` (called without an explicit part of speech).

    Args:
        input_text: The string to preprocess.

    Returns:
        The lemmatised tokens, in their original order.
    """
    words = word_tokenize(input_text.lower())
    english_stop_words = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()
    return [
        wordnet.lemmatize(word)
        for word in words
        if word.isalpha() and word not in english_stop_words
    ]

# numpy is needed below for the zero-vector fallback but is not among this
# cell's imports; importing it here lets the cell run on a fresh kernel too.
import numpy as np

# Load the training corpus, tokenise every plot synopsis and train embeddings.
training_data_path = './data/Training-dataset.csv'
training_data = pd.read_csv(training_data_path)
training_data['cleaned_plot'] = training_data['plot_synopsis'].apply(clean_text)
tokenized_plots = training_data['cleaned_plot'].tolist()
word2vec_model = Word2Vec(sentences=tokenized_plots, vector_size=100, window=5, min_count=1, workers=4)

# To score the validation file instead, enable the three commented lines below
# (it has an additional 'similarity' column) and disable the test-set lines.
#data_path = './data/Task-1-validation-dataset.csv'
#data = pd.read_csv(data_path, header=None)
#data.columns = ['id', 'term1', 'term2', 'similarity']
data_path = './data/Task-1-test-dataset1.csv'
data = pd.read_csv(data_path, header=None)
data.columns = ['id', 'term1', 'term2']


def _embedding_or_zeros(term):
    """Return the trained vector for *term*, or an all-zero vector if it is not in the model vocabulary."""
    if term in word2vec_model.wv.key_to_index:
        return word2vec_model.wv[term]
    return np.zeros(word2vec_model.vector_size)


# NOTE(review): terms are looked up verbatim, while the model vocabulary holds
# the lower-cased, lemmatised output of clean_text.
word2vec_similarities = []
for term1, term2 in zip(data['term1'], data['term2']):
    term1_emb = _embedding_or_zeros(term1)
    term2_emb = _embedding_or_zeros(term2)
    # Pairs with an out-of-vocabulary term are compared against a zero vector
    # (expected to give a similarity of 0 - confirm against scikit-learn docs).
    similarity = cosine_similarity([term1_emb], [term2_emb])[0][0]
    word2vec_similarities.append(similarity)

# Save the results to a CSV file
word2vec_results_df = pd.DataFrame({'term_pair_id': data['id'], 'similarity': word2vec_similarities})
word2vec_results_df.to_csv('10879201-Task1-method-b.csv', index=False, header=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
