# Task 1: Distributional semantics

## Preprocessing data for both Representations

In [1]:
!pip install gensim

from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import math
import collections
from collections import defaultdict, Counter

# Fetch the NLTK resources required for preprocessing:
# tokenizer models, the stop-word lists and the WordNet data for the lemmatizer
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Shared English stop-word set and lemmatizer instance used by preprocess_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    """Normalize raw text into a list of lemmatized, stop-word-free tokens.

    Steps: lowercase, delete ASCII punctuation, tokenize with NLTK, drop
    English stop words, lemmatize the remaining tokens.

    Args:
        text: Raw text. Any non-string value (e.g. None, or the float NaN that
            pandas produces for an empty CSV cell) is treated as missing.

    Returns:
        list[str]: The processed tokens; an empty list for missing input.
    """
    if not isinstance(text, str):
        # A missing value has no .lower(); yield no tokens instead of raising AttributeError
        return []

    text = text.lower()
    # Punctuation characters are deleted, not replaced by a space ("don't" -> "dont")
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    # Stop words are filtered on the surface form, i.e. before lemmatization
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]


# Load the training corpus
train_data = pd.read_csv('./data/Training-dataset.csv')

# Select which term-pair file is scored. Both files are read with header=None,
# so their column names are assigned by hand.
#   True  -> test set (term pairs only)
#   False -> validation set (additionally carries the gold-standard similarity)
USE_TEST_SET = True

if USE_TEST_SET:
    validation_data = pd.read_csv('./data/Task-1-test-dataset1.csv', header=None)
    validation_data.columns = ['term_pair_id', 'term1', 'term2']
else:
    validation_data = pd.read_csv('./data/Task-1-validation-dataset.csv', header=None)
    validation_data.columns = ['term_pair_id', 'term1', 'term2', 'gold_standard_similarity']

# Preprocess the text in the 'plot_synopsis' column into token lists
train_data['processed_plot_synopsis'] = train_data['plot_synopsis'].apply(preprocess_text)


# At this point the training data is ready for creating representations




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 1st representation (option A): A sparse representation with PPMI


In [2]:
# A function to build a co-occurrence matrix
def build_matrix(token_lists, window=1):
    """Count how often each token occurs within `window` positions of another.

    Args:
        token_lists: Iterable of token sequences, one per document. Context
            windows never cross from one sequence into the next.
        window: Number of neighbours taken on each side of the centre token.
            Must be >= 0.

    Returns:
        defaultdict(Counter): ``result[token][context]`` is the number of times
        ``context`` appeared inside the window around ``token``. Every token
        encountered gets an entry, even when it never had a neighbour.

    Raises:
        ValueError: If ``window`` is negative.
    """
    if window < 0:
        # Without this check a negative window only surfaced as the opaque
        # "list.remove(x): x not in list" raised deep inside the loop.
        raise ValueError(f"window must be non-negative, got {window}")

    cooccurrence_dict = defaultdict(Counter)
    for token_list in token_lists:
        for index, token in enumerate(token_list):
            # Slices clamp at the end of the sequence by themselves; max() guards
            # the start because a negative slice bound would wrap around.
            left_context = token_list[max(0, index - window):index]
            right_context = token_list[index + 1:index + window + 1]

            counts = cooccurrence_dict[token]  # lookup also registers neighbour-less tokens
            counts.update(left_context)
            counts.update(right_context)

    return cooccurrence_dict

# A function to calculate the PPMI matrix
def ppmi_matrix_calc(cooccurrence_dict):
    """Convert co-occurrence counts into a dense PPMI matrix.

    For a word w and a context c the value is
    ``max(0, log2(count(w, c) * N / (count(w) * count(c))))`` where ``count(x)``
    is the sum of x's context counts and ``N`` is the sum over all words.

    Args:
        cooccurrence_dict: Mapping ``word -> {context_word: count}``. Every
            context word must itself be a key of the mapping.

    Returns:
        tuple: ``(ppmi_mat, word_to_idx)``. ``ppmi_mat`` is a square NumPy
        array with one row and one column per key; ``word_to_idx`` maps each
        key to its row/column index (in key iteration order).

    Note:
        The matrix is dense, so its size grows with the square of the
        vocabulary size.
    """
    vocabulary = list(cooccurrence_dict)
    word_to_idx = {word: position for position, word in enumerate(vocabulary)}

    # Marginal count of a word = total of all its context counts
    freq_word = {word: sum(cooccurrence_dict[word].values()) for word in vocabulary}
    all_cooccurrences = sum(freq_word.values())

    size = len(vocabulary)
    ppmi_mat = np.zeros((size, size))

    for word, row in word_to_idx.items():
        marginal = freq_word[word]
        for context_word, cooccurrence_count in cooccurrence_dict[word].items():
            ratio = (cooccurrence_count * all_cooccurrences) / (marginal * freq_word[context_word])
            pointwise_info = math.log2(ratio)
            # Only positive PMI is stored; everything else stays at the initial zero
            if pointwise_info > 0:
                ppmi_mat[row, word_to_idx[context_word]] = pointwise_info

    return ppmi_mat, word_to_idx

# A function to generate a PPMI vector for a term (could be single or multi-word)
def get_ppmi_vector(term, ppmi_mat, word_to_idx):
    """Represent a single- or multi-word term as the mean of its words' PPMI rows.

    The term is split on whitespace and every piece is looked up verbatim in
    ``word_to_idx``; pieces that are not in the vocabulary are skipped.

    NOTE(review): lookups are exact-match. In this notebook the vocabulary is
    built from preprocess_text output (lowercased, lemmatized), so raw terms
    containing capitals or inflected forms would be skipped -- confirm the
    term files are already normalized.

    Args:
        term: The term string.
        ppmi_mat: 2-D PPMI array whose rows are word vectors.
        word_to_idx: Mapping from word to row index in ``ppmi_mat``.

    Returns:
        numpy.ndarray: The averaged row vector, or a zero vector with one entry
        per matrix column when no piece of the term is in the vocabulary.
    """
    rows = []
    for piece in term.split():
        if piece in word_to_idx:
            rows.append(ppmi_mat[word_to_idx[piece]])

    if not rows:
        return np.zeros((ppmi_mat.shape[1],))
    return np.mean(rows, axis=0)

# a function to calculate cosine similarity between two terms using PPMI to represent context
def cosine_similarity_ppmi_multi(term1, term2, ppmi_mat, word_to_idx):
    """Cosine similarity of two terms, each represented by its PPMI vector.

    Both terms are turned into vectors with get_ppmi_vector and compared with
    scikit-learn's cosine_similarity.

    Args:
        term1: First term (single or multi-word).
        term2: Second term (single or multi-word).
        ppmi_mat: PPMI matrix passed through to get_ppmi_vector.
        word_to_idx: Word-to-row mapping passed through to get_ppmi_vector.

    Returns:
        The single entry of the 1x1 similarity matrix.
    """
    first_vector, second_vector = (
        get_ppmi_vector(term, ppmi_mat, word_to_idx) for term in (term1, term2)
    )
    # cosine_similarity expects 2-D inputs, hence the one-row wrappers
    similarity_matrix = cosine_similarity([first_vector], [second_vector])
    return similarity_matrix[0][0]


# Build the co-occurrence counts and the PPMI matrix from the preprocessed synopses
cooccurrence_dict = build_matrix(train_data['processed_plot_synopsis'].tolist())
ppmi_mat, word_to_idx = ppmi_matrix_calc(cooccurrence_dict)


# Score every term pair of the loaded term-pair dataset with PPMI-based cosine similarity
cosine_similarity_ppmi = [
    cosine_similarity_ppmi_multi(first_term, second_term, ppmi_mat, word_to_idx)
    for first_term, second_term in zip(validation_data['term1'], validation_data['term2'])
]

# Write one "term_pair_id,similarity" row per pair, without header or index column
cosine_similarity_ppmi_result = validation_data[['term_pair_id']].assign(
    similarity=cosine_similarity_ppmi
)
cosine_similarity_ppmi_result.to_csv('./data/10928627-Task1-method-a.csv', index=False, header=False)

In [3]:
# Report how many distinct words ended up in the PPMI vocabulary (label and number on separate lines)
print("vocabulary size:", len(word_to_idx), sep="\n")

vocabulary size:
127527


## 2nd representation (option B): A dense static representation with Word2Vec

In [4]:
# The synopses are already token lists; collect them as the "sentences" Word2Vec trains on
tokenize = list(train_data['processed_plot_synopsis'])

# Train the Word2Vec model. No `sg` argument is passed, so gensim's default
# training algorithm (CBOW) is used.
model_trained = Word2Vec(
    sentences=tokenize,
    vector_size=100,  # dimensionality of the word vectors
    window=5,         # maximum distance between the current and the predicted word
    min_count=1,      # words rarer than this are ignored; 1 keeps every word
    workers=4,        # number of training threads
)

# Vector representation for multi-word terms using Word2Vec
def vector_for_multiword_word2vec(term, representation):
    """Represent a single- or multi-word term as the mean of its word vectors.

    The term is split on whitespace; each piece found in
    ``representation.wv.key_to_index`` contributes ``representation.wv[piece]``,
    and pieces outside the vocabulary are skipped.

    Args:
        term: The term string.
        representation: Trained model exposing ``wv`` and ``vector_size``.

    Returns:
        numpy.ndarray: The averaged vector, or a zero vector of length
        ``representation.vector_size`` when no piece is in the vocabulary.
    """
    found_vectors = []
    for piece in term.split():
        if piece in representation.wv.key_to_index:
            found_vectors.append(representation.wv[piece])

    if found_vectors:
        return np.mean(found_vectors, axis=0)
    return np.zeros((representation.vector_size,))

# A function calculating cosine similarity between two terms using Word2Vec representations
def cosine_similarity_word2vec_multi(term1, term2, representation):
    """Cosine similarity of two terms, each represented by its Word2Vec vector.

    Both terms are turned into vectors with vector_for_multiword_word2vec and
    compared with scikit-learn's cosine_similarity.

    Args:
        term1: First term (single or multi-word).
        term2: Second term (single or multi-word).
        representation: Trained model passed through to
            vector_for_multiword_word2vec.

    Returns:
        The single entry of the 1x1 similarity matrix.
    """
    first_vector, second_vector = (
        vector_for_multiword_word2vec(term, representation) for term in (term1, term2)
    )
    # cosine_similarity expects 2-D inputs, hence the one-row wrappers
    similarity_matrix = cosine_similarity([first_vector], [second_vector])
    return similarity_matrix[0][0]


# Score every term pair of the loaded term-pair dataset with Word2Vec-based cosine similarity
cosine_similarity_word2vec = [
    cosine_similarity_word2vec_multi(first_term, second_term, model_trained)
    for first_term, second_term in zip(validation_data['term1'], validation_data['term2'])
]

# Write one "term_pair_id,similarity" row per pair, without header or index column
cosine_similarity_word2vec_result = validation_data[['term_pair_id']].assign(
    similarity=cosine_similarity_word2vec
)
cosine_similarity_word2vec_result.to_csv('./data/10928627-Task1-method-b.csv', index=False, header=False)

