# Import Necessary Libraries

In [None]:
import pandas as pd
import time
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
import string
from gensim.models import Word2Vec
import numpy as np
from collections import defaultdict
from IPython.utils.capture import capture_output
from math import log2
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK data packages this notebook relies on
for _nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_package)

print("Imported all necessary libraries successfully")

Imported all necessary libraries successfully


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data Processing (Cleaning & Preprocessing)




In [None]:
def preprocess_text_sentence_level(text, remove_stopwords=True, lemmatize=True, stem=False):
    """Split ``text`` into sentences and return one list of cleaned tokens per sentence.

    Every sentence is word-tokenized, tokens that are not purely alphanumeric
    (e.g. punctuation) are discarded and the remaining ones are lower-cased.
    The optional steps are then applied in this order: stop-word removal,
    lemmatization, stemming.

    Args:
        text: Raw text to process.
        remove_stopwords: Drop tokens that appear in NLTK's English stop-word list.
        lemmatize: Lemmatize each token with ``WordNetLemmatizer`` (the literal
            token ``'media'`` is left untouched).
        stem: Stem each token with the English Snowball stemmer.

    Returns:
        A list with one entry per sentence; each entry is that sentence's list
        of processed tokens (which may be empty).
    """
    sentences = sent_tokenize(text)

    # Tools shared by all sentences of this text
    english_stopwords = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()
    snowball_stemmer = SnowballStemmer("english")

    def clean(sentence):
        # Keep alphanumeric tokens only (this removes punctuation), lower-cased
        words = []
        for raw_token in word_tokenize(sentence):
            if raw_token.isalnum():
                words.append(raw_token.lower())

        if remove_stopwords:
            words = [word for word in words if word not in english_stopwords]

        if lemmatize:
            # The lemmatizer's default part of speech is noun, which turns 'media'
            # into 'medium'; keep 'media' as is so the word stays in the corpus.
            words = [word if word == 'media' else wordnet_lemmatizer.lemmatize(word) for word in words]

        if stem:
            words = [snowball_stemmer.stem(word) for word in words]

        return words

    return [clean(sentence) for sentence in sentences]


# Data Preparation

### Training Data

In [None]:
# Read the training set
training_data_path = './data/Training-dataset.csv'
training_data = pd.read_csv(training_data_path)

# Turn every plot synopsis into a list of tokenized sentences
training_data['processed_plot_synopsis'] = training_data['plot_synopsis'].apply(
    preprocess_text_sentence_level, stem=False
)

# Word2Vec is fed a flat list of tokenized sentences, so concatenate the per-synopsis lists
synopses = []
for _sentence_lists in training_data['processed_plot_synopsis'].tolist():
    synopses.extend(_sentence_lists)

In [None]:
# Flatten all tokenized sentences into one token list
all_tokens = []
for _sentence in synopses:
    all_tokens.extend(_sentence)
# Distinct tokens = vocabulary of the training corpus
unique_tokens = set(all_tokens)
# Show the vocabulary size
print(len(unique_tokens))

80437


### Validation Data

In [None]:
# Load the validation dataset, assuming no header in the CSV.
# With header=None pandas labels the columns 0, 1, 2, ...; the prediction
# helpers below read the two words of each pair from columns 1 and 2.
validation_dataset_path = './data/Task-1-validation-dataset.csv'
validation_data = pd.read_csv(validation_dataset_path, header=None)

### Testing Data

In [None]:
# Load the testing dataset, assuming no header in the CSV.
# With header=None pandas labels the columns 0, 1, 2, ...; the prediction
# helpers below read the two words of each pair from columns 1 and 2.
testing_dataset_path = './data/Task-1-test-dataset1.csv'
testing_data = pd.read_csv(testing_dataset_path, header=None)

# Method A : PPMI

The following code uses dictionaries instead of matrices because Colab does not have enough RAM to hold a matrix as large and sparse as 80k x 80k. A dictionary avoids this problem: it offers fast retrieval and behaves like the matrix, but only the entries that actually occur are stored rather than every cell.

In [62]:
def build_co_occurrence_dict(sentences, window_size=2, smoothness = 0):
    """Count how often each ordered (target, context) word pair co-occurs within a window.

    Args:
        sentences: Tokenized sentences (each a sequence of words). The collection
            is traversed twice, so pass a list rather than a one-shot generator.
        window_size: Number of neighbouring words considered on each side of the
            target word; the window never crosses a sentence boundary.
        smoothness: Starting value of every pair that is observed at least once,
            i.e. each stored value is ``smoothness`` + observed count. Pairs that
            never co-occur are not stored at all, so they are not smoothed.

    Returns:
        A tuple ``(co_occurrence_dict, word_to_id)``: a plain dict mapping
        ``(target_id, context_id)`` to its count, and the mapping from each word
        to its integer id.
    """
    # Assign ids in order of first appearance. Enumerating a set of strings would
    # give a different id assignment on every kernel start (string hashing is
    # randomized), which makes the returned dictionaries non-reproducible.
    word_to_id = {}
    for sentence in sentences:
        for word in sentence:
            word_to_id.setdefault(word, len(word_to_id))

    # Unseen pairs start at `smoothness` the first time they are incremented
    co_occurrence_dict = defaultdict(lambda: smoothness)

    for sentence in tqdm(sentences, desc='Building Co-occurrence Dictionary'):
        # Resolve ids once per sentence instead of once per context lookup
        sentence_ids = [word_to_id[word] for word in sentence]

        for i, target_id in enumerate(sentence_ids):
            # Context = up to `window_size` ids before and after position i
            start = max(i - window_size, 0)
            end = min(i + window_size + 1, len(sentence_ids))
            context_ids = sentence_ids[start:i] + sentence_ids[i + 1:end]

            for context_id in context_ids:
                # A word is never counted as its own context
                if target_id != context_id:
                    co_occurrence_dict[(target_id, context_id)] += 1

    # Return a plain dict so later lookups of missing pairs do not create entries
    return dict(co_occurrence_dict), word_to_id

# Build the counts used by the PPMI cells below: one word of context on each side, no smoothing
co_occurrence_dict, word_to_id = build_co_occurrence_dict(synopses, window_size=1, smoothness = 0)

Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [01:28<00:00, 3826.37it/s]


In [63]:
def calculate_ppmi(co_occurrence_dict, word_to_id, total_occurrences, word_freq, context_freq):
    """Convert co-occurrence counts into positive pointwise mutual information (PPMI).

    For every stored pair the score is ``max(0, log2(P(w, c) / (P(w) * P(c))))``,
    where each probability is the corresponding count divided by
    ``total_occurrences``.

    Args:
        co_occurrence_dict: Maps ``(word_id, context_id)`` to a co-occurrence count.
        word_to_id: Not used by this function.
        total_occurrences: Denominator used for all three probabilities.
        word_freq: Sequence indexed by word id giving the word's total count.
        context_freq: Sequence indexed by context id giving the context's total count.

    Returns:
        A dict with the same keys as ``co_occurrence_dict``; the value is the
        PPMI score, or 0 when the PMI is not positive.
    """
    scores = {}

    pairs = tqdm(co_occurrence_dict.items(), desc="Calculating PPMI")
    for pair, pair_count in pairs:
        target_id, neighbour_id = pair

        joint = pair_count / total_occurrences
        p_target = word_freq[target_id] / total_occurrences
        p_neighbour = context_freq[neighbour_id] / total_occurrences

        # log2 is undefined at 0, so a pair with no probability mass scores 0
        if joint > 0:
            pmi = log2(joint / (p_target * p_neighbour))
        else:
            pmi = 0

        # Clip negative associations to 0 (the "positive" in PPMI)
        scores[pair] = max(0, pmi)

    return scores

In [64]:
# Calculate total_occurrences, word_freq, and context_freq from co_occurrence_dict.
# total_occurrences (the sum of all stored pair counts) is the denominator of every
# probability computed in calculate_ppmi.
total_occurrences = sum(co_occurrence_dict.values())
vocab_size = len(word_to_id)
# One slot per word id; filled by the loop below
word_freq = [0] * vocab_size
context_freq = [0] * vocab_size

# Accumulate word and context frequencies: each pair count is added to the total
# of its target word and to the total of its context word
for (word_id, context_id), count in co_occurrence_dict.items():
    word_freq[word_id] += count
    context_freq[context_id] += count

# Now calculate PPMI values
ppmi_dict = calculate_ppmi(co_occurrence_dict, word_to_id, total_occurrences, word_freq, context_freq)

Calculating PPMI: 100%|██████████| 3987350/3987350 [00:07<00:00, 564690.33it/s]


In [70]:
def calculate_similarity_ppmi(word1, word2):
    """Cosine similarity between the PPMI context vectors of two words.

    Reads the notebook-level ``word_to_id`` and ``ppmi_dict``.

    Args:
        word1: First word, looked up in ``word_to_id`` exactly as given.
        word2: Second word, looked up in ``word_to_id`` exactly as given.

    Returns:
        The cosine similarity of the two dense PPMI vectors, or the fallback
        score 0.01 (after printing a notice) when either word is not in the
        vocabulary.
    """
    # Check if both words are in the vocabulary
    if word1 not in word_to_id or word2 not in word_to_id:
        print(f"({word1}, {word2}) not found.")
        return 0.01

    # Look the ids up once instead of on every dictionary entry
    id1 = word_to_id[word1]
    id2 = word_to_id[word2]

    # Dense vectors with one slot per vocabulary word; missing pairs stay 0
    vec1 = np.zeros(len(word_to_id))
    vec2 = np.zeros(len(word_to_id))

    # A single pass over ppmi_dict fills both vectors (previously the whole
    # dictionary was scanned once per word). Two independent `if`s, not
    # `elif`, so that word1 == word2 still fills both vectors.
    for (row_id, context_id), ppmi_value in ppmi_dict.items():
        if row_id == id1:
            vec1[context_id] = ppmi_value
        if row_id == id2:
            vec2[context_id] = ppmi_value

    # Calculate and return the cosine similarity
    return cosine_similarity([vec1], [vec2])[0][0]

In [71]:
def create_prediction_csv_ppmi(data, dataset):
    """Score every word pair in ``data`` with the PPMI model and write the result to CSV.

    The output has two columns and no header or index: the first column of
    ``data`` unchanged, followed by the similarity returned by
    ``calculate_similarity_ppmi`` for the words in columns 1 and 2.

    Args:
        data: DataFrame whose columns labelled 1 and 2 hold the two words of each pair.
        dataset: ``'validation'`` or ``'testing'``; selects the output file.

    Raises:
        ValueError: If ``dataset`` is neither ``'validation'`` nor ``'testing'``.
    """
    output_paths = {
        'validation': './data/10726993-Task1-method-a-validation.csv',
        'testing': './data/10726993-Task1-method-a.csv',
    }
    # Reject an unknown dataset before the slow scoring pass; previously the
    # predictions were computed and then silently thrown away.
    if dataset not in output_paths:
        raise ValueError(f"dataset must be 'validation' or 'testing', got {dataset!r}")
    prediction_path = output_paths[dataset]

    # Keep only the first column; all other columns are dropped
    prediction_data = data.iloc[:, :1].copy()

    # Calculate similarity and add as a new column
    prediction_data[1] = data.apply(lambda row: calculate_similarity_ppmi(row[1], row[2]), axis=1)

    # Save the new dataset to a CSV file
    prediction_data.to_csv(prediction_path, index=False, header=False)

### Creating CSVs

Validation

In [72]:
# Score the validation pairs with the PPMI model and write the method-a validation CSV
create_prediction_csv_ppmi(validation_data, "validation")

(cup, tableware) not found.


Testing

In [74]:
# Score the test pairs with the PPMI model and write the method-a CSV
create_prediction_csv_ppmi(testing_data, "testing")

(keep, possess) not found.
( war criminal, student) not found.
( war criminal, jet) not found.
(brutal murder, instructor) not found.
(brutal murder, terrible) not found.
(college graduate, teacher) not found.
(college graduate, job) not found.
(boy, teenage couple) not found.
(cat, teenage couple) not found.
(take, possess) not found.
(journey, long distance) not found.
(area, long distance) not found.


## Hyperparameter Tuning

Ignore this section: it was only used for testing (the code may not work if you run it), and it took about 2 hours to run, so there is no need to re-run it.

In [None]:
def calculate_accuracy():
  with capture_output() as c:
      %run task1_eval_script_student_version.py ./data/10726993-Task1-method-a-validation.csv ./data/Task-1-validation-dataset.csv
      accuracy = c.stdout.splitlines()[-1]
      accuracy = accuracy.split()[-1]
      return float(accuracy)

In [None]:
# Grid search over the context window size and the smoothing value of the PPMI model.
# NOTE: the loop rebinds the notebook-level co_occurrence_dict, word_to_id and ppmi_dict
# (calculate_similarity_ppmi reads them as globals), so once it finishes they hold
# the last configuration tried rather than the one built earlier in the notebook.
for window in [1,2,3,4,5]:
    for smooth_value in [0, 1, 2]:
        # keep track of time
        start = time.time()

        co_occurrence_dict, word_to_id = build_co_occurrence_dict(synopses, window_size=window, smoothness = smooth_value)

        total_occurrences = sum(co_occurrence_dict.values())
        vocab_size = len(word_to_id)
        word_freq = [0] * vocab_size
        context_freq = [0] * vocab_size

        # Accumulate word and context frequencies
        for (word_id, context_id), count in co_occurrence_dict.items():
            word_freq[word_id] += count
            context_freq[context_id] += count

        # Now calculate PPMI values
        ppmi_dict = calculate_ppmi(co_occurrence_dict, word_to_id, total_occurrences, word_freq, context_freq)

        # `dataset` is a required argument; "validation" writes the method-a validation CSV
        create_prediction_csv_ppmi(validation_data, "validation")
        # The timer stops here, so 'time(s)' excludes the evaluation script
        end = time.time()

        # Calculate accuracy score
        accuracy = calculate_accuracy()
        hyperparameter_entry = {
            'window': window,
            'smoothness': smooth_value,
            'accuracy': np.round(accuracy, 2),
            'time(s)': np.round(end-start, 2),
        }
        print(hyperparameter_entry)

Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:11<00:00, 29286.32it/s]
Calculating PPMI: 100%|██████████| 3987350/3987350 [00:07<00:00, 508858.48it/s]


(cup, tableware) not found.
{'window': 1, 'smoothness': 0, 'accuracy': 0.6, 'time(s)': 271.03}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:10<00:00, 31715.18it/s]
Calculating PPMI: 100%|██████████| 3987350/3987350 [00:06<00:00, 611375.62it/s]


(cup, tableware) not found.
{'window': 1, 'smoothness': 1, 'accuracy': 0.59, 'time(s)': 230.97}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:11<00:00, 28898.14it/s]
Calculating PPMI: 100%|██████████| 3987350/3987350 [00:07<00:00, 506586.41it/s]


(cup, tableware) not found.
{'window': 1, 'smoothness': 2, 'accuracy': 0.58, 'time(s)': 231.71}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:17<00:00, 19621.99it/s]
Calculating PPMI: 100%|██████████| 7240898/7240898 [00:14<00:00, 484943.41it/s]


(cup, tableware) not found.
{'window': 2, 'smoothness': 0, 'accuracy': 0.58, 'time(s)': 392.34}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:17<00:00, 19776.80it/s]
Calculating PPMI: 100%|██████████| 7240898/7240898 [00:13<00:00, 553618.60it/s]


(cup, tableware) not found.
{'window': 2, 'smoothness': 1, 'accuracy': 0.58, 'time(s)': 389.91}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:18<00:00, 18515.07it/s]
Calculating PPMI: 100%|██████████| 7240898/7240898 [00:13<00:00, 544398.08it/s]


(cup, tableware) not found.
{'window': 2, 'smoothness': 2, 'accuracy': 0.57, 'time(s)': 385.06}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:23<00:00, 14632.08it/s]
Calculating PPMI: 100%|██████████| 9864240/9864240 [00:16<00:00, 580271.12it/s]


(cup, tableware) not found.
{'window': 3, 'smoothness': 0, 'accuracy': 0.58, 'time(s)': 445.62}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:22<00:00, 14766.77it/s]
Calculating PPMI: 100%|██████████| 9864240/9864240 [00:17<00:00, 569398.96it/s]


(cup, tableware) not found.
{'window': 3, 'smoothness': 1, 'accuracy': 0.57, 'time(s)': 448.6}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:22<00:00, 14772.74it/s]
Calculating PPMI: 100%|██████████| 9864240/9864240 [00:17<00:00, 578789.97it/s]


(cup, tableware) not found.
{'window': 3, 'smoothness': 2, 'accuracy': 0.58, 'time(s)': 442.9}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:27<00:00, 12169.43it/s]
Calculating PPMI: 100%|██████████| 12004420/12004420 [00:28<00:00, 424896.54it/s]


(cup, tableware) not found.
{'window': 4, 'smoothness': 0, 'accuracy': 0.56, 'time(s)': 570.04}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:29<00:00, 11518.48it/s]
Calculating PPMI: 100%|██████████| 12004420/12004420 [00:23<00:00, 506542.67it/s]


(cup, tableware) not found.
{'window': 4, 'smoothness': 1, 'accuracy': 0.57, 'time(s)': 569.34}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:29<00:00, 11280.31it/s]
Calculating PPMI: 100%|██████████| 12004420/12004420 [00:23<00:00, 515418.01it/s]


(cup, tableware) not found.
{'window': 4, 'smoothness': 2, 'accuracy': 0.57, 'time(s)': 574.95}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:32<00:00, 10461.31it/s]
Calculating PPMI: 100%|██████████| 13760942/13760942 [00:27<00:00, 503135.14it/s]


(cup, tableware) not found.
{'window': 5, 'smoothness': 0, 'accuracy': 0.59, 'time(s)': 645.0}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:33<00:00, 10067.20it/s]
Calculating PPMI: 100%|██████████| 13760942/13760942 [00:31<00:00, 433338.30it/s]


(cup, tableware) not found.
{'window': 5, 'smoothness': 1, 'accuracy': 0.59, 'time(s)': 646.87}


Building Co-occurrence Dictionary: 100%|██████████| 337763/337763 [00:38<00:00, 8790.14it/s] 
Calculating PPMI: 100%|██████████| 13760942/13760942 [00:28<00:00, 480839.36it/s]


(cup, tableware) not found.
{'window': 5, 'smoothness': 2, 'accuracy': 0.59, 'time(s)': 666.75}


## Results

Validation

In [75]:
# Evaluate the method-a validation predictions against the validation dataset
%run task1_eval_script_student_version.py ./data/10726993-Task1-method-a-validation.csv ./data/Task-1-validation-dataset.csv


The following simalarity scores may need checking:
(absorb,learn) similarity score: 0.01178318673467594, gold ranking: 5.48
(absorb,withdraw) similarity score: 0.012244545804659047, gold ranking: 2.97
----------------------------
(acquire,get) similarity score: 0.030680234530073035, gold ranking: 8.82
(acquire,obtain) similarity score: 0.0742418186299178, gold ranking: 8.57
----------------------------
(arm,shoulder) similarity score: 0.07481171586764619, gold ranking: 4.85
(arm,body) similarity score: 0.0870836099598039, gold ranking: 4.05
----------------------------
(arm,shoulder) similarity score: 0.07481171586764619, gold ranking: 4.85
(arm,knee) similarity score: 0.08558231667302596, gold ranking: 2.75
----------------------------
(arm,shoulder) similarity score: 0.07481171586764619, gold ranking: 4.85
(arm,neck) similarity score: 0.11294280832778489, gold ranking: 1.58
----------------------------
(arm,body) similarity score: 0.0870836099598039, gold ranking: 4.05
(arm,neck) sim

# Method B : Word2Vec

In [None]:
# Train the Word2Vec model on the tokenized training sentences.
# sg=1 selects skip-gram (sg=0 would be CBOW); window=1 uses one word of context per side.
# NOTE(review): min_count=1 presumably keeps every word, even those seen once, and
# workers=1 trains on a single thread — confirm against the gensim documentation.
model = Word2Vec(synopses, vector_size=100, window=1, min_count=1, workers=1, sg=1)

In [95]:
# Word2Vec-based similarity between two words
def calculate_similarity(model, word1, word2):
    """Return the Word2Vec similarity of ``word1`` and ``word2``.

    Args:
        model: Trained model exposing ``wv.key_to_index`` and ``wv.similarity``.
        word1: First word.
        word2: Second word.

    Returns:
        ``model.wv.similarity(word1, word2)`` (cosine similarity) when both
        words are in the model's vocabulary; otherwise prints a notice and
        returns the fallback score 0.2.
    """
    vocabulary = model.wv.key_to_index
    if word1 not in vocabulary or word2 not in vocabulary:
        # At least one word is out of vocabulary: report it and use the fallback
        print(f"({word1}, {word2}) not found.")
        return 0.2

    return model.wv.similarity(word1, word2)

In [96]:
def create_prediction_csv_w2v(model, data, dataset):
    """Score every word pair in ``data`` with a Word2Vec model and write the result to CSV.

    The output has two columns and no header or index: the first column of
    ``data`` unchanged, followed by the similarity returned by
    ``calculate_similarity`` for the words in columns 1 and 2.

    Args:
        model: Model passed through to ``calculate_similarity``.
        data: DataFrame whose columns labelled 1 and 2 hold the two words of each pair.
        dataset: ``'validation'`` or ``'testing'``; selects the output file.

    Raises:
        ValueError: If ``dataset`` is neither ``'validation'`` nor ``'testing'``.
    """
    output_paths = {
        'validation': './data/10726993-Task1-method-b-validation.csv',
        'testing': './data/10726993-Task1-method-b.csv',
    }
    # Reject an unknown dataset before scoring; previously the predictions were
    # computed and then silently thrown away.
    if dataset not in output_paths:
        raise ValueError(f"dataset must be 'validation' or 'testing', got {dataset!r}")
    prediction_path = output_paths[dataset]

    # Keep only the first column; all other columns are dropped
    prediction_data = data.iloc[:, :1].copy()

    # Calculate similarity and add as a new column
    prediction_data[1] = data.apply(lambda row: calculate_similarity(model, row[1], row[2]), axis=1)

    # Save the new dataset to a CSV file
    prediction_data.to_csv(prediction_path, index=False, header=False)

In [100]:
# Score the validation pairs with the Word2Vec model and write the method-b validation CSV
create_prediction_csv_w2v(model, validation_data, "validation")

(cup, tableware) not found.


In [101]:
# Score the test pairs with the Word2Vec model and write the method-b CSV
create_prediction_csv_w2v(model, testing_data, "testing")

(keep, possess) not found.
( war criminal, student) not found.
( war criminal, jet) not found.
(brutal murder, instructor) not found.
(brutal murder, terrible) not found.
(college graduate, teacher) not found.
(college graduate, job) not found.
(boy, teenage couple) not found.
(cat, teenage couple) not found.
(take, possess) not found.
(journey, long distance) not found.
(area, long distance) not found.


## Word2Vec Hyperparameter Tuning

Ignore this section: it was only used for testing (the code may not work if you run it), and it took about 2 hours to run, so there is no need to re-run it.

In [None]:
def calculate_accuracy():
  with capture_output() as c:
      %run task1_eval_script_student_version.py ./data/10726993-Task1-method-b-validation.csv ./data/Task-1-validation-dataset.csv
      accuracy = c.stdout.splitlines()[-1]
      accuracy = accuracy.split()[-1]
      return float(accuracy)

In [None]:
# Grid search over epochs, vector size, window and architecture (CBOW vs skip-gram).
# The notebook imports `from tqdm import tqdm`, so the progress bar is called as
# `tqdm(...)`; `tqdm.tqdm(...)` raises AttributeError with that import.
for epochs in tqdm([5, 10, 30, 50]):
    for vs in [100, 150, 200, 300]:
        for window in [1,2,3,4,5]:
            for sg in [0, 1]:
                # keep track of time
                start = time.time()

                # Create word2vec model
                w2v_model = Word2Vec(synopses, vector_size=vs, window=window, min_count=1, workers=4, sg=sg, epochs=epochs)

                create_prediction_csv_w2v(w2v_model, validation_data, "validation")
                # The timer stops here, so 'time(s)' excludes the evaluation script
                end = time.time()

                # Calculate accuracy score
                accuracy = calculate_accuracy()
                hyperparameter_entry = {
                    'epochs': epochs,
                    'vector size': vs,
                    'window': window,
                    'word2vec': "cbow" if sg == 0 else "skip-gram",
                    'accuracy': accuracy,
                    'time(s)': end-start
                }
                print(hyperparameter_entry)

  0%|          | 0/4 [00:00<?, ?it/s]

(cup, tableware)
{'epochs': 5, 'vector_size': 100, 'window': 1, 'word2vec': 'cbow', 'accuracy': 0.64, 'time(s)': 41.94}
(cup, tableware)
{'epochs': 5, 'vector_size': 100, 'window': 1, 'word2vec': 'skip-gram', 'accuracy': 0.66, 'time(s)': 58.58}
(cup, tableware)
{'epochs': 5, 'vector_size': 100, 'window': 2, 'word2vec': 'cbow', 'accuracy': 0.61, 'time(s)': 42.22}
(cup, tableware)
{'epochs': 5, 'vector_size': 100, 'window': 2, 'word2vec': 'skip-gram', 'accuracy': 0.63, 'time(s)': 75.84}
(cup, tableware)
{'epochs': 5, 'vector_size': 100, 'window': 3, 'word2vec': 'cbow', 'accuracy': 0.6, 'time(s)': 43.19}
(cup, tableware)
{'epochs': 5, 'vector_size': 100, 'window': 3, 'word2vec': 'skip-gram', 'accuracy': 0.61, 'time(s)': 96.93}
(cup, tableware)
{'epochs': 5, 'vector_size': 100, 'window': 4, 'word2vec': 'cbow', 'accuracy': 0.6, 'time(s)': 44.54}
(cup, tableware)
{'epochs': 5, 'vector_size': 100, 'window': 4, 'word2vec': 'skip-gram', 'accuracy': 0.61, 'time(s)': 111.98}
(cup, tableware)
{'ep

  0%|          | 0/4 [12:21<?, ?it/s]


KeyboardInterrupt: ignored

The accuracy when using vector_size = 300 and window = 1 is 0.6407407407407407, which is the highest so far.

The default appears to be vector_size = 100 and window = 5,
but window = 1 seems to work better than any other value.
Hyperparameter tuning is needed for Word2Vec, including for other parameters that have not been explored yet.

Using skip-gram (sg = 1) with vector_size = 300 and window = 1 gives an accuracy of 0.6481481481481481.

## Results

In [98]:
# Evaluate the method-b validation predictions against the validation dataset
%run task1_eval_script_student_version.py ./data/10726993-Task1-method-b-validation.csv ./data/Task-1-validation-dataset.csv

The following simalarity scores may need checking:
(absorb,learn) similarity score: 0.5250598788261414, gold ranking: 5.48
(absorb,withdraw) similarity score: 0.7771555781364441, gold ranking: 2.97
----------------------------
(acquire,get) similarity score: 0.5366950035095215, gold ranking: 8.82
(acquire,obtain) similarity score: 0.8529593348503113, gold ranking: 8.57
----------------------------
(apple,sauce) similarity score: 0.7262406349182129, gold ranking: 1.43
(apple,lemon) similarity score: 0.716759204864502, gold ranking: 4.05
----------------------------
(arm,body) similarity score: 0.43532636761665344, gold ranking: 4.05
(arm,vein) similarity score: 0.5751644372940063, gold ranking: 3.65
----------------------------
(arm,body) similarity score: 0.43532636761665344, gold ranking: 4.05
(arm,knee) similarity score: 0.6686338186264038, gold ranking: 2.75
----------------------------
(arm,body) similarity score: 0.43532636761665344, gold ranking: 4.05
(arm,bone) similarity score: