In [1]:
import csv  # For handling CSV files
import nltk  # Natural Language Toolkit for text processing
from nltk.tokenize import word_tokenize  # Tokenizer
from nltk.corpus import stopwords  # For stopwords
from nltk.stem import PorterStemmer  # Stemmer

# Define the path to the CSV file containing the dataset
csv_file_path = '/content/Training-dataset.csv'

# Initialize lists to store data from the CSV file
ids = []
titles = []
plots = []

# Open and read the CSV file
with open(csv_file_path, 'r', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    # Loop through each row in the CSV file
    for row in csv_reader:
        # Append the relevant data to the respective lists
        ids.append(row['ID'])
        titles.append(row['title'])
        plots.append(row['plot_synopsis'])

# Import regular expression library
import re

def clean_plot(plot):
    # Remove non-alphabetic characters from the plot synopsis
    plot = re.sub(r'[^a-zA-Z ]', '', plot)
    return plot

# Download necessary NLTK packages
nltk.download('punkt')  # For tokenization
nltk.download('stopwords')  # For stopwords

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # Download WordNet database for lemmatization

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to lemmatize a list of tokens
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

# Load English stopwords
stop_words = stopwords.words('english')

def process_plot(plot):
    cleaned_plot = clean_plot(plot)  # Clean the plot
    tokenized_plot = word_tokenize(cleaned_plot.lower())  # Tokenize the cleaned plot
    lemmatized_plot = lemmatize_tokens(tokenized_plot)  # Lemmatize the tokens
    # Remove stopwords from the plot
    filtered_plot = [word for word in lemmatized_plot if word not in stop_words]
    return filtered_plot

# Process each plot synopsis
processed_plots_tokens = [process_plot(plot) for plot in plots]

# Join the list of words back into strings
training_plots = [' '.join(plot_tokens) for plot_tokens in processed_plots_tokens]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# Method a) sparse representation (BoW with PPMI)

# Import CountVectorizer for feature extraction
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Flatten the list of tokens
tokens = [token for plot_tokens in processed_plots_tokens for token in plot_tokens]

# Get unique tokens
unique_tokens = set(tokens)
unique_tokens = list(unique_tokens)
print(len(unique_tokens))

# Initialize CountVectorizer with the unique tokens as vocabulary
vectorizer = CountVectorizer(vocabulary=unique_tokens)

# Fit and transform the data
dtm = vectorizer.fit_transform(training_plots)

# Import necessary modules for PPMI calculation
from sklearn.metrics import pairwise_distances
from scipy.sparse import csr_matrix

# Convert the term-document matrix to a sparse CSR matrix
dtm = csr_matrix(dtm)

def calculate_ppmi(dtm):
    # Calculate marginal probabilities
    sum_words = np.array(dtm.sum(axis=1)).flatten()
    sum_contexts = np.array(dtm.sum(axis=0)).flatten()
    total_sum = sum_words.sum()

    # Calculate Pointwise Mutual Information (PMI)
    pmi_matrix = np.log(((dtm * total_sum) / np.outer(sum_words, sum_contexts)).toarray() + 1e-8)

    # Calculate Positive PMI (PPMI) by removing negative values
    ppmi_matrix = np.maximum(pmi_matrix, 0)

    return ppmi_matrix

# Calculate PPMI matrix
ppmi_matrix = calculate_ppmi(dtm)

# Create a mapping of words to their indices
word_to_index = {word: index for index, word in enumerate(unique_tokens)}

# Function to compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    similarity = 0

    # Check if both words are in the vocabulary
    if word1 in word_to_index and word2 in word_to_index:
      # Get indices of the words
      word1_index = word_to_index[word1]
      word2_index = word_to_index[word2]

      # Extract vectors
      vec1 = ppmi_matrix[:, word1_index].flatten()
      vec2 = ppmi_matrix[:, word2_index].flatten()

      # Compute cosine similarity
      dot_product = np.dot(vec1, vec2)
      norm_vec1 = np.linalg.norm(vec1)
      norm_vec2 = np.linalg.norm(vec2)
      similarity = dot_product / (norm_vec1 * norm_vec2)

    return similarity

150780


  recip = np.true_divide(1., other)


In [19]:
# Method b) dense static representation (word2vec)

# Import the Word2Vec model from gensim for word embeddings
from gensim.models import Word2Vec

# Initialize the Word2Vec model with specified parameters
model = Word2Vec(vector_size=150, window=5, min_count=1, workers=4, sg=0)

# Build the vocabulary from the processed plot tokens for the Word2Vec model
model.build_vocab(processed_plots_tokens)

# Train the Word2Vec model using the processed plot tokens
model.train(processed_plots_tokens, total_examples=model.corpus_count, epochs=10)

# Function to calculate similarity between two words using the Word2Vec model
def get_similarity_word2vec(word1, word2):
    similarity = 0
    # Check if both words are in the model's vocabulary
    if word1 in model.wv.key_to_index and word2 in model.wv.key_to_index:
        # Calculate and return the cosine similarity between the two words
        similarity = model.wv.similarity(word1, word2)
    return similarity


In [21]:
import csv

document_similarities = []

with open('/content/Task-1-validation-dataset.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        index, word1, word2, similarity = row
        document_similarities.append((index, word1, word2, float(similarity)))

similarities = []

for index, word1, word2, similarity in document_similarities:
    similarity_score = cosine_similarity(word1, word2)
    similarities.append((index, similarity_score))

filename = "10749545-Task1-method-a-validation.csv"
with open(filename, mode='w', newline='') as file:
    writer = csv.writer(file)

    for index, similarity in similarities:

        row = [index] + [similarity]
        writer.writerow(row)

In [10]:
%run '/content/task1_eval_script_student_version(1).py' /content/10749545-Task1-method-a-validation.csv /content/Task-1-validation-dataset.csv

The following simalarity scores may need checking:
(achieve,accomplish) similarity score: 0.013896852042201353, gold ranking: 8.57
(achieve,try) similarity score: 0.03784001613817241, gold ranking: 4.42
----------------------------
(acquire,get) similarity score: 0.01993848292200949, gold ranking: 8.82
(acquire,obtain) similarity score: 0.07760885941813246, gold ranking: 8.57
----------------------------
(acquire,get) similarity score: 0.01993848292200949, gold ranking: 8.82
(acquire,find) similarity score: 0.04968694998435809, gold ranking: 6.38
----------------------------
(apple,sauce) similarity score: 0.0, gold ranking: 1.43
(apple,sunshine) similarity score: 0.0, gold ranking: 0.58
----------------------------
(arm,shoulder) similarity score: 0.11749415179521437, gold ranking: 4.85
(arm,body) similarity score: 0.18248182200364838, gold ranking: 4.05
----------------------------
(arm,shoulder) similarity score: 0.11749415179521437, gold ranking: 4.85
(arm,neck) similarity score: 0

In [18]:
test_document_similarities = []

with open('/content/Task-1-test-dataset1.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        id, word1, word2 = row
        test_document_similarities.append((id, word1, word2))

test_similarities = []

for id, word1, word2 in test_document_similarities:
    similarity_score = cosine_similarity(word1, word2)
    test_similarities.append((id, similarity_score))

filename = "10749545-Task1-method-a.csv"
with open(filename, mode='w', newline='') as file:
    writer = csv.writer(file)

    for id, similarity in test_similarities:
        row = [id] + [similarity]
        writer.writerow(row)

In [24]:
similarities_word2vec = []

for id, word1, word2, similarity in document_similarities:
    similarity_score = get_similarity_word2vec(word1, word2)
    similarities_word2vec.append((id, similarity_score))

filename = "10749545-Task1-method-b-validation.csv"
with open(filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    for index, similarity in similarities_word2vec:
        writer.writerow([index, similarity])


In [26]:
%run '/content/task1_eval_script_student_version(1).py' /content/10749545-Task1-method-b-validation.csv /content/Task-1-validation-dataset.csv

The following simalarity scores may need checking:
(absorb,learn) similarity score: 0.20379506, gold ranking: 5.48
(absorb,withdraw) similarity score: 0.30235, gold ranking: 2.97
----------------------------
(acquire,get) similarity score: 0.15894678, gold ranking: 8.82
(acquire,obtain) similarity score: 0.7656169, gold ranking: 8.57
----------------------------
(apple,sauce) similarity score: 0.4492793, gold ranking: 1.43
(apple,lemon) similarity score: 0.33655682, gold ranking: 4.05
----------------------------
(arm,shoulder) similarity score: 0.62591, gold ranking: 4.85
(arm,neck) similarity score: 0.64391595, gold ranking: 1.58
----------------------------
(arm,body) similarity score: 0.25507197, gold ranking: 4.05
(arm,vein) similarity score: 0.37002456, gold ranking: 3.65
----------------------------
(arm,body) similarity score: 0.25507197, gold ranking: 4.05
(arm,knee) similarity score: 0.5776952, gold ranking: 2.75
----------------------------
(arm,body) similarity score: 0.255

In [27]:
word2vec_test_similarities = []

for id, word1, word2 in test_document_similarities:
    similarity_score = get_similarity_word2vec(word1, word2)
    word2vec_test_similarities.append((id, similarity_score))

filename = "10749545-Task1-method-b.csv"
with open(filename, mode='w', newline='') as file:
    writer = csv.writer(file)

    for id, similarity in word2vec_test_similarities:
        row = [id] + [similarity]
        writer.writerow(row)