10699403 - Task 1 - Methods a and b

In [None]:
# Import necessary libraries
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import pandas as pd
import csv
from gensim.models import Word2Vec

# Download the NLTK resources used below: tokenizer models, the stopword list and WordNet.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on recent NLTK releases
# (it replaced the pickled 'punkt' models); 'punkt' is still fetched for older releases.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Function to preprocess data by tokenizing, lemmatizing, and removing stopwords and punctuation
def preprocess_data(data, training_data=True):
    """Lowercase, tokenize, filter and lemmatize every document in *data*.

    Args:
        data: Iterable of document strings.
        training_data: When True, return one list of tokens per document.
            When False, return the surviving tokens of all documents joined
            into a single space-separated string.

    Returns:
        A list of token lists (``training_data=True``) or one string
        (``training_data=False``).
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    final_tokens = []

    for document in data:
        # Lowercase first so the stopword comparison below is case-insensitive
        tokens = word_tokenize(document.lower())

        # Drop stopwords and tokens made up solely of punctuation, then lemmatize.
        # Every character is checked because `token in string.punctuation` is only a
        # substring test: it lets multi-character punctuation tokens such as "``",
        # "''", "..." and "--" through.
        cleaned_tokens = [
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words
            and not all(char in punctuation_chars for char in token)
        ]
        final_tokens.append(cleaned_tokens)

    if training_data:
        return final_tokens
    # Combine the tokens into a string for non-training data
    return ' '.join(' '.join(inner_list) for inner_list in final_tokens)

In [None]:
# Read training data from a CSV file
df = pd.read_csv('./data/Training-dataset.csv')
# A missing title or synopsis is read as NaN, and concatenating a string with NaN yields
# NaN (a float), which preprocess_data cannot lowercase. Treat missing fields as empty text.
training_documents = df['title'].fillna('') + ' ' + df['plot_synopsis'].fillna('')
training_data = preprocess_data(training_documents.tolist())

In [None]:
# Read in validation data from a CSV file and split into word pairs for TF-IDF and Word2Vec similarity scoring
# Column 0 of each row is carried through unchanged; columns 1 and 2 hold the two terms,
# which are replaced by their preprocessed (space-joined) form.
word_pairs = []
# newline='' leaves newline handling to the csv module, as its documentation requires
with open('./data/Task-1-validation-dataset.csv', 'r', newline='') as csvfile:
  for row in csv.reader(csvfile):
    row[1] = preprocess_data([row[1]], training_data=False)
    row[2] = preprocess_data([row[2]], training_data=False)
    word_pairs.append(row)

In [None]:
# Fit a TF-IDF vectorizer on the preprocessed training documents
tfidf_vec = TfidfVectorizer()
# The vectorizer is fed strings, so each document's lemmatized tokens are re-joined with spaces
tfidf_data = [' '.join(doc_tokens) for doc_tokens in training_data]
tfidf_mat = tfidf_vec.fit_transform(tfidf_data)
# Look-up table from each vocabulary term to its position in the feature list
feature_names = tfidf_vec.get_feature_names_out()
term_index_dict = dict(zip(feature_names, range(len(feature_names))))

In [None]:
# Fit a Word2Vec model on the tokenized training documents (one token list per document)
word2vec_model = Word2Vec(
    sentences=training_data,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=5,
)

In [None]:
# Function to find the TF-IDF vector for a given term
def find_word_vec(term):
  """Return the TF-IDF vector of a (possibly multi-word) term.

  Each word is represented by its column of ``tfidf_mat``, i.e. one weight per
  row of the matrix. The term vector is the mean of the columns of those words
  that appear in ``term_index_dict``; unknown words are ignored.

  Args:
    term: Whitespace-separated words.

  Returns:
    A 1-D array of length ``tfidf_mat.shape[0]``. It is all zeros when none of
    the words is in ``term_index_dict`` (including an empty term).
  """
  column_indices = [term_index_dict[word] for word in term.split() if word in term_index_dict]
  if not column_indices:
    # The fallback must have the same length as a real term vector, which is a
    # matrix column (shape[0] entries), not a matrix row (shape[1] entries).
    return np.zeros((tfidf_mat.shape[0],))
  # Flatten each (rows, 1) column to 1-D before averaging across the words
  word_vectors = [tfidf_mat[:, index].toarray().ravel() for index in column_indices]
  return np.mean(word_vectors, axis=0)

In [None]:
# Function to calculate TF-IDF cosine similarity score between two vectors
def tfidf_cosine_similarity_score(word1_vec, word2_vec):
  """Return the cosine similarity of two vectors, or 0.0 if they cannot be compared.

  Both inputs are reshaped to a single row before being handed to
  ``cosine_similarity``. A ``ValueError`` raised along the way (for example
  because the two rows differ in length) is treated as "no similarity".
  """
  try:
    first_row = word1_vec.reshape(1, -1)
    second_row = word2_vec.reshape(1, -1)
    similarity_matrix = cosine_similarity(first_row, second_row)
  except ValueError:
    # Incompatible shapes: report no similarity instead of failing
    return 0.0
  # The result is a 1x1 matrix; unwrap its only entry
  return similarity_matrix[0][0]

In [None]:
# Helper: mean Word2Vec embedding of a list of words
def _mean_word2vec_vector(words):
  """Return the mean embedding of *words* as a (1, vector_size) array.

  Words missing from the Word2Vec vocabulary count as zero vectors, so the
  known vectors are averaged together with one zero vector per missing word.
  Returns None when no word is in the vocabulary (including an empty list).
  """
  keyed_vectors = word2vec_model.wv
  known_vectors = [keyed_vectors[word].reshape(1, -1) for word in words if word in keyed_vectors]
  if not known_vectors:
    return None
  # One zero vector per out-of-vocabulary word so that every word contributes a vector
  padding = [np.zeros_like(known_vectors[0])] * (len(words) - len(known_vectors))
  return np.mean(known_vectors + padding, axis=0)


# Function to calculate Word2Vec cosine similarity score between two words
def word2vec_cosine_similarity_score(word1, word2):
  """Return the cosine similarity between the mean Word2Vec vectors of two terms.

  Args:
    word1: First term; whitespace-separated words.
    word2: Second term; whitespace-separated words.

  Returns:
    The cosine similarity of the two mean vectors, or 0.0 when either term
    has no word in the Word2Vec vocabulary.
  """
  word1_mean = _mean_word2vec_vector(word1.split())
  word2_mean = _mean_word2vec_vector(word2.split())

  if word1_mean is None or word2_mean is None:
    # A term without any known word would be represented by an all-zero vector,
    # whose cosine similarity with anything is reported as 0.
    return 0.0

  # cosine_similarity returns a 1x1 matrix for two single-row inputs
  return cosine_similarity(word1_mean, word2_mean)[0][0]


In [None]:
# Score every validation pair with both methods; column 0 of the row is kept as the key
tfidf_results, word2vec_results = [], []
for row in word_pairs:
  pair_key, first_term, second_term = row[0], row[1], row[2]

  # Method a: cosine similarity of the two TF-IDF term vectors
  first_vec = find_word_vec(first_term)
  second_vec = find_word_vec(second_term)
  tfidf_similarity = tfidf_cosine_similarity_score(first_vec, second_vec)
  tfidf_results.append([pair_key, tfidf_similarity])

  # Method b: cosine similarity of the two mean Word2Vec vectors
  word2vec_similarity = word2vec_cosine_similarity_score(first_term, second_term)
  word2vec_results.append([pair_key, word2vec_similarity])

In [None]:
# Write the validation results to CSV: TF-IDF scores (method a) first, then Word2Vec scores (method b)
for output_path, result_rows in (
  ('./data/10699403-Task1-method-a-validation.csv', tfidf_results),
  ('./data/10699403-Task1-method-b-validation.csv', word2vec_results),
):
  with open(output_path, 'w', newline='') as csvfile:
    csv.writer(csvfile).writerows(result_rows)

In [None]:
# Read in test data from a CSV file and split into word pairs for TF-IDF and Word2Vec similarity scoring
# Column 0 of each row is carried through unchanged; columns 1 and 2 hold the two terms,
# which are replaced by their preprocessed (space-joined) form.
test_word_pairs = []
# newline='' leaves newline handling to the csv module, as its documentation requires
with open('./data/Task-1-test-dataset1.csv', 'r', newline='') as csvfile:
  for row in csv.reader(csvfile):
    row[1] = preprocess_data([row[1]], training_data=False)
    row[2] = preprocess_data([row[2]], training_data=False)
    test_word_pairs.append(row)

In [None]:
# Score every test pair with both methods; column 0 of the row is kept as the key
tfidf_test_results, word2vec_test_results = [], []
for row in test_word_pairs:
  pair_key, first_term, second_term = row[0], row[1], row[2]

  # Method a: TF-IDF cosine similarity, collected in tfidf_test_results
  first_vec = find_word_vec(first_term)
  second_vec = find_word_vec(second_term)
  tfidf_similarity = tfidf_cosine_similarity_score(first_vec, second_vec)
  tfidf_test_results.append([pair_key, tfidf_similarity])

  # Method b: Word2Vec cosine similarity, collected in word2vec_test_results
  word2vec_similarity = word2vec_cosine_similarity_score(first_term, second_term)
  word2vec_test_results.append([pair_key, word2vec_similarity])

In [None]:
# Write the test results to CSV: TF-IDF scores (method a) first, then Word2Vec scores (method b)
for output_path, result_rows in (
  ('./data/10699403-Task1-method-a.csv', tfidf_test_results),
  ('./data/10699403-Task1-method-b.csv', word2vec_test_results),
):
  with open(output_path, 'w', newline='') as csvfile:
    csv.writer(csvfile).writerows(result_rows)