# Task 1

## Imports

In [15]:
import nltk
import pandas as pd
import numpy as np
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec

# Fetch the NLTK resources used below: 'punkt' backs word_tokenize and
# 'wordnet' backs WordNetLemmatizer. Both calls are no-ops once the data is cached.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## a) Sparse Representation: BoW with tf*idf

### CosineSimilarityTFIDF Class

In [16]:
class CosineSimilarityTFIDF:
    """
    A class for calculating cosine similarity using a TF-IDF representation.

    The class reads a training and a validation dataset, preprocesses the training
    synopses and fits a TF-IDF vectorizer (uni-, bi- and tri-grams) on them. A term
    is represented by its column of the document-term matrix, i.e. by its TF-IDF
    weight in every training document, and two terms are compared with the cosine
    of those columns.
    """

    def __init__(self, training_data_path, validation_data_path):
        """
        Initializes the CosineSimilarityTFIDF class.

        The constructor reads the training and validation datasets from CSV files,
        preprocesses the training data and fits the TF-IDF model on it.

        input parameters:
        training_data_path - str
            The file path to the CSV file containing the training dataset.
            It must provide a 'plot_synopsis' column.
        validation_data_path - str
            The file path to the header-less CSV file containing the term pairs.
            Its columns are read as 'index', 'term1', 'term2', 'goldscore'.
        """
        # Must exist before the first call to _preprocessing below.
        self._lemmatizer = WordNetLemmatizer()

        self.training_data = pd.read_csv(training_data_path)
        self.validation_data = pd.read_csv(
            validation_data_path,
            header=None,
            names=['index', 'term1', 'term2', 'goldscore'],
        )

        self.processed_data = [
            self._preprocessing(synopsis)
            for synopsis in self.training_data['plot_synopsis']
        ]
        self.tfidf_matrix, self.feature_names, self.tfidf_vectorizer = self._calculate_tfidf()

    def _preprocessing(self, raw):
        """
        The function tokenises a string, keeps only the purely alphabetical tokens,
        lower-cases them (to prevent duplicates) and lemmatises them with
        WordNetLemmatizer. The resulting tokens are joined back with single spaces.

        Missing values (None / NaN, as pandas produces for empty CSV cells) yield an
        empty string; other non-string values are converted with str() first.

        input parameters:
        raw - string

        output:
        preprocessed_string - string
        """
        if not isinstance(raw, str):
            if pd.isna(raw):
                return ''
            raw = str(raw)

        tokens = [
            self._lemmatizer.lemmatize(token.lower())
            for token in word_tokenize(raw)
            if token.isalpha()
        ]
        return ' '.join(tokens)

    def _calculate_tfidf(self):
        """
        The function initializes a TF-IDF vectorizer from the scikit-learn library
        covering uni-grams up to tri-grams, then fits it on the preprocessed data and
        transforms that data into a TF-IDF matrix.

        output:
        tfidf_matrix - scipy sparse matrix
            The TF-IDF document-term matrix (one row per document, one column per feature).
        feature_names - numpy.ndarray
            The feature names (n-grams) corresponding to the columns of the TF-IDF matrix.
        tfidf_vectorizer - sklearn.feature_extraction.text.TfidfVectorizer
            The fitted vectorizer; its vocabulary_ maps each feature to its column index.
        """
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
        tfidf_matrix = tfidf_vectorizer.fit_transform(self.processed_data)
        feature_names = tfidf_vectorizer.get_feature_names_out()

        return tfidf_matrix, feature_names, tfidf_vectorizer

    def _vocabulary_key(self, term):
        """
        Returns the form under which a term would be stored in the fitted vocabulary.

        The vocabulary was built from preprocessed text, so the term goes through the
        same preprocessing. It is then re-tokenised with the vectorizer's own tokenizer,
        because the vectorizer drops tokens its token pattern does not match (by default
        single-character ones) and stores n-grams as the remaining tokens joined by
        single spaces.

        input parameters:
        term - str

        output:
        key - str
            The normalised term; an empty string if nothing is left of it.
        """
        processed = self._preprocessing(term)
        if not processed:
            return ''
        tokenize = self.tfidf_vectorizer.build_tokenizer()
        return ' '.join(tokenize(processed))

    def tfidf_sim(self, term1, term2):
        """
        This method first normalises both terms the same way the training data was
        normalised and looks them up in the TF-IDF vectorizer's vocabulary.
        Next, it extracts the columns of the TF-IDF matrix belonging to the two terms.
        After that, it calculates the cosine similarity between those vectors.

        If either term is not in the vocabulary, the similarity is set to 0.

        input parameters:
        term1 - str
            The first term for cosine similarity calculation.
        term2 - str
            The second term for cosine similarity calculation.

        output:
        similarity - float
            The cosine similarity between the TF-IDF representations of the two terms.
        """
        vocabulary = self.tfidf_vectorizer.vocabulary_
        idx_term1 = vocabulary.get(self._vocabulary_key(term1))
        idx_term2 = vocabulary.get(self._vocabulary_key(term2))

        # If either term is not in the vocabulary there is nothing to compare.
        if idx_term1 is None or idx_term2 is None:
            return 0.0

        # Each column holds the term's weight in every training document.
        vector1 = self.tfidf_matrix.getcol(idx_term1)
        vector2 = self.tfidf_matrix.getcol(idx_term2)

        # cosine_similarity compares rows, hence the transposition of the columns.
        return float(cosine_similarity(vector1.T, vector2.T)[0][0])

    def calculate_cosine_similarity_validation_tfidf(self, row):
        """
        The function takes a DataFrame row containing 'term1' and 'term2', and computes
        the cosine similarity between the two terms using the tf*idf method.

        input parameters:
        row - pandas.Series
            A row from the validation dataset DataFrame containing 'term1' and 'term2' columns.

        output:
        similarity_result - float
            The cosine similarity between the TF-IDF representations of 'term1' and 'term2'.
        """
        return self.tfidf_sim(row['term1'], row['term2'])

    def run_validation_tfidf(self, output_path):
        """
        The function applies the 'calculate_cosine_similarity_validation_tfidf' method to
        each row in the validation dataset and stores the result in a new
        'cosine_result_tfidf' column. It then saves the 'index' and 'cosine_result_tfidf'
        columns to a csv file (no header, no row index) at the output path.

        input parameters:
        output_path - str
            The file path where the TF-IDF cosine similarity results will be saved in CSV format.
        """
        self.validation_data['cosine_result_tfidf'] = self.validation_data.apply(
            self.calculate_cosine_similarity_validation_tfidf, axis=1
        )
        output_data_tfidf = self.validation_data[['index', 'cosine_result_tfidf']]
        output_data_tfidf.to_csv(output_path, index=False, header=False)

### Create The Class
This runs the preprocessing and sets everything up for the validation run.

In [17]:
# Validation set-up: reads both CSV files and fits TF-IDF on the training synopses.
calculator_tfidf_validation = CosineSimilarityTFIDF(
    training_data_path='/content/drive/MyDrive/data/Training-dataset.csv',
    validation_data_path='/content/drive/MyDrive/data/Task-1-validation-dataset.csv',
)

This runs the preprocessing and sets everything up for the test run.

In [18]:
# Test set-up: same training file as the validation calculator, so TF-IDF is
# fitted a second time here; only the file with the term pairs differs.
calculator_tfidf_test = CosineSimilarityTFIDF(
    training_data_path='/content/drive/MyDrive/data/Training-dataset.csv',
    validation_data_path='/content/drive/MyDrive/data/Task-1-test-dataset1.csv',
)

In [19]:
# Size of the fitted vocabulary, i.e. the number of distinct uni-, bi- and
# tri-gram features (columns of the TF-IDF matrix).
print(len(calculator_tfidf_test.feature_names))

6175856


### Calculate Cosine Similarity
validation:

In [20]:
# Score every validation pair and write header-less "index,similarity" rows.
calculator_tfidf_validation.run_validation_tfidf(
    output_path='/content/drive/MyDrive/data/10867903-Task1-method-a-validation.csv',
)

test:

In [23]:
# Score every test pair and write header-less "index,similarity" rows.
calculator_tfidf_test.run_validation_tfidf(
    output_path='/content/drive/MyDrive/data/10867903-Task1-method-a.csv',
)

### Run Validation
Check the accuracy of the model

## b) Dense Static Representation: Word2Vec

### Word2VecSimilarityCalculator Class

In [26]:
class Word2VecSimilarityCalculator:
    """
    A class for calculating cosine similarity using Word2Vec embeddings.

    The class reads a training and a validation dataset, preprocesses the training
    synopses and trains a Word2Vec model on them. A term is represented by the
    average of the vectors of its known words, and two terms are compared with the
    cosine of those averages.
    """

    def __init__(self, training_data_path, validation_data_path):
        """
        Initializes the Word2VecSimilarityCalculator class.

        The constructor reads training and validation datasets from CSV files,
        preprocesses the training data, and trains the word2vec model.

        input parameters:
        training_data_path - str
            The file path to the CSV file containing the training dataset.
            It must provide a 'plot_synopsis' column.
        validation_data_path - str
            The file path to the header-less CSV file containing the term pairs.
            Its columns are read as 'index', 'term1', 'term2', 'goldscore'.
        """
        # Must exist before the first call to _tokenize below.
        self._lemmatizer = WordNetLemmatizer()

        self.training_data = pd.read_csv(training_data_path)
        self.validation_data = pd.read_csv(
            validation_data_path,
            header=None,
            names=['index', 'term1', 'term2', 'goldscore'],
        )

        # Tokenise once and derive the joined strings from the tokens; the tokens are
        # already purely alphabetical, so re-tokenising the joined text was redundant work.
        self.tokenized_data = [
            self._tokenize(synopsis)
            for synopsis in self.training_data['plot_synopsis']
        ]
        self.processed_data = [' '.join(tokens) for tokens in self.tokenized_data]
        self.model = self._train_word2vec_model(self.tokenized_data)

    def _tokenize(self, raw):
        """
        The function tokenises a string, keeps only the purely alphabetical tokens,
        lower-cases them (to prevent duplicates) and lemmatises them with
        WordNetLemmatizer.

        Missing values (None / NaN, as pandas produces for empty CSV cells) yield an
        empty list; other non-string values are converted with str() first.

        input parameters:
        raw - string

        output:
        tokens - list of str
        """
        if not isinstance(raw, str):
            if pd.isna(raw):
                return []
            raw = str(raw)

        return [
            self._lemmatizer.lemmatize(token.lower())
            for token in word_tokenize(raw)
            if token.isalpha()
        ]

    def _preprocessing(self, raw):
        """
        The function preprocesses a string with `_tokenize` and joins the resulting
        tokens with single spaces.

        input parameters:
        raw - string

        output:
        preprocessed_string - string
        """
        return ' '.join(self._tokenize(raw))

    def _train_word2vec_model(self, tokenized_data):
        """
        The function takes in the tokenized data and returns the trained word2vec model.
        The hyperparameters used are based on previously experimented values.

        input parameters:
        tokenized_data - list of lists
            tokenized data from preprocessing

        output:
        word2vec_model - gensim.models.Word2Vec
            The trained Word2Vec model.
        """
        return gensim.models.Word2Vec(
            sentences=tokenized_data,
            vector_size=100,  # dimensionality of the word vectors
            window=7,         # context size on each side of the target word
            min_count=1,      # keep every word, however rare
            sg=1,             # skip-gram rather than CBOW
            epochs=6,
        )

    def get_word2vec_representation(self, term):
        """
        The function collects the word2vec vector of every word of the term that is
        present in the model and returns the average of those vectors.
        A list is taken as the words of the term; a string is tokenised first.

        input parameters:
        term - str or list
            If it's a string, it represents a single-word term or a preprocessed term.
            If it's a list, it represents the words of a (multi-word) term.

        output:
        term_vector - numpy.ndarray or None
            The averaged Word2Vec representation of the term. Returns None if no word
            of the term is present in the Word2Vec model vocabulary, or if the term is
            neither a string nor a list.
        """
        if isinstance(term, list):
            words = term
        elif isinstance(term, str):
            words = word_tokenize(term)
        else:
            return None

        known_vectors = [self.model.wv[word] for word in words if word in self.model.wv]
        if not known_vectors:
            return None
        return sum(known_vectors) / len(known_vectors)

    def calculate_cosine_similarity(self, pair):
        """
        This function takes a pair of terms, preprocesses and tokenizes them, and obtains
        their word2vec representations using the `get_word2vec_representation` method.
        Then it calculates the cosine similarity between the two term vectors using the
        `cosine_similarity` function.

        input parameters:
        pair - tuple
            A pair of terms for which the cosine similarity is to be calculated.

        output:
        cosine_similarity - float
            The cosine similarity between the two terms, or 0.0 if either term has no
            word known to the model.
        """
        term1, term2 = pair
        term1_vector = self.get_word2vec_representation(self._tokenize(term1))
        term2_vector = self.get_word2vec_representation(self._tokenize(term2))

        if term1_vector is None or term2_vector is None:
            return 0.0
        return float(cosine_similarity([term1_vector], [term2_vector])[0][0])

    def calculate_cosine_similarity_validation_word2vec(self, row):
        """
        The function takes a DataFrame row containing 'term1' and 'term2', and computes
        the cosine similarity between the two terms using the word2vec method.

        input parameters:
        row - pandas.Series
            A row from the validation dataset DataFrame containing 'term1' and 'term2' columns.

        output:
        similarity_result - float
            The cosine similarity between the word2vec representations of 'term1' and 'term2'.
        """
        return self.calculate_cosine_similarity((row['term1'], row['term2']))

    def run_validation(self, output_path):
        """
        The function applies the 'calculate_cosine_similarity_validation_word2vec' method
        to each row in the validation dataset and stores the result in a new
        'cosine_result' column. It then saves the 'index' and 'cosine_result' columns to
        a csv file (no header, no row index) at the output path.

        input parameters:
        output_path - str
            The file path where the word2vec cosine similarity results will be saved in CSV format.
        """
        self.validation_data['cosine_result'] = self.validation_data.apply(
            self.calculate_cosine_similarity_validation_word2vec, axis=1
        )
        output_data = self.validation_data[['index', 'cosine_result']]
        output_data.to_csv(output_path, index=False, header=False)



### Create The Class
This runs the preprocessing and sets everything up for the validation run.

In [27]:
# Validation set-up: reads both CSV files and trains Word2Vec on the training synopses.
calculator_w2v_validation = Word2VecSimilarityCalculator(
    training_data_path='/content/drive/MyDrive/data/Training-dataset.csv',
    validation_data_path='/content/drive/MyDrive/data/Task-1-validation-dataset.csv',
)

This runs the preprocessing and sets everything up for the test run.

In [32]:
# Test set-up: same training file as the validation calculator, so a second
# Word2Vec model is trained here; only the file with the term pairs differs.
calculator_w2v_test = Word2VecSimilarityCalculator(
    training_data_path='/content/drive/MyDrive/data/Training-dataset.csv',
    validation_data_path='/content/drive/MyDrive/data/Task-1-test-dataset1.csv',
)

### Calculate Cosine Similarity
validation:

In [29]:
# Score every validation pair and write header-less "index,similarity" rows.
calculator_w2v_validation.run_validation(
    output_path='/content/drive/MyDrive/data/10867903-Task1-method-b-validation.csv',
)

test:

In [30]:
# Score every test pair and write header-less "index,similarity" rows.
calculator_w2v_test.run_validation(
    output_path='/content/drive/MyDrive/data/10867903-Task1-method-b.csv',
)