# Imports

### Sources
https://scikit-learn.org/ <br>
https://www.nltk.org/ <br>
https://docs.python.org/3/library/re.html <br>
https://pandas.pydata.org/ <br>

In [None]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
# NLTK resources used by this notebook:
#   punkt     - tokenizer model used to divide text into a list of sentences or words
#   wordnet   - large lexical database of English used by the WordNetLemmatizer
#   stopwords - lists of stopwords for various languages
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the training corpus; the CSV must provide 'title' and 'plot_synopsis'
# columns, which are merged into a single text field per row.
file_path = './data/Training-dataset.csv'
data = pd.read_csv(file_path)
# Combining title and plot synopsis.
# Missing values are replaced with empty strings first: concatenating a string
# with NaN would make the whole combined value NaN, and the regex-based
# preprocessing applied to this column only accepts strings.
data['combined_text'] = (
    data['title'].fillna('') + " " + data['plot_synopsis'].fillna('')
)

# Data Cleaning and Pre-processing

### Stemming (not used in this code)

In [None]:
# stemmer = PorterStemmer()
# english_stopwords = set(stopwords.words('english'))
# for i in range(len(data)):
#   document_words = nltk.word_tokenize(data.loc[i, 'combined_text'])
#   document_words = [stemmer.stem(word) for word in document_words if word.lower() not in english_stopwords]
#   data.loc[i, 'combined_text'] = ' '.join(document_words)

In [None]:
from nltk.stem import WordNetLemmatizer
# English stopwords held in a set; preprocess_and_lemmatize tests every token against it.
english_stopwords = set(stopwords.words('english'))
# Single lemmatizer instance shared by every preprocess_and_lemmatize call.
lemmatizer = WordNetLemmatizer()

def preprocess_and_lemmatize(word):
    """Normalise raw text into lemmatized, stopword-free tokens.

    Args:
        word: The text to normalise. Despite the name it may be a single term
            or a whole document.

    Returns:
        A ``(tokens, text)`` tuple: the list of normalised tokens and the same
        tokens joined with single spaces.
    """
    # Every non-letter becomes a space, then the text is case-folded.
    letters_only = re.sub('[^a-zA-Z]', ' ', word).lower()

    tokens = []
    for token in nltk.word_tokenize(letters_only):
        # Stopwords are filtered on the surface form, before lemmatization.
        if token in english_stopwords:
            continue
        tokens.append(lemmatizer.lemmatize(token))

    return tokens, ' '.join(tokens)


# Normalise every document once and keep both representations: the token list
# (used later to train Word2Vec) and the re-joined string (fed to TF-IDF).
# Mapping over the column replaces the per-cell `.at` writes, removes the
# placeholder assignment that was needed to force an object column, and no
# longer assumes the index is exactly 0..len(data)-1.
processed = data['combined_text'].map(preprocess_and_lemmatize)
# Each mapped value is a (tokens, text) pair; split it into the two columns.
data['normalized_list_of_words'] = processed.map(lambda pair: pair[0])
data['normalized_combined_text'] = processed.map(lambda pair: pair[1])

# **Task 1**

### Sources
https://youtu.be/fM4qTMfCoak?si=WdDeyiYYDcpmQZ8j <br>
https://youtu.be/6ZVf1jnEKGI?si=i8MXzWCCzat6Jkpn <br>
https://youtu.be/JpxCt3kvbLk?si=-__OTDBXHf8PSSdl <br>
https://youtu.be/1OMmbtVmmbg?si=c5NG8VMk54xEw4TU <br>
https://youtu.be/cqcUk6hC5hk?si=6LbjPE0nyRsJf62s <br>
https://youtu.be/IKgBLTeQQL8?si=auBb3TbPjh8o1npr <br>
https://youtu.be/iu2-G_5YkEo?si=qdGAoshr00dXgmI6 <br>
https://youtu.be/D2V1okCEsiE?si=1ifhbqkMQvnL6noQ <br>
https://youtu.be/z9myrLOF_1M?si=l8kTT6l9uoBLtsGn <br>


# Approach 1: TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer (default settings) on the normalized training text
# and, in the same step, encode every document as one row of the matrix.
tfidf_vectorizer_normalized = TfidfVectorizer()
tfidf_vectorizer_normalized_matrix = tfidf_vectorizer_normalized.fit_transform(
    data['normalized_combined_text']
)


In [None]:
# Vocabulary terms learned by the vectorizer; average_embedding matches tokens against it.
features_names = tfidf_vectorizer_normalized.get_feature_names_out()
# Dense array version of the TF-IDF matrix so that term columns can be sliced with NumPy indexing.
# NOTE(review): densifying may need a lot of memory for a large corpus - confirm it fits.
tfidf_vectorizer_normalized_matrix_array = tfidf_vectorizer_normalized_matrix.toarray()

# Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def average_embedding(words):
    """Average the TF-IDF vectors of the given tokens.

    A token's vector is its column in the module-level dense TF-IDF matrix
    ``tfidf_vectorizer_normalized_matrix_array``, located by matching the
    token against the module-level vocabulary ``features_names``.

    No exception is swallowed here: if the TF-IDF globals are missing or
    malformed the error propagates instead of silently producing ``None``
    (which callers would turn into a similarity of 0).

    Args:
        words: Iterable of already-normalised tokens.

    Returns:
        The element-wise mean of the vectors of all tokens found in the
        vocabulary, or ``None`` when no token is found.
    """
    embeddings = []
    for word in words:
        # A single scan of the vocabulary serves as both the membership test
        # and the column lookup.
        matches = np.where(features_names == word)
        if matches[0].size == 0:
            continue  # out-of-vocabulary token
        # Indexing with the raw np.where result keeps the array shape that
        # callers of this function already receive.
        embeddings.append(tfidf_vectorizer_normalized_matrix_array[:, matches])
    if not embeddings:
        return None
    return np.mean(embeddings, axis=0)



def calculate_similarity(word1, word2, verbose=False):
    """Cosine similarity between two terms using averaged TF-IDF vectors.

    Args:
        word1: First term (one or more words).
        word2: Second term (one or more words).
        verbose: When true, also print the computed similarity.

    Returns:
        The cosine similarity of the two averaged vectors, or ``0`` when
        either term has no token in the TF-IDF vocabulary.
    """
    # Only the token list (first element of the returned pair) is needed.
    tokens_a, _ = preprocess_and_lemmatize(word1)
    tokens_b, _ = preprocess_and_lemmatize(word2)

    vec_a = average_embedding(tokens_a)
    vec_b = average_embedding(tokens_b)

    # Without a vector for both terms there is nothing to compare.
    if vec_a is None or vec_b is None:
        return 0

    # cosine_similarity expects 2-D inputs: one row per sample.
    row_a = vec_a.reshape(1, -1)
    row_b = vec_b.reshape(1, -1)
    similarity = cosine_similarity(row_a, row_b)[0][0]

    if verbose:
        print("Cosine similarity with normalized data vectorizer: " + str(similarity))
    return similarity





In [None]:
# Example usage: compare two single-word terms; verbose=True also prints the score
calculate_similarity("area", "region", verbose=True)

Cosine similarity with normalized data vectorizer: 0.12112313741514513


0.12112313741514513

### Development Dataset

In [None]:
# Score every term pair of the development (validation) set with the TF-IDF method.
# The CSV has no header row, so columns are addressed by position:
# 0 = term pair id, 1 = first term, 2 = second term.
validation_set = pd.read_csv("./data/Task-1-validation-dataset.csv", header=None)

# One {'term_pair_id', 'similarity'} record per row of the validation set
results = [
    {'term_pair_id': row[0], 'similarity': calculate_similarity(row[1], row[2])}
    for _, row in validation_set.iterrows()
]

# Write the records to CSV without index or header row
results_df = pd.DataFrame(results)
results_df.to_csv('10556516-Task1-method-a.csv', index=False, header=False)


### Test Dataset

In [None]:
# Score every term pair of the TEST set with the TF-IDF method.
# The file is read from ./data/, the same directory every other dataset in this
# notebook is loaded from (the bare file name only resolved when the CSV
# happened to sit in the current working directory).
# The CSV has no header row, so columns are addressed by position:
# 0 = term pair id, 1 = first term, 2 = second term.
# NOTE: the frame keeps the name `validation_set` for compatibility with the
# rest of the notebook even though it holds the test data here.
validation_set = pd.read_csv("./data/Task-1-test-dataset1.csv", header=None)

# One {'term_pair_id', 'similarity'} record per row of the test set
results = [
    {'term_pair_id': row[0], 'similarity': calculate_similarity(row[1], row[2])}
    for _, row in validation_set.iterrows()
]

# Write the records to CSV without index or header row
results_df = pd.DataFrame(results)
results_df.to_csv('10556516-Task1-method-a-test.csv', index=False, header=False)


# Approach 2: Word2Vec

### Sources
https://github.com/krishnaik06/Stock-Sentiment-Analysis <br>
https://youtu.be/Otde6VGvhWM?si=mxnIu4mtGM4BZx79 <br>
https://youtu.be/h-LGjJ_oANs?si=GARBUzl_P6N2GOYV <br>


In [None]:
from gensim.models import Word2Vec

In [None]:
# Train Word2Vec on the normalized token lists (one list of tokens per document).
# min_count=1 keeps every token in the vocabulary, however rare.
word2vecmodel = Word2Vec(
    sentences=data['normalized_list_of_words'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Cosine Similarity

In [None]:
def get_vector(word):
    """Return the Word2Vec vector for ``word``, or ``None`` if it is out of vocabulary."""
    keyed_vectors = word2vecmodel.wv
    return keyed_vectors[word] if word in keyed_vectors else None

def calculate_average_vector(words):
    """Return the mean Word2Vec vector of the tokens in a term.

    The term is normalised with ``preprocess_and_lemmatize`` - the same
    pipeline that produced the token lists the model is trained on - instead
    of a plain whitespace split. The model's vocabulary only contains
    lowercased, letters-only, lemmatized tokens, so raw query words with
    capitals, punctuation, hyphens or inflections would otherwise be treated
    as out of vocabulary.

    Args:
        words: The term to embed, as a string of one or more words.

    Returns:
        The element-wise mean of the vectors of all tokens found in the
        model's vocabulary, or ``None`` when no token is found.
    """
    tokens, _ = preprocess_and_lemmatize(words)
    # get_vector yields None for out-of-vocabulary tokens; drop those.
    vectors = [vec for vec in map(get_vector, tokens) if vec is not None]
    if not vectors:
        return None
    return np.mean(vectors, axis=0)

def calculate_word2vec_similarity(term1, term2):
    """Cosine similarity between two terms using averaged Word2Vec vectors.

    Args:
        term1: First term (one or more words).
        term2: Second term (one or more words).

    Returns:
        The cosine similarity of the two averaged vectors, or ``0`` when at
        least one term has no vector.
    """
    first_vec = calculate_average_vector(term1)
    second_vec = calculate_average_vector(term2)

    # One or both terms do not have vectors
    if first_vec is None or second_vec is None:
        return 0

    # Each vector is wrapped in a list so it is passed as a single-row 2-D input.
    return cosine_similarity([first_vec], [second_vec])[0][0]

In [None]:
# Example usage: the second term has two words, so its vector is the average of its word vectors
print(calculate_word2vec_similarity("area", "region area"))

0.95567155


### Development Dataset

In [None]:
# Score every term pair of the development (validation) set with the Word2Vec method.
# The CSV has no header row, so columns are addressed by position:
# 0 = term pair id, 1 = first term, 2 = second term.
validation_set = pd.read_csv("./data/Task-1-validation-dataset.csv", header=None)

# One {'term_pair_id', 'similarity'} record per row of the validation set
results = [
    {'term_pair_id': row[0], 'similarity': calculate_word2vec_similarity(row[1], row[2])}
    for _, row in validation_set.iterrows()
]

# Write the records to CSV without index or header row
results_df = pd.DataFrame(results)
results_df.to_csv('10556516-Task1-method-b.csv', index=False, header=False)

### Test Dataset

In [None]:
# Score every term pair of the test set with the Word2Vec method.
# The CSV has no header row, so columns are addressed by position:
# 0 = term pair id, 1 = first term, 2 = second term.
# (The frame keeps the name `validation_set` although it holds the test data.)
validation_set = pd.read_csv("./data/Task-1-test-dataset1.csv", header=None)

# One {'term_pair_id', 'similarity'} record per row of the test set
results = [
    {'term_pair_id': row[0], 'similarity': calculate_word2vec_similarity(row[1], row[2])}
    for _, row in validation_set.iterrows()
]

# Write the records to CSV without index or header row
results_df = pd.DataFrame(results)
results_df.to_csv('10556516-Task1-method-b-test.csv', index=False, header=False)