In [82]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# Fetch the NLTK resources this notebook relies on; packages that are already
# present are simply reported as up to date.
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Kept as the cell's final bare expression so its boolean status is still displayed.
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [83]:
# pandas is imported here because this is the first cell that uses it; without
# this import a fresh "Restart & Run All" stops on this cell with a NameError.
import pandas as pd

# Load the training dataset
training_dataset_path = '/content/drive/MyDrive/Colab Notebooks/Training-dataset.csv'
training_df = pd.read_csv(training_dataset_path)

# Load the validation dataset; header=None makes pandas label the columns 0, 1, 2, ...
validation_dataset_path = './data/Task-1-validation-dataset.csv'
validation_df = pd.read_csv(validation_dataset_path, header=None)

# Load the test dataset
# NOTE(review): unlike the validation file this one is read with the default
# header handling, so its first line becomes the column labels. Confirm the
# file really has a header row; if not, the first term pair is silently dropped.
test_dataset_path = "./data/Task-1-test-dataset2.csv"
test_df = pd.read_csv(test_dataset_path)

# Preview the first validation pairs
print(validation_df.head())


#load evaluation script
#%load '/content/drive/MyDrive/Colab Notebooks/task1_eval_script_student_version.py'


   0        1           2     3
0  1   absorb       learn  5.48
1  2   absorb    withdraw  2.97
2  3  achieve  accomplish  8.57
3  4  achieve         try  4.42
4  6  acquire         get  8.82


# Preprocessing the data

In [84]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Filled on first use so the stop-word list is read from NLTK once, not once per document.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Tokenize, lower-case and stop-word-filter one document.

    Parameters
    ----------
    text : str
        Raw document text. Anything that is not a string (for example the NaN
        pandas uses for a missing cell) is treated as an empty document.

    Returns
    -------
    list of str
        Lower-cased tokens with English stop words removed, in their original order.
    """
    # Missing values would otherwise crash the tokenizer in the middle of `.apply`.
    if not isinstance(text, str):
        return []

    stop_words = _english_stop_words()

    # Case folding happens before the stop-word check, so capitalised stop
    # words are removed as well.
    lowered_tokens = (token.lower() for token in word_tokenize(text))
    return [token for token in lowered_tokens if token not in stop_words]

# Tokenize and clean every synopsis, storing the token lists next to the raw text
training_df["processed_text"] = training_df['plot_synopsis'].apply(preprocess_text)

# Gather the tokens of all documents into one flat list
all_words = [
    token
    for token_list in training_df['processed_text']
    for token in token_list
]

# Deduplicate to obtain the vocabulary; its cardinality is the vocabulary size
unique_words = set(all_words)
vocabulary_size = len(unique_words)

print(f"The vocabulary size is: {vocabulary_size}")

The vocabulary size is: 129767


# TF-IDF implementation

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Rebuild one whitespace-separated string per document from its token list,
# which is the text form handed to the vectorizer below
combined_text = training_df['processed_text'].map(" ".join).tolist()

# Create a TF-IDF vectorizer with default settings, then fit it on the corpus
# and transform the corpus into its TF-IDF representation in one step
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_text)


# TF-IDF test


In [86]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd


# Score every test term pair by the cosine similarity of the two terms'
# TF-IDF columns (one weight per training document).
# Requires `tfidf_vectorizer` and `tfidf_matrix` from the TF-IDF cell and
# `test_df` from the data-loading cell.

# Similarity assigned when either term is missing from the TF-IDF vocabulary
OOV_SIMILARITY = 0.55

# term -> column index; a dict lookup replaces scanning the whole feature-name
# array several times for every pair
vocabulary = tfidf_vectorizer.vocabulary_

# CSC layout makes extracting a single term column cheap
tfidf_columns = tfidf_matrix.tocsc()

similarities = []

# Columns are taken by position (pair id, first term, second term).
# `test_df` is read with a header row, so its column labels are not 0, 1, 2 and
# integer keys on a row only worked through pandas' deprecated positional fallback.
for term_pair_id, term1, term2 in test_df.iloc[:, :3].itertuples(index=False, name=None):
    term1_index = vocabulary.get(term1)
    term2_index = vocabulary.get(term2)

    if term1_index is not None and term2_index is not None:
        # Transpose each (n_documents, 1) column into a single row vector
        term1_vector = tfidf_columns[:, term1_index].T
        term2_vector = tfidf_columns[:, term2_index].T
        similarity_score = cosine_similarity(term1_vector, term2_vector)[0][0]
    else:
        similarity_score = OOV_SIMILARITY

    similarities.append((term_pair_id, similarity_score))

# Two unnamed columns: term pair id and similarity score
formatted_similarities_df = pd.DataFrame(similarities)

# Save to CSV without the header and index
csv_file_path = "./data/tfidf_results.csv"
formatted_similarities_df.to_csv(csv_file_path, header=False, index=False, float_format='%.6f')


# TF-IDF validation

In [87]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd


# Score every validation term pair by the cosine similarity of the two terms'
# TF-IDF columns (one weight per training document).
# Requires `tfidf_vectorizer` and `tfidf_matrix` from the TF-IDF cell and
# `validation_df` from the data-loading cell.

# Similarity assigned when either term is missing from the TF-IDF vocabulary
OOV_SIMILARITY = 0.55

# term -> column index; a dict lookup replaces scanning the whole feature-name
# array several times for every pair
vocabulary = tfidf_vectorizer.vocabulary_

# CSC layout makes extracting a single term column cheap
tfidf_columns = tfidf_matrix.tocsc()

similarities = []

# The first three columns are, in order: pair id, first term, second term
for term_pair_id, term1, term2 in validation_df.iloc[:, :3].itertuples(index=False, name=None):
    term1_index = vocabulary.get(term1)
    term2_index = vocabulary.get(term2)

    if term1_index is not None and term2_index is not None:
        # Transpose each (n_documents, 1) column into a single row vector
        term1_vector = tfidf_columns[:, term1_index].T
        term2_vector = tfidf_columns[:, term2_index].T
        similarity_score = cosine_similarity(term1_vector, term2_vector)[0][0]
    else:
        similarity_score = OOV_SIMILARITY

    similarities.append((term_pair_id, similarity_score))

# Two unnamed columns: term pair id and similarity score
formatted_similarities_df = pd.DataFrame(similarities)

# Save to CSV without the header and index
csv_file_path = "./data/tfidf_validation_results.csv"
formatted_similarities_df.to_csv(csv_file_path, header=False, index=False, float_format='%.6f')


In [None]:
# Run the evaluation script as a shell command, passing the TF-IDF validation
# predictions file first and the validation dataset second.
!python './data/task1_eval_script_student_version.py' "./data/tfidf_validation_results.csv" "./data/Task-1-validation-dataset.csv"


Pre-process the data again, this time adding lemmatization.


In [89]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Make sure the tokenizer, stop-word and WordNet resources are available
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Filled on first use so the stop-word set and the lemmatizer are built once,
# not once per document.
_PREPROCESS_RESOURCES = {}


def _lemmatization_resources():
    """Return ``(stop_words, lemmatizer)``, building both on first use only."""
    if not _PREPROCESS_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Stored together so a failure above never leaves a half-filled cache.
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Tokenize, lower-case, stop-word-filter and lemmatize one document.

    Parameters
    ----------
    text : str
        Raw document text. Anything that is not a string (for example the NaN
        pandas uses for a missing cell) is treated as an empty document.

    Returns
    -------
    list of str
        Lemmatized, lower-cased tokens with English stop words removed, in
        their original order.
    """
    # Missing values would otherwise crash the tokenizer in the middle of `.apply`.
    if not isinstance(text, str):
        return []

    stop_words, lemmatizer = _lemmatization_resources()

    # Case folding happens before the stop-word check, so capitalised stop
    # words are removed as well.
    lowered_tokens = (token.lower() for token in word_tokenize(text))

    # lemmatize() is called without a part-of-speech argument, so NLTK's
    # default tag is applied to every token.
    return [lemmatizer.lemmatize(token) for token in lowered_tokens if token not in stop_words]

# Recompute the token lists with the lemmatizing preprocess_text defined above,
# replacing the contents of the existing 'processed_text' column
training_df["processed_text"] = training_df['plot_synopsis'].map(preprocess_text)

# The vocabulary size can then be calculated in the same way as before


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Word2Vec Test



In [91]:
import pandas as pd
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine



# Train Word2Vec on the preprocessed synopses, then score every test term pair
# by the cosine similarity of the two terms' embeddings.
# Requires `training_df['processed_text']` and `test_df` from earlier cells.

# Similarity assigned when either term is missing from the Word2Vec vocabulary
OOV_SIMILARITY = 0.55

# One token list per synopsis
sentences = training_df['processed_text'].tolist()

# Train a Word2Vec model
# NOTE(review): with workers=4 repeated runs are not expected to give identical
# vectors; see the gensim reproducibility notes if exact repeatability matters.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

known_terms = model.wv.key_to_index
similarities = []

# Columns are taken by position (pair id, first term, second term).
# `test_df` is read with a header row, so its column labels are not 0, 1, 2 and
# integer keys on a row only worked through pandas' deprecated positional fallback.
for term_pair_id, term1, term2 in test_df.iloc[:, :3].itertuples(index=False, name=None):
    if term1 in known_terms and term2 in known_terms:
        # scipy's `cosine` is a distance, so similarity is its complement
        similarity_score = 1 - cosine(model.wv[term1], model.wv[term2])
    else:
        similarity_score = OOV_SIMILARITY

    similarities.append((term_pair_id, float(similarity_score)))

# Two unnamed columns: term pair id and similarity score
formatted_similarities_df = pd.DataFrame(similarities)

# Save to CSV without the header and index, with scores written to six decimals
csv_file_path = "./data/Word2Vec_results.csv"
formatted_similarities_df.to_csv(csv_file_path, header=False, index=False, float_format='%.6f')


# Word2Vec validation

In [92]:
import pandas as pd
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine

# Score every validation term pair with the Word2Vec model trained in the
# Word2Vec test cell.
#
# The `model` from that cell is reused on purpose instead of re-running the
# preprocessing and training a second model: a separately trained model is not
# guaranteed to have the same vectors, so its validation score would not
# describe the model that produced ./data/Word2Vec_results.csv. Reuse also
# skips a second pass over the corpus.
# Requires `model` and `validation_df` from earlier cells.

# Similarity assigned when either term is missing from the Word2Vec vocabulary
OOV_SIMILARITY = 0.55

known_terms = model.wv.key_to_index
similarities = []

# The first three columns are, in order: pair id, first term, second term
for term_pair_id, term1, term2 in validation_df.iloc[:, :3].itertuples(index=False, name=None):
    if term1 in known_terms and term2 in known_terms:
        # scipy's `cosine` is a distance, so similarity is its complement
        similarity_score = 1 - cosine(model.wv[term1], model.wv[term2])
    else:
        similarity_score = OOV_SIMILARITY

    similarities.append((term_pair_id, float(similarity_score)))

# Two unnamed columns: term pair id and similarity score
formatted_similarities_df = pd.DataFrame(similarities)

# Save to CSV without the header and index, with scores written to six decimals
csv_file_path = "./data/Word2Vec_validation_results.csv"
formatted_similarities_df.to_csv(csv_file_path, header=False, index=False, float_format='%.6f')


In [None]:
# Run the evaluation script as a shell command, passing the Word2Vec validation
# predictions file first and the validation dataset second.
!python './data/task1_eval_script_student_version.py' "./data/Word2Vec_validation_results.csv" "./data/Task-1-validation-dataset.csv"


