# Dependencies

Necessary imports and installations.

In [None]:
import nltk
from nltk import *
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import re
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK resources used later in the notebook: the stop-word lists read
# via stopwords.words('english'), and the 'punkt' tokenizer data
# (presumably what sent_tokenize / word_tokenize load — TODO confirm for the
# installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


# Load the WikiText-103 corpus

In [None]:
from zipfile import ZipFile

# The handle is deliberately not called `zip`: binding that name at cell level
# would replace the builtin zip() for every later cell in the kernel.
with ZipFile("./data.zip", "r") as archive:
    # Unpack everything into the current working directory.
    archive.extractall()
    print('Done')

Done


In [None]:
# Read the entire corpus file into memory as a single string.
with open("./data/WikiText-103.txt", mode="r", encoding="utf-8") as corpus_file:
    corpus = corpus_file.read()

# Data Preprocessing

We break the corpus into sentences and clean each sentence. We train the model on words from each cleaned sentence.

In [None]:
# Split the raw corpus string into a list of sentence strings.
# NOTE(review): sent_tokenize is not imported by name in this notebook; it
# presumably comes from the `from nltk import *` wildcard import — confirm.
sentences = sent_tokenize(corpus)

In [None]:
from spellchecker import SpellChecker
spell = SpellChecker()

# English stop words, filled in on first use. clean_sentences runs once per
# sentence, so the set is built a single time rather than on every call.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_sentences(sentence):
    """Normalise one sentence into a space-separated string of kept tokens.

    Steps, in order:
      1. delete every character that is not an ASCII letter or a space;
      2. lowercase the text and tokenise it with ``word_tokenize``;
      3. drop English stop words;
      4. keep only tokens for which ``token in spell`` is true, plus the
         literal token ``'unk'``.

    Args:
        sentence: raw sentence text.

    Returns:
        The surviving tokens joined by single spaces (``''`` if none survive).
    """
    # Remove punctuation/digits and do case folding
    letters_only = re.sub(r'[^ a-zA-Z]', '', sentence)
    word_tokens = word_tokenize(letters_only.lower())

    # Remove stop words, then keep valid English words and 'unk'
    stop_words = _english_stop_words()
    kept_tokens = [
        token
        for token in word_tokens
        if token not in stop_words and (token in spell or token == 'unk')
    ]

    return " ".join(kept_tokens)

In [None]:
# Sentence count, followed by the first ten raw sentences.
print(len(sentences))
print(sentences[:10])

1282834
['  \n = = Gameplay = = \n \n As with previous <unk> Chronicles games , Valkyria Chronicles III is a tactical role @-@ playing game where players take control of a military unit and take part in missions against enemy forces .', 'Stories are told through comic book @-@ like panels with animated character portraits , with characters speaking partially through voiced speech bubbles and partially through unvoiced text .', 'The player progresses through a series of linear missions , gradually unlocked as maps that can be freely scanned through and replayed as they are unlocked .', "The route to each story location on the map varies depending on an individual player 's approach : when one option is selected , the other is sealed off to the player .", 'Outside missions , the player characters rest in a camp , where units can be customized and character growth occurs .', 'Alongside the main story missions are character @-@ specific sub missions relating to different squad members .', 

In [None]:
# Clean every sentence; the result keeps the same order and length as `sentences`.
cleaned_sentences = list(map(clean_sentences, sentences))

In [None]:
# Show the first three cleaned sentences.
print("Cleaned Sentences:", cleaned_sentences[:3])

Cleaned Sentences: ['previous unk chronicles games chronicles tactical role playing game players take control military unit take part missions enemy forces', 'stories told comic book like panels animated character portraits characters speaking partially voiced speech bubbles partially unvoiced text', 'player progresses series linear missions gradually unlocked maps freely scanned replayed unlocked']


In [None]:
# Turn each cleaned sentence string into its list of whitespace-separated tokens.
input_sent = list(map(str.split, cleaned_sentences))

In [None]:
# Show the first three token lists.
print('Input sentences:', input_sent[:3])

Input sentences: [['previous', 'unk', 'chronicles', 'games', 'chronicles', 'tactical', 'role', 'playing', 'game', 'players', 'take', 'control', 'military', 'unit', 'take', 'part', 'missions', 'enemy', 'forces'], ['stories', 'told', 'comic', 'book', 'like', 'panels', 'animated', 'character', 'portraits', 'characters', 'speaking', 'partially', 'voiced', 'speech', 'bubbles', 'partially', 'unvoiced', 'text'], ['player', 'progresses', 'series', 'linear', 'missions', 'gradually', 'unlocked', 'maps', 'freely', 'scanned', 'replayed', 'unlocked']]


# Dense static representation of words using Word2Vec



## Model Training

We train our model for 10 epochs. For reproducibility, the number of workers has been fixed to 1.

In [None]:
# Train Word2Vec on the tokenised sentences. Only the hyper-parameters listed
# here are set explicitly; every other option is left at the library default.
model = Word2Vec(
    sentences=input_sent,
    vector_size=100,
    window=15,
    min_count=1,
    workers=1,
    epochs=10,
)

### Vocabulary Size

In [None]:
# Number of entries in the trained model's word vectors, i.e. the vocabulary size.
print(len(model.wv))

60866


In [None]:
# Cosine similarity between two vectors.
# NOTE: this definition shadows the `cosine_similarity` imported from
# sklearn.metrics.pairwise in the imports cell; later cells use this version.
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Args:
        vec1: first vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        vec2: second vector, compatible with ``vec1`` for ``np.dot``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or the integer ``0`` when
        either vector has zero length (the ratio would be undefined).
    """
    numerator = np.dot(vec1, vec2)
    magnitudes = (np.linalg.norm(vec1), np.linalg.norm(vec2))

    # A zero-length vector has no direction; report "no similarity".
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0

    return numerator / (magnitudes[0] * magnitudes[1])

### Handling multiwords and calculating similarity scores


For multi-word expressions, we take the mean of the word vectors. For example, for 'big data' we look up the vectors for 'big' and 'data' and average them.

If a word is out of vocabulary (OOV), we assign it the vector for 'unk'.

In [None]:
def oov_word(word_list):
    """Look up one embedding per token, using the 'unk' vector for unknown tokens.

    Args:
        word_list: iterable of token strings.

    Returns:
        A list with one vector from ``model.wv`` per input token, in input
        order. Tokens absent from ``model.wv`` are represented by
        ``model.wv['unk']``.
    """
    return [
        model.wv[token] if token in model.wv else model.wv['unk']
        for token in word_list
    ]


def similarity_scores(pairs):
    """Score each pair of words/phrases by cosine similarity of their embeddings.

    A phrase is split on whitespace and represented by the mean of its token
    vectors; tokens missing from the model are replaced by the 'unk' vector
    (see ``oov_word``). Phrases are lowercased before lookup because the model
    was trained on case-folded text, so a capitalised query could otherwise
    never match the vocabulary.

    Args:
        pairs: iterable of indexable items whose elements 0 and 1 are the two
            phrases to compare.

    Returns:
        A list with one score per pair, in input order. The score is NaN when
        either phrase is missing (not a string, e.g. an empty CSV cell) or
        contains no tokens, since no mean vector can be formed.
    """
    similarity_score_list = []
    for pair in pairs:
        # Handling multi-words by breaking them up.
        word1_list = _phrase_tokens(pair[0])
        word2_list = _phrase_tokens(pair[1])

        # No tokens means no vector to average: record an undefined score
        # instead of taking the mean of an empty list.
        if not word1_list or not word2_list:
            similarity_score_list.append(np.nan)
            continue

        embedding_vector1 = np.mean(oov_word(word1_list), axis=0)
        embedding_vector2 = np.mean(oov_word(word2_list), axis=0)

        similarity_score_list.append(
            cosine_similarity(embedding_vector1, embedding_vector2)
        )

    return similarity_score_list


def _phrase_tokens(phrase):
    """Return the lowercase whitespace-separated tokens of ``phrase`` ([] if not a string)."""
    if not isinstance(phrase, str):
        return []
    return phrase.lower().split()

### Experimenting on test/example file.

In [20]:
# Step 1: Read the CSV file (it has no header row)
input_csv = './data/CW-1-testdata.csv'
df = pd.read_csv(input_csv, header=None)

# Step 2: Keep the first three columns for the output file
new_df = df.iloc[:, :3].copy()

# Step 3: Build [column 1, column 2] pairs row by row and score them
pairs = df[[1, 2]].values.tolist()
similarity_score_list = similarity_scores(pairs)
new_df['similarity_score'] = similarity_score_list

# Step 4: Write the new DataFrame to a new CSV file
output_csv = './data/11098060_task2_results.csv'
new_df.to_csv(output_csv, index=False, header=False)

print(f"New CSV file with similarity scores saved as {output_csv}")

New CSV file with similarity scores saved as ./data/11098060_task2_results.csv
