# Dependencies

Necessary imports and installations.

In [None]:
import nltk
from nltk import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import re
import numpy as np
import string
import pandas as pd

In [None]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker

In [None]:
# Resources needed by stopwords.words(), word_tokenize() and sent_tokenize().
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt models from 'punkt_tab' rather than
# the pickled 'punkt' package; without it the tokenizers raise LookupError.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Load the WikiText-103 corpus

In [None]:
from zipfile import ZipFile

# Unpack the dataset archive into the current working directory.
# The handle is deliberately NOT called `zip`: in a notebook the name stays bound
# after the `with` block and would shadow the builtin zip() for every later cell.
with ZipFile("./data.zip", "r") as archive:
    archive.extractall()
    print('Done')

Done


In [None]:
# Read the whole corpus into memory as one string; it is split into sentences
# by sent_tokenize in the preprocessing section.
with open("./data/WikiText-103.txt", "r", encoding="utf-8") as file:
    corpus = file.read()

# Data Preprocessing

We break the corpus into sentences and clean each sentence. The cleaned sentences become our context for PPMI.

In [None]:
spell = SpellChecker()
# Built once when the cell runs instead of once per sentence: clean_sentences is
# called for every sentence of the corpus and the stop-word list never changes.
STOP_WORDS = frozenset(stopwords.words('english'))


def clean_sentences(sentence):
    """Normalise one raw sentence into a space-separated string of content words.

    Steps:
      1. Delete every character that is not an ASCII letter or a space
         (characters are removed, not replaced by a space, so e.g. the corpus
         placeholder ``<unk>`` becomes ``unk``).
      2. Lower-case and tokenise with ``word_tokenize``.
      3. Drop English stop words.
      4. Keep only tokens known to the spell checker, plus the literal ``'unk'``.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; the empty string when no
        token survives.
    """
    # Remove punctuation (and digits) and do case folding
    letters_only = re.sub(r'[^ a-zA-Z]', '', sentence)
    word_tokens = word_tokenize(letters_only.lower())
    # Remove stop words
    content_tokens = [token for token in word_tokens if token not in STOP_WORDS]
    # Keep valid English words and 'unk'
    english_words = [token for token in content_tokens if token in spell or token == 'unk']
    return " ".join(english_words)

In [None]:
# Split the raw corpus string into sentences.
# NOTE: sent_tokenize is only in scope through the `from nltk import *` star import.
sentences = sent_tokenize(corpus)

In [None]:
# Peek at the first raw sentence (as a one-element list) before any cleaning.
print(sentences[0:1])

['  \n = = Gameplay = = \n \n As with previous <unk> Chronicles games , Valkyria Chronicles III is a tactical role @-@ playing game where players take control of a military unit and take part in missions against enemy forces .']


In [None]:
# Normalise every sentence; an entry is the empty string when no token survives cleaning.
cleaned_sentences = [clean_sentences(sentence) for sentence in sentences]

In [None]:
# Show the first two cleaned sentences as a sanity check.
print(cleaned_sentences[0:2])

['previous unk chronicles games chronicles tactical role playing game players take control military unit take part missions enemy forces', 'stories told comic book like panels animated character portraits characters speaking partially voiced speech bubbles partially unvoiced text']


# Create the Vocabulary for PPMI

In [None]:
# Concatenate all cleaned sentences into one space-separated string.
cleaned_set = " ".join(cleaned_sentences)

In [None]:
# Whitespace-split the joined text into the flat list of all corpus tokens.
cleaned_set_tokenized = cleaned_set.split()

In [None]:
# Show the first five tokens of the flattened corpus.
print(cleaned_set_tokenized[0:5])

['previous', 'unk', 'chronicles', 'games', 'chronicles']


In [None]:
# The vocabulary is the set of distinct tokens that survived cleaning.
vocab = set(cleaned_set_tokenized)

In [None]:
# Vocabulary size |V|; every PPMI vector built below has one entry per vocabulary word.
print(len(vocab))

60866


# Sparse static representation using PPMI.

We construct a $k \times |V|$ PPMI matrix, where $k$ is the number of unique words in the input file and $|V|$ is the size of the vocabulary.

Acknowledgement: https://stackoverflow.com/questions/58701337/how-to-construct-ppmi-matrix-from-a-text-corpus

For multi-word expressions, we take the mean vector. For example, if the input is 'big data', we extract the vectors for 'big' and 'data' and take their mean.

If a word is out-of-vocabulary (OOV), we assign it the PPMI vector of 'unk'.

In [None]:
def co_occurrence_count_for_vocab(sentences, vocab, word_list, window_size):
    """Count windowed co-occurrences and unigram frequencies over ``sentences``.

    Parameters
    ----------
    sentences : iterable of str
        Sentences whose tokens are separated by whitespace; each sentence is
        lower-cased before splitting.
    vocab : container of str
        Only in-vocabulary tokens are used as target or context words.
    word_list : iterable of str or None
        Target words for which co-occurrence rows are collected. Restricting
        the rows to the words that are actually needed keeps the nested
        dictionary small. Pass ``None`` to collect rows for every
        in-vocabulary token.
    window_size : int
        Number of tokens considered on each side of the target token.

    Returns
    -------
    co_occurrences : defaultdict(str -> defaultdict(str -> int))
        ``co_occurrences[target][context]`` is how often ``context`` appeared
        within the window around ``target``.
    word_freq : defaultdict(str -> int)
        Frequency of every token seen (in vocabulary or not).
    total_words : int
        Total number of tokens seen.
    """
    co_occurrences = defaultdict(lambda: defaultdict(int))
    word_freq = defaultdict(int)
    total_words = 0
    # A frozenset gives O(1) membership tests even if the caller passed a list.
    targets = None if word_list is None else frozenset(word_list)

    # Count co-occurrences and frequencies
    for text in sentences:
        words = text.lower().split()
        total_words += len(words)

        for i, token in enumerate(words):
            # Frequencies are tracked for every token, not only the targets,
            # because PMI needs the frequency of the context words as well.
            word_freq[token] += 1
            if token not in vocab:
                continue
            if targets is not None and token not in targets:
                continue
            # Up to window_size tokens on the left and on the right of position i.
            context_words = words[max(0, i - window_size):i] + words[i + 1:i + 1 + window_size]
            for context_word in context_words:
                if context_word in vocab:
                    co_occurrences[token][context_word] += 1

    return co_occurrences, word_freq, total_words




def compute_ppmi_vector(word, vocab, co_occurrences, word_freq, total_words):
    """Return the PPMI vector of ``word`` with one entry per word in ``vocab``.

    For a context word ``v`` the entry is::

        PPMI(word, v) = max(0, log2(count(word, v) * N / (count(word) * count(v))))

    where ``N`` is ``total_words``. The entry is 0 when the two words never
    co-occur, when either frequency is 0 (so there is no division by zero),
    and for ``v == word``. No constant is added inside the logarithm: an
    offset would turn negative PMI values (ratio below 1) into positive ones.

    Parameters
    ----------
    word : str
        Target word.
    vocab : iterable of str
        Context words; the vector follows the iteration order of ``vocab``.
    co_occurrences : mapping str -> mapping str -> int
        Windowed co-occurrence counts.
    word_freq : mapping str -> int
        Unigram frequencies.
    total_words : int
        Total number of tokens in the corpus.

    Returns
    -------
    list of float
        PPMI values, one per word in ``vocab``.
    """
    # .get() rather than [] so that looking up an unseen word does not insert
    # an empty entry into a defaultdict and does not fail on a plain dict.
    freq_word = word_freq.get(word, 0)
    row = co_occurrences.get(word, {})

    ppmi_vector = []
    for v in vocab:
        joint_count = 0 if v == word else row.get(v, 0)
        denominator = freq_word * word_freq.get(v, 0)
        if joint_count == 0 or denominator == 0:
            # log(0) / division by zero: by convention PPMI is 0 here.
            ppmi_vector.append(0.0)
            continue

        pmi = np.log2((joint_count * total_words) / denominator)
        ppmi_vector.append(max(0.0, float(pmi)))

    return ppmi_vector




def cosine_similarity(vec1, vec2):
    """Cosine of the angle between two 1-D vectors.

    Returns 0 when either vector has zero Euclidean norm, otherwise the dot
    product divided by the product of the two norms.

    Note: this definition replaces the ``cosine_similarity`` imported from
    scikit-learn at the top of the notebook.
    """
    numerator = np.dot(vec1, vec2)
    lengths = (np.linalg.norm(vec1), np.linalg.norm(vec2))

    # Guard against dividing by zero for all-zero vectors.
    if all(length != 0 for length in lengths):
        return numerator / (lengths[0] * lengths[1])
    return 0




def _phrase_tokens(phrase):
    """Lower-case and whitespace-split a phrase; an empty phrase maps to ['unk']."""
    return phrase.lower().split() or ['unk']


def ppmi_matrix_for_pairs(sentences, vocab, pairs, window_size):
    """Score each phrase pair by the cosine similarity of its PPMI vectors.

    Parameters
    ----------
    sentences : iterable of str
        Cleaned, whitespace-tokenised sentences used to collect counts.
    vocab : collection of str
        Vocabulary; defines the dimensions of every PPMI vector.
    pairs : iterable of (str, str)
        Phrase pairs to compare. A phrase may contain several words, in which
        case its vector is the mean of the vectors of its words.
    window_size : int
        Context window size (tokens on each side) for co-occurrence counting.

    Returns
    -------
    list
        One cosine similarity per pair, in the order of ``pairs``.

    Notes
    -----
    * Phrases are lower-cased before lookup because the corpus tokens are
      lower-cased when the counts are collected.
    * A word that is not in ``vocab`` (and an empty phrase) is represented by
      the PPMI vector of ``'unk'``.
    """
    tokenized_pairs = [(_phrase_tokens(first), _phrase_tokens(second)) for first, second in pairs]

    # Collect all unique words from the pairs
    unique_words = {word for pair in tokenized_pairs for tokens in pair for word in tokens}
    unique_words.add('unk')

    # Co-occurrence counts, word frequencies, and total word count
    co_occurrences, word_freq, total_words = co_occurrence_count_for_vocab(sentences, vocab, unique_words, window_size)

    # The 'unk' vector is shared by every out-of-vocabulary word, so build it once.
    unk_vector = compute_ppmi_vector('unk', vocab, co_occurrences, word_freq, total_words)

    # PPMI vector for each unique word
    ppmi_vectors = {}
    for word in unique_words:
        if word == 'unk' or word not in vocab:
            ppmi_vectors[word] = unk_vector
        else:
            ppmi_vectors[word] = compute_ppmi_vector(word, vocab, co_occurrences, word_freq, total_words)

    # Calculate cosine similarities for all pairs
    similarity_score_list = []
    for tokens1, tokens2 in tokenized_pairs:
        # Multi-word phrases are represented by the mean of their word vectors.
        ppmi_vector_word1 = np.mean([ppmi_vectors[w] for w in tokens1], axis=0)
        ppmi_vector_word2 = np.mean([ppmi_vectors[w] for w in tokens2], axis=0)

        similarity_score_list.append(cosine_similarity(ppmi_vector_word1, ppmi_vector_word2))

    return similarity_score_list




# Experiments on the test/example file

In [28]:
# Read the CSV file (it has no header row, so columns are labelled 0, 1, 2, ...)
input_csv = './data/CW-1-testdata.csv'
df = pd.read_csv(input_csv, header=None)

# Extract the values of the first three columns
new_df = df.iloc[:, :3].copy()

# Columns 1 and 2 hold the two phrases to compare: one [phrase1, phrase2] entry per row
pairs = df[[1, 2]].values.tolist()

# One similarity score per row, in row order
new_df['similarity_score'] = ppmi_matrix_for_pairs(cleaned_sentences, vocab, pairs, window_size = 2)

# Write the new DataFrame to a new CSV file
output_csv = './data/11098060_task1_results.csv'
new_df.to_csv(output_csv, index=False, header=False)

print(f"New CSV file with similarity scores saved as {output_csv}")

New CSV file with similarity scores saved as ./data/11098060_task1_results.csv
