## Set torch.device to CUDA

In [1]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Using GPU.")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

CUDA is available. Using GPU.


## Download NLTK

In [2]:
import nltk
import os
CUSTOM_DIR = './nltk_data/'

nltk.data.path.append(CUSTOM_DIR)

resources = [
    ('reuters.zip', os.path.join(CUSTOM_DIR, 'corpora', 'reuters')),
    ('punkt.zip', os.path.join(CUSTOM_DIR, 'tokenizers', 'punkt')),
    ('punkt_tab.zip', os.path.join(CUSTOM_DIR, 'tokenizers', 'punkt_tab')),
    ('stopwords.zip', os.path.join(CUSTOM_DIR, 'corpora', 'stopwords')),
]

for resource, path in resources:
    try:
        nltk.data.find(resource)
    except LookupError:
        if not os.path.exists(path):
            nltk.download(resource.split('.')[0], download_dir=CUSTOM_DIR)


[nltk_data] Downloading package reuters to ./nltk_data/...
[nltk_data]   Package reuters is already up-to-date!


In [3]:
from nltk.corpus import reuters

category = "trade"
category_fileids = reuters.fileids(category)
category_docs = [reuters.raw(fileids) for fileids in category_fileids]

print(f"# of documents in category '{category}': {len(category_docs)}")
print(f"1st document:\n{category_docs[0][:500]}")

# of documents in category 'trade': 485
1st document:
ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      They told Reuter correspondents in Asian capitals a U.S.
  Move against Japan might boost protectionist sentiment in the
  U.S. And lead to curbs on American imports of their products.
      But some exporters said that while the conflict wo


In [4]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    return tokens

In [5]:
preprocessed_docs = [preprocess_text(doc) for doc in category_docs]
preprocessed_docs

[['asian',
  'exporters',
  'fear',
  'damage',
  'u.s.-japan',
  'rift',
  'mounting',
  'trade',
  'friction',
  'u.s.',
  'japan',
  'raised',
  'fears',
  'among',
  'many',
  'asia',
  "'s",
  'exporting',
  'nations',
  'row',
  'could',
  'inflict',
  'far-reaching',
  'economic',
  'damage',
  'businessmen',
  'officials',
  'said',
  'told',
  'reuter',
  'correspondents',
  'asian',
  'capitals',
  'u.s.',
  'move',
  'japan',
  'might',
  'boost',
  'protectionist',
  'sentiment',
  'u.s.',
  'lead',
  'curbs',
  'american',
  'imports',
  'products',
  'exporters',
  'said',
  'conflict',
  'would',
  'hurt',
  'long-run',
  'short-term',
  'tokyo',
  "'s",
  'loss',
  'might',
  'gain',
  'u.s.',
  'said',
  'impose',
  '300',
  'mln',
  'dlrs',
  'tariffs',
  'imports',
  'japanese',
  'electronics',
  'goods',
  'april',
  '17',
  'retaliation',
  'japan',
  "'s",
  'alleged',
  'failure',
  'stick',
  'pact',
  'sell',
  'semiconductors',
  'world',
  'markets',
  'cost

In [6]:
len(preprocessed_docs)

485

In [7]:
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(preprocessed_docs)))
print(vocab)



In [8]:
word2index = {w: i for i, w in enumerate(vocab)}
print(word2index)



In [9]:
index2word = {v:k for k, v in word2index.items()} 
print(index2word)



In [10]:
voc_size = len(vocab)
print(voc_size)

8163


## Define Helper Functions

In [11]:
import numpy as np
from scipy import spatial
from scipy.stats import spearmanr


def cos_sim(a, b):
    return 1 - spatial.distance.cosine(
        a, b
    )  # distance = 1 - similarlity, because scipy only gives distance


def load_specific_categories(file_path, semantic_category, syntactic_category):
    semantic = []
    syntactic = []
    current_group = None

    with open(file_path, "r") as f:
        for line in f:
            if line.startswith(":"):
                if semantic_category in line.lower():
                    current_group = semantic
                elif syntactic_category in line.lower():
                    current_group = syntactic
                else:
                    current_group = None
            elif current_group is not None:
                words = line.strip().split()
                if len(words) == 4:
                    current_group.append(words)

    return semantic, syntactic


def find_closest_word(vec, embeddings, exclude_ids):
    max_similarity = -float("inf")
    best_idx = -1

    for idx, emb in enumerate(embeddings):
        if idx in exclude_ids:
            continue
        similarity = cos_sim(vec, emb)
        if similarity > max_similarity:
            max_similarity = similarity
            best_idx = idx

    return best_idx


def evaluate_analogies(analogy_data, word_to_idx, embeddings):
    correct = 0
    total = 0

    for word1, word2, word3, word4 in analogy_data:
        if all(word in word_to_idx for word in [word1, word2, word3, word4]):
            idx1 = word_to_idx[word1]
            idx2 = word_to_idx[word2]
            idx3 = word_to_idx[word3]
            idx4 = word_to_idx[word4]

            vec = embeddings[idx2] - embeddings[idx1] + embeddings[idx3]
            predicted_idx = find_closest_word(vec, embeddings, {idx1, idx2, idx3})
            if predicted_idx == idx4:
                correct += 1
            total += 1

    return correct / total if total > 0 else 0


def get_query_vector(query_word, model, word2index):
    if query_word not in word2index:
        raise ValueError(f"Word '{query_word}' not in vocabulary.")

    word_idx = word2index[query_word]
    query_vector = model.embedding_v.weight[word_idx].data.cpu().numpy()
    return query_vector


def get_corpus_vectors(model):
    return model.embedding_v.weight.data.cpu().numpy()


def compute_top_k_dot_product(query_vector, corpus_vectors, k=10):
    # get dot product between the query and all corpus vectors
    dot_products = np.dot(corpus_vectors, query_vector)

    # get the top k indices and scores
    top_k_indices = np.argsort(dot_products)[-k:][::-1]
    top_k_scores = dot_products[top_k_indices]

    return top_k_indices.tolist(), top_k_scores.tolist()


def load_wordsim353(file_path):
    word_pairs = []
    human_scores = []

    with open(file_path, "r") as f:
        next(f)
        for line in f:
            word1, word2, score = line.strip().split()
            word_pairs.append((word1, word2))
            human_scores.append(float(score))

    return word_pairs, human_scores


def calculate_model_similarity(word_pairs, model, word_to_idx):
    model_scores = []
    embeddings = model.embedding_v.weight.data.cpu().numpy()

    for word1, word2 in word_pairs:
        if word1 in word_to_idx and word2 in word_to_idx:
            idx1 = word_to_idx[word1]
            idx2 = word_to_idx[word2]
            dot_product = np.dot(embeddings[idx1], embeddings[idx2])
            model_scores.append(dot_product)
        else:
            model_scores.append(None)  # Handle OOV (out-of-vocabulary) words
    return model_scores


def compute_spearman_correlation(human_scores, model_scores):
    valid_scores = [(h, m) for h, m in zip(human_scores, model_scores) if m is not None]
    filtered_human_scores, filtered_model_scores = zip(*valid_scores)

    correlation, _ = spearmanr(filtered_human_scores, filtered_model_scores)
    return correlation

def compute_mse(human_scores, model_scores):
    #remove null values from model_scores
    valid_scores = [(h, m) for h, m in zip(human_scores, model_scores) if m is not None]
    filtered_human_scores, filtered_model_scores = zip(*valid_scores)

    mse = np.mean((np.array(filtered_model_scores) - np.array(filtered_human_scores)) ** 2)
    return mse

def compute_average_human_score(human_scores):
    return sum(human_scores) / len(human_scores)

In [12]:
dataset_path = 'word-analogies.txt'
semantic_category = 'capital-common-countries'
syntactic_category = 'past-tense'
semantic, syntactic = load_specific_categories(dataset_path, semantic_category, syntactic_category)
embedding_size = 2

## Word2Vec (Skipgram)

In [13]:
from torch import nn

class Skipgram(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)


    def forward(self, center_words, target_words, all_vocabs):
        center_embeds = self.embedding_v(center_words)  # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words)  # [batch_size, window_size, emb_size]
        all_embeds = self.embedding_u(all_vocabs)  # [batch_size, voc_size, emb_size]

        # [batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, window_size]
        scores = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)

        # [batch_size, voc_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, voc_size, 1] = [batch_size, voc_size]
        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)

        # scalar (loss must be scalar)
        nll = -torch.mean(
            torch.log(
                torch.exp(scores) / torch.sum(torch.exp(norm_scores), 1).unsqueeze(1)
            )
        )

        return nll  # negative log likelihood

In [14]:
word2vec_skipgram = Skipgram(vocab_size=voc_size, emb_size=embedding_size)
word2vec_skipgram.load_state_dict(torch.load('models/word2vec_skipgram.pth'))
word2vec_skipgram.to(device)
word2vec_skipgram.eval()
print("Model loaded successfully.")

  word2vec_skipgram.load_state_dict(torch.load('models/word2vec_skipgram.pth'))


Model loaded successfully.


### syntactic and semantic accuracy Word2Vec(Skipgram)

In [15]:
embeddings_word2vec_skipgram = word2vec_skipgram.embedding_v.weight.data.cpu().numpy()
semantic_accuracy_word2vec_skipgram = evaluate_analogies(semantic, word2index, embeddings_word2vec_skipgram)
syntactic_accuracy_word2vec_skipgram = evaluate_analogies(syntactic, word2index, embeddings_word2vec_skipgram)
print(f'semantic_accuracy_word2vec_skipgram: {semantic_accuracy_word2vec_skipgram}')
print(f'syntactic_accuracy_word2vec_skipgram: {syntactic_accuracy_word2vec_skipgram}')

semantic_accuracy_word2vec_skipgram: 0
syntactic_accuracy_word2vec_skipgram: 0.0


### correlation between the models’ dot product and the provided similarity metrics Word2Vec(Skipgram)

In [16]:
word_pairs, human_scores = load_wordsim353('wordsim353/wordsim_similarity_goldstandard.txt')
model_scores = calculate_model_similarity(word_pairs, word2vec_skipgram, word2index)
correlation = compute_spearman_correlation(human_scores, model_scores)
print(f"Word2Vec(Skipgram) spearman correlation: {correlation:.4f}")

Word2Vec(Skipgram) spearman correlation: -0.0982


The model barely ranks word pairs in the exact opposite order of human judgment or roughly no correlation, which means that the model’s rankings are completely unrelated to human judgments

### MSE Word2Vec (Skipgram)

In [17]:
mse = compute_mse(human_scores, model_scores)
print(f"Word2Vec (Skipgram) MSE: {mse}")

Word2Vec (Skipgram) MSE: 26.60272801674853


### Top 10 most similar context Word2Vec(Skipgram)

In [18]:
# get top 10 most similar context Word2Vec(Skipgram)

query_vector = get_query_vector('money', word2vec_skipgram, word2index)
corpus_vector = get_corpus_vectors(word2vec_skipgram)   

top_k_indices, top_k_scores = compute_top_k_dot_product(query_vector, corpus_vector, k=10)
for idx in top_k_indices:
    print(index2word[idx])

34.76
lesser
out-of-date
views
detail
attractive
existing
govenment
students
pass


## Word2Vec (Negative Sampling)

In [19]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, emb_size) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                    
    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, window_size, emb_size]
        neg_embeds    = -self.embedding_u(negative_words) # [batch_size, window_size * num_neg, emb_size]
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, window_size]
        
        negative_score = neg_embeds.bmm(center_embeds.transpose(1, 2))
        #[batch_size, k, emb_size] @ [batch_size, emb_size, 1] = [batch_size, k, 1]
        
        loss = self.logsigmoid(positive_score) + torch.sum(self.logsigmoid(negative_score), 1)
                
        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

In [20]:
word2vec_negativesampling = SkipgramNegSampling(vocab_size=voc_size, emb_size=embedding_size)
word2vec_negativesampling.load_state_dict(torch.load('models/word2vec_negativesampling.pth'))
word2vec_negativesampling.to(device)
word2vec_negativesampling.eval()
print("Model loaded successfully.")

Model loaded successfully.


  word2vec_negativesampling.load_state_dict(torch.load('models/word2vec_negativesampling.pth'))


### syntactic and semantic accuracy

In [21]:
# get semantic and syntacic accuracy for Word2Vec negative sampling

embeddings_word2vec_negativesampling = word2vec_negativesampling.embedding_v.weight.data.cpu().numpy()
semantic_accuracy_word2vec_negativesampling = evaluate_analogies(semantic, word2index, embeddings_word2vec_negativesampling)
syntactic_accuracy_word2vec_negativesampling = evaluate_analogies(syntactic, word2index, embeddings_word2vec_negativesampling)
print(f'semantic_accuracy_word2vec_negativesampling: {semantic_accuracy_word2vec_negativesampling}')
print(f'syntactic_accuracy_word2vec_negativesampling: {syntactic_accuracy_word2vec_negativesampling}')

semantic_accuracy_word2vec_negativesampling: 0
syntactic_accuracy_word2vec_negativesampling: 0.0


### correlation between the models’ dot product and the provided similarity metrics Word2Vec(Negative Sampling)

In [22]:
word_pairs, human_scores = load_wordsim353('wordsim353/wordsim_similarity_goldstandard.txt')
model_scores = calculate_model_similarity(word_pairs, word2vec_negativesampling, word2index)
correlation = compute_spearman_correlation(human_scores, model_scores)
print(f"Word2Vec (Negativesampling) spearman correlation: {correlation:.4f}")

Word2Vec (Negativesampling) spearman correlation: -0.0976


The model barely ranks word pairs in the exact opposite order of human judgment or roughly no correlation, which means that the model’s rankings are completely unrelated to human judgments

### MSE Word2Vec Negative Sampling

In [23]:
mse = compute_mse(human_scores, model_scores)
print(f"Word2Vec (Skipgram) MSE: {mse}")

Word2Vec (Skipgram) MSE: 29.36627531053967


### Top 10 most similar context Word2Vec(Negative Sampling)

In [24]:
# get top 10 most similar context for Word2Vec(Megative Sampling)

query_vector = get_query_vector('money', word2vec_negativesampling, word2index)
corpus_vector = get_corpus_vectors(word2vec_negativesampling)   

top_k_indices, top_k_scores = compute_top_k_dot_product(query_vector, corpus_vector, k=10)
for idx in top_k_indices:
    print(index2word[idx])

12.38
tables
scheme
choke
scored
proposals
respectively
rasmussen
featured
dealers


## Glove from Scratch

In [25]:
class GloVe(nn.Module):
    
    def __init__(self, vocab_size,embed_size):
        super(GloVe,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, embed_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, embed_size) # out embedding
        
        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)
        
    def forward(self, center_words, target_words, coocs, weighting):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, window_size, emb_size]
        
        center_bias = self.v_bias(center_words).squeeze(1) 
        target_bias = self.u_bias(target_words).squeeze(1)
        
        inner_product = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        
        #note that coocs already got log
        loss = weighting*torch.pow(inner_product +center_bias + target_bias - coocs, 2) # [batch_size, window_size]
        
        return torch.sum(loss) # scalar

In [26]:
glove = GloVe(vocab_size=voc_size, embed_size=embedding_size)
glove.load_state_dict(torch.load('models/glove_scratch.pth'))
glove.to(device)
glove.eval()
print("Model loaded successfully.")

Model loaded successfully.


  glove.load_state_dict(torch.load('models/glove_scratch.pth'))


### syntactic and semantic accuracy GloVe

In [27]:
embeddings_glove = glove.embedding_v.weight.data.cpu().numpy()
semantic_accuracy_glove = evaluate_analogies(semantic, word2index, embeddings_glove)
syntactic_accuracy_glove = evaluate_analogies(syntactic, word2index, embeddings_glove)
print(f'semantic_accuracy_glove: {semantic_accuracy_glove}')
print(f'syntactic_accuracy_glove: {syntactic_accuracy_glove}')

semantic_accuracy_glove: 0
syntactic_accuracy_glove: 0.0


### correlation between the models’ dot product and the provided similarity metrics Word2Vec(Skipgram)

In [28]:
word_pairs, human_scores = load_wordsim353('wordsim353/wordsim_similarity_goldstandard.txt')
model_scores = calculate_model_similarity(word_pairs, glove, word2index)
correlation = compute_spearman_correlation(human_scores, model_scores)
print(f"GloVe spearman correlation: {correlation:.4f}")

GloVe spearman correlation: -0.0921


The model barely ranks word pairs in the exact opposite order of human judgment or roughly no correlation, which means that the model’s rankings are completely unrelated to human judgments

### MSE GloVe

In [29]:
mse = compute_mse(human_scores, model_scores)
print(f"Word2Vec (Skipgram) MSE: {mse}")

Word2Vec (Skipgram) MSE: 28.35736177676441


### Top 10 most similar context GloVe

In [31]:
# get top 10 most similar context for GloVe

query_vector = get_query_vector('money', glove, word2index)
corpus_vector = get_corpus_vectors(glove)   

top_k_indices, top_k_scores = compute_top_k_dot_product(query_vector, corpus_vector, k=10)
for idx in top_k_indices:
    print(index2word[idx])

verging
1988.
wrapped
effective
dresdner
threatened
10.85
manufacturer
weinberger
83


In [32]:
y_true = compute_average_human_score(human_scores)
print(f'y_true Word Similarity: {y_true}')

y_true Word Similarity: 5.121584158415842
