<h1>Importing necessary libraries</h1>

In [1]:
import os
import random
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import torch
import torch.nn as nn

nltk.download("brown")

from nltk.corpus import brown

[nltk_data] Downloading package brown to
[nltk_data]     /Users/anushkaojha/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [2]:
# Setting up device: use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [3]:
# Ensuring reproducibility: seed every random number generator the notebook draws from.
SEED = 42
random.seed(SEED)        # negative sampling draws with random.choice
np.random.seed(SEED)     # mini-batches are drawn with np.random.choice
torch.manual_seed(SEED)  # nn.Embedding weights are initialised from torch's RNG

In [4]:
# Loading the sentences of the "news" category from the Brown corpus
corpus = brown.sents(categories="news")

In [5]:
# Taking a small subset (the first 500 sentences) for faster training.
corpus = corpus[:500]


In [6]:
#Flattening the corpus for vocabulary building
def flatten(l):
    """Concatenate an iterable of iterables into one flat list, preserving order."""
    return [item for sublist in l for item in sublist]
news_flatten = flatten(corpus)

# len(news_flatten) counts every token occurrence, repeats included, so the
# label says "tokens"; the number of distinct tokens is the vocabulary size.
print("Number of tokens:", len(news_flatten))

Number of unique tokens: 11711


In [7]:
# Vocabulary: every distinct token, most frequent first (ties broken
# alphabetically), with the <UNK> placeholder appended as the last entry.
counts = Counter(news_flatten)
vocab = sorted(counts, key=lambda token: (-counts[token], token))
vocab += ["<UNK>"]


In [8]:
# Two-way lookup tables between tokens and their integer ids (position in vocab).
word2index = dict(zip(vocab, range(len(vocab))))
index2word = {position: token for token, position in word2index.items()}

vocab_size = len(vocab)
print("Vocab size (including <UNK>):", vocab_size)

Vocab size (including <UNK>): 2947


Creating function to generate random training data

In [9]:
def _build_skip_grams(corpus, window_size):
    """Enumerate every [center_id, context_id] pair that has a full window in `corpus`."""
    unk = word2index["<UNK>"]
    skip_grams = []

    for sent in corpus:
        # Start at index = window_size and stop at len(sent) - window_size so
        # that each center word has a full context window on both sides.
        for i in range(window_size, len(sent) - window_size):
            target = word2index.get(sent[i], unk)
            for w in range(1, window_size + 1):
                skip_grams.append([target, word2index.get(sent[i - w], unk)])  # left context
                skip_grams.append([target, word2index.get(sent[i + w], unk)])  # right context

    return skip_grams


def random_batch(batch_size, corpus, window_size=2):
    """Sample a random mini-batch of Skip-gram (center, context) id pairs.

    Args:
        batch_size: number of pairs to draw (without replacement).
        corpus: iterable of tokenised sentences (sequences of str).
        window_size: number of context words taken on each side of a center word.

    Returns:
        Two NumPy arrays of shape [batch_size, 1]: center ids and context ids.
        Ids come from the global `word2index`; tokens missing from it are
        mapped to the "<UNK>" id, as the other helpers in this notebook do.

    Raises:
        ValueError: from np.random.choice when `batch_size` is larger than the
            number of available pairs.
    """
    skip_grams = _build_skip_grams(corpus, window_size)

    # Passing the population size directly avoids materialising a range.
    random_index = np.random.choice(len(skip_grams), batch_size, replace=False)

    random_inputs = [[skip_grams[idx][0]] for idx in random_index]  # center words
    random_labels = [[skip_grams[idx][1]] for idx in random_index]  # context words

    return np.array(random_inputs), np.array(random_labels)

In [10]:
# Smoke test: draw one tiny batch and show the sampled center/context ids.
batch_size = 2
input_batch, target_batch = random_batch(batch_size, corpus, window_size=2)
print("Input batch:", input_batch)
print("Target batch:", target_batch)

Input batch: [[   3]
 [1763]]
Target batch: [[1]
 [7]]


<h1>Skipgram without negative sampling</h1>

In [11]:
class Skipgram(nn.Module):
    """Skip-gram with a full softmax over the vocabulary.

    Holds two embedding tables: `embedding_center` (v vectors) and
    `embedding_outside` (u vectors).
    """

    def __init__(self, vocab_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(vocab_size, emb_size)  # v vectors
        self.embedding_outside = nn.Embedding(vocab_size, emb_size)  # u vectors

    def forward(self, center, outside, all_vocab):
        """Return the mean negative log-likelihood of the batch (scalar tensor).

        center:  [batch_size, 1]
        outside: [batch_size, 1]
        all_vocab: [batch_size, vocab_size] (same vocab list repeated across batch)
        """
        # Get embeddings
        v = self.embedding_center(center)          # [B, 1, D]
        u_o = self.embedding_outside(outside)      # [B, 1, D]
        u_all = self.embedding_outside(all_vocab)  # [B, V, D]

        v_col = v.transpose(1, 2)                  # [B, D, 1]

        # Score of the observed outside word: u_o dot v
        # [B,1,D] @ [B,D,1] -> [B,1,1] -> squeeze -> [B,1]
        pos_score = u_o.bmm(v_col).squeeze(2)

        # Scores of every vocabulary word: u_w dot v
        # [B,V,D] @ [B,D,1] -> [B,V,1] -> squeeze -> [B,V]
        all_scores = u_all.bmm(v_col).squeeze(2)

        # log(exp(pos) / sum_w exp(score_w)) = pos - logsumexp(scores).
        # Working in log space avoids the inf/inf = NaN that the explicit
        # exp/sum/log form produces once scores grow large.
        log_normaliser = torch.logsumexp(all_scores, dim=1, keepdim=True)  # [B,1]

        # Negative log likelihood (scalar loss)
        loss = -torch.mean(pos_score - log_normaliser)
        return loss

In [12]:
# Training setup for the full-softmax Skip-gram model
import torch.optim as optim
batch_size = 2  # skip-gram pairs per update
embedding_size = 2  # dimensionality of each embedding vector
window_size = 2  # context words taken on each side of the center word
model = Skipgram(vocab_size, embedding_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

def prepare_sequence(seq, word2index):
    """Convert a sequence of tokens into a LongTensor of vocabulary ids.

    Tokens absent from `word2index` are replaced by the id stored under "<UNK>".
    """
    idxs = []
    for token in seq:
        token_id = word2index.get(token)
        if token_id is None:
            # The <UNK> entry is only looked up when a token is actually missing.
            token_id = word2index["<UNK>"]
        idxs.append(token_id)
    return torch.LongTensor(idxs)

In [13]:
# Ids of the whole vocabulary, expanded to one row per batch element so the
# full-softmax model can score every word against each center word.
all_vocabs = prepare_sequence(list(vocab), word2index) \
                .expand(batch_size, len(vocab)) \
                .to(device)

print("all_vocabs shape:", all_vocabs.shape)

all_vocabs shape: torch.Size([2, 2947])


In [14]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        (minutes, seconds) as ints, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

Training

In [15]:
import time

# NOTE: each "epoch" below is a single optimizer step on one random
# mini-batch, not a full pass over the corpus.
num_epochs = 5000

for epoch in range(num_epochs):

    start = time.time()

    # Get a random mini-batch of Skip-gram pairs
    # window_size is now configurable (Task 1 requirement)
    # (random_batch re-enumerates every pair of the corpus on each call)
    input_batch, target_batch = random_batch(
        batch_size, corpus, window_size=window_size
    )

    # Convert numpy arrays to torch tensors
    input_batch  = torch.LongTensor(input_batch).to(device)   # [batch_size, 1]
    target_batch = torch.LongTensor(target_batch).to(device)  # [batch_size, 1]

    # Zero gradients from previous step
    optimizer.zero_grad()

    # Forward pass: compute Skip-gram loss
    loss = model(input_batch, target_batch, all_vocabs)

    # Backpropagation
    loss.backward()

    # Update model parameters
    optimizer.step()

    end = time.time()

    # Elapsed wall-clock time of this single step, in milliseconds
    epoch_time_ms = (end - start) * 1000

    # Print training progress every 1000 epochs (same as professor);
    # the reported cost and time are those of the current step only.
    if (epoch + 1) % 1000 == 0:
        print(
            f"Epoch: {epoch + 1} | "
            f"cost: {loss.item():.6f} | "
            f"time: {epoch_time_ms:.2f} ms"
        )

Epoch: 1000 | cost: 8.358099 | time: 12.14 ms
Epoch: 2000 | cost: 7.793187 | time: 11.80 ms
Epoch: 3000 | cost: 7.661410 | time: 62.18 ms
Epoch: 4000 | cost: 6.302114 | time: 11.88 ms
Epoch: 5000 | cost: 7.772816 | time: 64.14 ms


In [16]:
# Peek at ten vocabulary entries (positions 20-29 of the frequency-sorted list).
print(vocab[20:30])


['is', 'as', 'has', 'it', 'not', 'will', 'at', 'with', 'an', 'his']


In [23]:
# Cosine Similarity for Skip-gram embeddings
from numpy import dot
from numpy.linalg import norm
def get_embed_skip_gram(word):
    """Return the full-softmax Skip-gram vector of `word` as a NumPy array.

    The vector is the mean of the word's center and outside embeddings taken
    from the global `model`; words missing from `word2index` use the "<UNK>" id.
    """
    token_id = word2index.get(word, word2index["<UNK>"])
    lookup = torch.LongTensor([token_id]).to(device)

    averaged = (model.embedding_center(lookup) + model.embedding_outside(lookup)) / 2.0
    return averaged.squeeze(0).detach().cpu().numpy()

def cos_sim(a, b):
    """Cosine similarity of two vectors: their dot product divided by the product of their norms."""
    scale = norm(a) * norm(b)
    return dot(a, b) / scale


In [24]:
# Look up the full-softmax Skip-gram vectors of three probe words.
election = get_embed_skip_gram("election")
vote = get_embed_skip_gram("vote")
campaign = get_embed_skip_gram("campaign")

In [25]:
# Pairwise cosine similarities; the self-similarity line is a sanity check.
print(f"election vs vote:        {cos_sim(election, vote):.4f}")
print(f"election vs campaign:   {cos_sim(election, campaign):.4f}")
print(f"election vs election:   {cos_sim(election, election):.4f}")

election vs vote:        0.4850
election vs campaign:   -0.0720
election vs election:   1.0000


saving the skipgram model without negative sampling

In [26]:
# Save the model
import pickle

# Neither torch.save nor open() creates missing parent directories.
os.makedirs('model', exist_ok=True)
torch.save(model.state_dict(), 'model/skipgram_model.pth')

# Use a context manager so the pickle file is flushed and closed.
with open('model/skipgram.pkl', 'wb') as pkl_file:
    pickle.dump(model, pkl_file)

<h1>Word2Vec (Negative Sampling)</h1>

In [27]:
# Building unigram table
# Z is the divisor applied to each smoothed unigram probability when the
# number of table slots per word is computed.
Z = 0.001

In [28]:
# `flatten` is already defined where the vocabulary is built, so it is reused
# here instead of being redefined.
word_count = Counter(flatten(corpus))
num_total_words = sum(word_count.values())

In [29]:
# Total number of token occurrences in the training subset.
num_total_words

11711

In [30]:
# Each vocabulary word receives int(P(w)^0.75 / Z) slots in the table; words
# whose scaled probability truncates to 0 receive none.
unigram_table = []
for token in vocab:
    unigram_prob = word_count[token] / max(1, num_total_words)
    slots = int((unigram_prob ** 0.75) / Z)
    unigram_table.extend([token] * slots)

print("Unigram table size:", len(unigram_table))

Unigram table size: 3623


In [31]:
# Inspect how many table slots each word received.
Counter(unigram_table)

Counter({'the': 120,
         ',': 87,
         '.': 85,
         'of': 75,
         'to': 65,
         'a': 52,
         'in': 49,
         'and': 46,
         '``': 33,
         'for': 33,
         'that': 33,
         "''": 33,
         'The': 28,
         'said': 27,
         'be': 25,
         'would': 24,
         'on': 24,
         'was': 23,
         'by': 22,
         'he': 21,
         'is': 21,
         'as': 18,
         'has': 17,
         'it': 15,
         'not': 15,
         'will': 14,
         'at': 14,
         'with': 14,
         'an': 13,
         'his': 13,
         'been': 13,
         'which': 12,
         'He': 12,
         'this': 11,
         '--': 11,
         'Mr.': 11,
         'more': 11,
         'have': 10,
         'who': 10,
         'from': 10,
         'President': 9,
         'administration': 9,
         'its': 9,
         'year': 9,
         'are': 9,
         'had': 9,
         'or': 9,
         'State': 9,
         'Texas': 9,
         'electi

In [32]:
def prepare_sequence(seq, word2index):
    """Convert a sequence of tokens into a LongTensor of vocabulary ids.

    Tokens absent from `word2index` are replaced by the id stored under "<UNK>".
    """
    fallback = word2index["<UNK>"]
    token_ids = []
    for token in seq:
        token_ids.append(word2index.get(token, fallback))
    return torch.LongTensor(token_ids)

def negative_sampling(targets, unigram_table, k):
    """Draw `k` negative word ids per target from the unigram table.

    Args:
        targets: LongTensor of target (context) ids, shape [B] or [B, 1].
        unigram_table: list of tokens; a token's frequency in the list sets its
            sampling probability. Every token must be a key of the global
            `word2index`.
        k: number of negatives per target.

    Returns:
        LongTensor of shape [B, k]; row i never contains target i.

    Raises:
        ValueError: if k > 0 and the table holds no word other than a target,
            a case in which rejection sampling could never finish.
    """
    targets_1d = targets.squeeze(1) if targets.dim() == 2 else targets

    # Distinct ids that can be drawn; only used to detect impossible requests.
    drawable = {word2index[token] for token in set(unigram_table)}

    neg_samples = []
    for target_index in targets_1d.tolist():
        if k > 0 and not (drawable - {target_index}):
            raise ValueError(
                "unigram_table contains no word other than the target; "
                "cannot draw negative samples"
            )

        nsample = []
        while len(nsample) < k:
            # Keep the id directly instead of converting id -> word -> id again.
            neg_idx = word2index[random.choice(unigram_table)]
            if neg_idx != target_index:
                nsample.append(neg_idx)
        neg_samples.append(nsample)

    # view() pins the [B, k] shape even when B or k is 0.
    return torch.LongTensor(neg_samples).view(len(neg_samples), k)

In [33]:
class SkipgramNegSampling(nn.Module):
    """Skip-gram trained with the negative-sampling objective.

    `embedding_v` holds center-word vectors and `embedding_u` holds
    outside-word vectors.
    """

    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)  # center embedding (v)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)  # outside embedding (u)
        self.logsigmoid = nn.LogSigmoid()

    def forward(self, center_words, target_words, negative_words):
        """Return the scalar loss -mean(log σ(u_o·v) + Σ_k log σ(-u_k·v)).

        The shape comments assume center_words/target_words are [B, 1] and
        negative_words is [B, K].
        """
        center_col = self.embedding_v(center_words).transpose(1, 2)   # [B, D, 1]
        pos_embeds = self.embedding_u(target_words)                    # [B, 1, D]
        # The minus sign is applied to the embeddings so that the score
        # below is already -u_neg · v_center.
        neg_embeds = -self.embedding_u(negative_words)                 # [B, K, D]

        pos_logit = torch.bmm(pos_embeds, center_col).squeeze(2)       # [B, 1]
        neg_logit = torch.bmm(neg_embeds, center_col)                  # [B, K, 1]

        pos_term = self.logsigmoid(pos_logit)                          # [B, 1]
        neg_term = torch.sum(self.logsigmoid(neg_logit), dim=1)        # [B, 1]

        return -torch.mean(pos_term + neg_term)

    def prediction(self, inputs):
        """Return the center-word embeddings of `inputs`."""
        return self.embedding_v(inputs)

Training

In [35]:
import random
# Hyper-parameters for the negative-sampling run
batch_size     = 64      # use 2 for debugging; increase for real training
embedding_size = 100     # 2 only for plotting; use 50/100/200 for better quality
window_size    = 2       # Task 1 default
num_neg        = 10      # k negative samples (prof uses 10)

model_neg = SkipgramNegSampling(vocab_size, embedding_size).to(device)
# NOTE: `optimizer` is rebound here and now updates model_neg's parameters.
optimizer = optim.Adam(model_neg.parameters(), lr=0.001)

# Each "epoch" is a single optimizer step on one random mini-batch.
num_epochs = 5000

for epoch in range(num_epochs):
    start = time.time()

    # sample skip-gram pairs
    input_batch, target_batch = random_batch(batch_size, corpus, window_size=window_size)

    # to tensors
    input_tensor  = torch.LongTensor(input_batch).to(device)   # [B,1]
    target_tensor = torch.LongTensor(target_batch).to(device)  # [B,1]

    # sample negatives (on CPU first, then move to device)
    negs = negative_sampling(target_tensor.cpu(), unigram_table, num_neg).to(device)  # [B,k]

    # Standard update: clear gradients, forward, backward, step
    optimizer.zero_grad()
    loss = model_neg(input_tensor, target_tensor, negs)
    loss.backward()
    optimizer.step()

    end = time.time()

    if (epoch + 1) % 1000 == 0:
        # printing ms because per-epoch can be < 1 sec
        print(f"Epoch: {epoch+1} | cost: {loss.item():.6f} | time: {(end-start)*1000:.2f} ms")

Epoch: 1000 | cost: 31.965076 | time: 13.34 ms
Epoch: 2000 | cost: 24.614790 | time: 13.19 ms
Epoch: 3000 | cost: 21.269592 | time: 13.15 ms
Epoch: 4000 | cost: 14.814827 | time: 70.79 ms
Epoch: 5000 | cost: 10.852766 | time: 68.66 ms


In [36]:
def get_embed_neg_sample(word):
    """Return the negative-sampling Skip-gram vector of `word` as a NumPy array.

    The vector is the mean of the word's `embedding_v` and `embedding_u` rows in
    the global `model_neg`; words missing from `word2index` use the "<UNK>" id.
    """
    token_id = word2index.get(word, word2index["<UNK>"])
    lookup = torch.LongTensor([token_id]).to(device)

    averaged = (model_neg.embedding_v(lookup) + model_neg.embedding_u(lookup)) / 2.0  # [1, D]
    return averaged.squeeze(0).detach().cpu().numpy()

In [37]:
# Look up the negative-sampling vectors of the same three probe words
# (this rebinds election / vote / campaign).
election = get_embed_neg_sample("election")
vote = get_embed_neg_sample("vote")
campaign = get_embed_neg_sample("campaign")

In [38]:
# Pairwise cosine similarities; the self-similarity line is a sanity check.
print(f"election vs vote:        {cos_sim(election, vote):.4f}")
print(f"election vs campaign:   {cos_sim(election, campaign):.4f}")
print(f"election vs election:   {cos_sim(election, election):.4f}")

election vs vote:        -0.0587
election vs campaign:   -0.0373
election vs election:   1.0000


In [39]:
# Saving the model
# Neither torch.save nor open() creates missing parent directories.
os.makedirs('model', exist_ok=True)
torch.save(model_neg.state_dict(), 'model/skipgram_neg_model.pth')

# Use a context manager so the pickle file is flushed and closed.
with open('model/skipgram_neg.pkl', 'wb') as pkl_file:
    pickle.dump(model_neg, pkl_file)

<h1>GloVe</h1>

In [40]:
from collections import Counter
#Counting unigram frequencies
# `flatten` is already defined where the vocabulary is built, so it is reused
# here instead of being redefined.
X_i = Counter(flatten(corpus))

In [41]:
#Building skip-grams and co-occurrence counts X_ik with dynamic window size
def build_cooccurrence(corpus, window_size=2):
    """Collect (center, context) word pairs and count how often each occurs.

    Only positions with `window_size` tokens on both sides are used as centers,
    so the first and last `window_size` tokens of a sentence are never centers.

    Returns:
        (Counter mapping (center, context) -> count, list of all pairs in
        the order they were generated).
    """
    skip_grams = []

    for sent in corpus:
        for pos in range(window_size, len(sent) - window_size):
            center = sent[pos]
            for offset in range(1, window_size + 1):
                # left neighbour first, then right neighbour, at the same distance
                skip_grams.extend(
                    ((center, sent[pos - offset]), (center, sent[pos + offset]))
                )

    return Counter(skip_grams), skip_grams

In [42]:
# Build the co-occurrence counts and the skip-gram pair list used by GloVe.
window_size = 2
X_ik_skipgram, skip_grams = build_cooccurrence(corpus, window_size=window_size)

In [43]:
#GloVe weighting function f(x)
def weighting_count(x_ij, x_max=100, alpha=0.75):
    """GloVe weighting f(x): (x / x_max) ** alpha below x_max, otherwise 1.0."""
    return (x_ij / x_max) ** alpha if x_ij < x_max else 1.0

X_ik = {}           # symmetric, add-one-smoothed co-occurrence counts
weighting_dic = {}  # f(X_ik) for the same keys

for (wi, wj), cnt in X_ik_skipgram.items():
    # (wi, wj) and (wj, wi) can both be present with different raw counts,
    # because sentence-edge tokens are never centers. Take the larger of the
    # two so the symmetric value does not depend on iteration order; assigning
    # each entry's own count would let the later entry overwrite the earlier.
    c = max(cnt, X_ik_skipgram.get((wj, wi), 0)) + 1
    X_ik[(wi, wj)] = c
    X_ik[(wj, wi)] = c

    w_val = weighting_count(c)
    weighting_dic[(wi, wj)] = w_val
    weighting_dic[(wj, wi)] = w_val

print(f"Built co-occurrence pairs: {len(X_ik)} (window_size={window_size})")

Built co-occurrence pairs: 30394 (window_size=2)


In [46]:
# -------------------------
import math
def random_batch_glove(batch_size, skip_grams, X_ik, weighting_dic, word2index):
    """Sample a random GloVe mini-batch from the skip-gram pair list.

    At most `batch_size` pairs are drawn without replacement; fewer when
    `skip_grams` is shorter than `batch_size`.

    Returns:
        Four NumPy arrays with one row per sampled pair: center ids, context
        ids, log co-occurrence counts (float32) and weighting values (float32).
        Pairs missing from `X_ik` use a count of 1; pairs missing from
        `weighting_dic` use weighting_count(1); unknown words use the "<UNK>" id.
    """
    unk_id = word2index["<UNK>"]

    n_pairs = len(skip_grams)
    picked = np.random.choice(n_pairs, min(batch_size, n_pairs), replace=False)

    center_ids, context_ids, log_coocs, weights = [], [], [], []
    for position in picked:
        wi, wj = skip_grams[position]

        center_ids.append([word2index.get(wi, unk_id)])
        context_ids.append([word2index.get(wj, unk_id)])

        # GloVe regresses on log(X_ij)
        log_coocs.append([math.log(X_ik.get((wi, wj), 1))])
        weights.append([weighting_dic.get((wi, wj), weighting_count(1))])

    return (
        np.array(center_ids),
        np.array(context_ids),
        np.array(log_coocs, dtype=np.float32),
        np.array(weights, dtype=np.float32),
    )

In [48]:
# Smoke test: draw one tiny GloVe batch and show its four arrays.
batch_size = 2
input_b, target_b, cooc_b, weight_b = random_batch_glove(
    batch_size, skip_grams, X_ik, weighting_dic, word2index
)

print("Input:", input_b)
print("Target:", target_b)
print("Cooc (log):", cooc_b)
print("Weighting:", weight_b)

Input: [[1946]
 [  53]]
Target: [[1]
 [0]]
Cooc (log): [[1.0986123]
 [2.1972246]]
Weighting: [[0.07208434]
 [0.16431677]]


In [49]:
class GloVe(nn.Module):
    """GloVe model: two embedding tables plus a scalar bias per word in each table."""

    def __init__(self, vocab_size, embed_size):
        super().__init__()
        self.embedding_v = nn.Embedding(vocab_size, embed_size)  # center
        self.embedding_u = nn.Embedding(vocab_size, embed_size)  # outside

        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)

    def forward(self, center_words, target_words, coocs, weighting):
        """Return the mean weighted squared error of the batch (scalar tensor).

        `coocs` is log(X_ij) and `weighting` is f(X_ij). The shape comments
        assume center_words/target_words are [B, 1].
        """
        v = self.embedding_v(center_words)                # [B,1,D]
        u = self.embedding_u(target_words)                # [B,1,D]
        b_center = self.v_bias(center_words).squeeze(1)   # [B,1]
        b_target = self.u_bias(target_words).squeeze(1)   # [B,1]

        dot_uv = torch.bmm(u, v.transpose(1, 2)).squeeze(2)  # [B,1]

        residual = dot_uv + b_center + b_target - coocs
        weighted_sq_err = weighting * torch.pow(residual, 2)

        # mean rather than sum, so losses are comparable across batch sizes
        return torch.mean(weighted_sq_err)

Training

In [50]:
# GloVe training setup

batch_size     = 10  # mini-batch size
embedding_size = 2   # small for visualization; increase later for real training

model_glove = GloVe(vocab_size, embedding_size).to(device)

# GloVe defines its own loss internally (weighted MSE),
# so NO external criterion is needed.
# NOTE: `optimizer` is rebound here and now updates model_glove's parameters.
optimizer = optim.Adam(model_glove.parameters(), lr=0.001)

In [51]:
# Each "epoch" is a single optimizer step on one random mini-batch.
num_epochs = 5000

for epoch in range(num_epochs):

    start = time.time()

    # Getting a random GloVe batch
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch_glove(
        batch_size,
        skip_grams,
        X_ik,
        weighting_dic,
        word2index
    )

    # Converting to tensors
    input_batch     = torch.LongTensor(input_batch).to(device)     # [B,1]
    target_batch    = torch.LongTensor(target_batch).to(device)    # [B,1]
    cooc_batch      = torch.FloatTensor(cooc_batch).to(device)     # [B,1] (log X_ij)
    weighting_batch = torch.FloatTensor(weighting_batch).to(device)# [B,1]

    # Standard update: clear gradients, forward, backward, step
    optimizer.zero_grad()
    loss = model_glove(input_batch, target_batch, cooc_batch, weighting_batch)
    loss.backward()
    optimizer.step()

    end = time.time()

    # ---- logging ----
    # The reported cost and time are those of the current step only.
    if (epoch + 1) % 1000 == 0:
        print(
            f"Epoch: {epoch + 1} | "
            f"cost: {loss.item():.6f} | "
            f"time: {(end - start) * 1000:.2f} ms"
        )

Epoch: 1000 | cost: 0.236655 | time: 1.56 ms
Epoch: 2000 | cost: 0.019608 | time: 1.54 ms
Epoch: 3000 | cost: 0.177698 | time: 1.51 ms
Epoch: 4000 | cost: 0.136542 | time: 1.52 ms
Epoch: 5000 | cost: 0.134668 | time: 1.55 ms


In [52]:
# save the model
# Neither torch.save nor open() creates missing parent directories.
os.makedirs('model', exist_ok=True)
torch.save(model_glove.state_dict(), 'model/glove_model.pth')
# save the model using pickle; the context manager flushes and closes the file
with open('model/glove.pkl', 'wb') as pkl_file:
    pickle.dump(model_glove, pkl_file)

In [53]:
def get_embed_glove(word):
    """Return the GloVe vector of `word` as a NumPy array.

    The vector is the mean of the word's `embedding_v` and `embedding_u` rows in
    the global `model_glove`; words missing from `word2index` use the "<UNK>" id.
    """
    token_id = word2index.get(word, word2index["<UNK>"])
    lookup = torch.LongTensor([token_id]).to(device)

    averaged = (model_glove.embedding_v(lookup) + model_glove.embedding_u(lookup)) / 2.0  # [1, D]
    return averaged.squeeze(0).detach().cpu().numpy()

In [54]:
# Look up the GloVe vectors of the same three probe words
# (this rebinds election / vote / campaign).
election = get_embed_glove("election")
vote = get_embed_glove("vote")
campaign = get_embed_glove("campaign")

In [55]:
# Pairwise cosine similarities; the self-similarity line is a sanity check.
print(f"election vs vote:        {cos_sim(election, vote):.4f}")
print(f"election vs campaign:   {cos_sim(election, campaign):.4f}")
print(f"election vs election:   {cos_sim(election, election):.4f}")

election vs vote:        0.2825
election vs campaign:   -0.5699
election vs election:   1.0000


In [56]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors

# Pretrained GloVe vectors, loaded from a path relative to the working directory.
# no_header=True is passed because the GloVe text file is loaded as word2vec
# format without the leading "count dim" header line.
glove_file = 'glove.6B/glove.6B.100d.txt'
model_genism = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [57]:
# Example: Word similarity (cosine similarity between two pretrained vectors)

similarity = model_genism.similarity('government', 'administration')
print(f"Similarity between 'government' and 'administration': {similarity:.4f}")

similarity = model_genism.similarity('election', 'vote')
print(f"Similarity between 'election' and 'vote': {similarity:.4f}")


# Example: Word analogy
# most_similar adds the `positive` vectors and subtracts the `negative` ones;
# result[0][0] is the word of the top-ranked (word, score) pair.
result = model_genism.most_similar(
    positive=['president', 'woman'],
    negative=['man'],
    topn=1
)
print("President - Man + Woman =", result[0][0])


result = model_genism.most_similar(
    positive=['government', 'state'],
    negative=['country'],
    topn=1
)
print("Government - Country + State =", result[0][0])

Similarity between 'government' and 'administration': 0.7937
Similarity between 'election' and 'vote': 0.8465
President - Man + Woman = vice
Government - Country + State = federal


<h1>Task 2. Model Comparison and Analysis</h1>

1. Comparing Skip-gram, Skip-gram negative sampling, GloVe models on training loss, training time

| Model                         | Training Loss (Epoch 5000) | Training Time per Epoch |
|------------------------------|----------------------------|--------------------------|
| Skip-gram (Full Softmax)     | 7.77                       | 162.14 ms                |
| Skip-gram (Negative Sampling)| 10.85                      | 179.13 ms               |
| GloVe                        | 0.13                      | 7.68 ms             |

The Skip-gram model without negative sampling shows relatively high training loss and inconsistent training time because it uses a full softmax over the entire vocabulary at each update, making it computationally expensive and less scalable. Skip-gram with negative sampling replaces the full softmax with a small number of negative samples, which reduces the cost of each update from scoring the whole vocabulary to scoring k + 1 words. In this experiment, however, its measured time per epoch is slightly higher than the full-softmax run, because it was trained with a much larger batch size (64 vs. 2) and embedding size (100 vs. 2) on a small vocabulary, so the two timings are not a like-for-like comparison. Its loss decreases steadily, although the loss values are not directly comparable with the other models because the objective function is different. GloVe achieves the lowest training loss and the fastest training time per epoch because it optimizes a weighted least-squares objective on precomputed co-occurrence counts, so each update touches only the sampled pairs once the co-occurrence matrix is constructed. Overall, Skip-gram with negative sampling is the variant expected to scale best to large vocabularies, while GloVe is the most computationally efficient during training in this experiment. Note that each "epoch" here is a single mini-batch update, not a full pass over the corpus.

2. Using Word analogies dataset

In [58]:
# Load semantic analogy dataset: keep every line that splits into exactly
# four whitespace-separated words, lower-cased.

semantic_dataset = []

with open("capital-common-countries.txt", "r", encoding="utf-8") as file:
    for raw_line in file:
        parts = raw_line.strip().lower().split()
        if len(parts) != 4:
            continue
        semantic_dataset.append(list(parts))

print("Number of semantic analogies:", len(semantic_dataset))
print("Sample:", semantic_dataset[:3])

Number of semantic analogies: 506
Sample: [['athens', 'greece', 'baghdad', 'iraq'], ['athens', 'greece', 'bangkok', 'thailand'], ['athens', 'greece', 'beijing', 'china']]


In [59]:
# Load syntactic analogy dataset (past tense): keep every line that splits
# into exactly four whitespace-separated words, lower-cased.

past_tense_dataset = []

with open("gram7-past-tense.txt", "r", encoding="utf-8") as file:
    for raw_line in file:
        parts = raw_line.strip().lower().split()
        if len(parts) != 4:
            continue
        past_tense_dataset.append(list(parts))

print("Number of syntactic analogies:", len(past_tense_dataset))
print("Sample:", past_tense_dataset[:3])

Number of syntactic analogies: 1560
Sample: [['dancing', 'danced', 'decreasing', 'decreased'], ['dancing', 'danced', 'describing', 'described'], ['dancing', 'danced', 'enhancing', 'enhanced']]


Evaluation 

In [60]:
def solve_analogy(a, b, c, embed_fn, vocab_dict):
    """Answer the analogy a : b :: c : ? by vector arithmetic.

    The query vector is embed(b) - embed(a) + embed(c). Every key of
    `vocab_dict` except a, b and c is scored against it with cosine similarity.

    Returns:
        The best-scoring candidate, or None when no candidate scores above -1.0.
    """
    query = np.asarray(embed_fn(b)) - np.asarray(embed_fn(a)) + np.asarray(embed_fn(c))
    excluded = {a, b, c}

    winner, winner_score = None, -1.0
    for word in vocab_dict.keys():
        if word in excluded:
            continue  # the three query words are never valid answers

        similarity = cos_sim(query, np.asarray(embed_fn(word)))
        if similarity > winner_score:
            winner, winner_score = word, similarity

    return winner

In [61]:
def analogy_accuracy(analogy_list, embed_fn, vocab_dict):
    """Top-1 accuracy of `embed_fn` on a list of (a, b, c, d) analogies.

    Analogies containing any word missing from `vocab_dict` are skipped and do
    not count toward the denominator. Returns 0.0 when nothing was evaluated.
    """
    hits = 0
    total = 0

    for a, b, c, d in analogy_list:
        if any(word not in vocab_dict for word in (a, b, c, d)):
            continue  # out-of-vocabulary analogy

        total += 1
        if solve_analogy(a, b, c, embed_fn, vocab_dict) == d:
            hits += 1

    return hits / total if total else 0.0

Syntactic Accuracy

In [62]:
print("Syntactic Accuracy\n" + "-" * 30)

# (display name, embedding lookup function) for each model trained above
models = [
    ("Skip-gram", get_embed_skip_gram),
    ("Skip-gram + Negative Sampling", get_embed_neg_sample),
    ("GloVe", get_embed_glove),
]

# Analogies with any word missing from word2index are skipped by analogy_accuracy.
for model_name, embed_fn in models:
    acc = analogy_accuracy(past_tense_dataset, embed_fn, word2index)
    print(f"Syntactic Accuracy - {model_name}: {acc * 100:.2f}%")

Syntactic Accuracy
------------------------------
Syntactic Accuracy - Skip-gram: 0.00%
Syntactic Accuracy - Skip-gram + Negative Sampling: 0.00%
Syntactic Accuracy - GloVe: 0.00%


In [63]:
# Create gensim-compatible syntactic file: the original analogies preceded by
# a ": <section name>" header line.
with open("gram7-past-tense.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

with open("gram7-past-tense_gensim.txt", "w", encoding="utf-8") as f:
    f.writelines([": gram7-past-tense\n"] + lines)

# The first element of the returned value is the accuracy score.
analogy_result = model_genism.evaluate_word_analogies("gram7-past-tense_gensim.txt")
gensim_acc = analogy_result[0]

print(f"Syntactic Accuracy - Gensim (pretrained): {gensim_acc * 100:.2f}%")

Syntactic Accuracy - Gensim (pretrained): 55.45%


Semantic Accuracy

In [64]:
print("Semantic Accuracy\n" + "-" * 30)

# (display name, embedding lookup function) for each model trained above
models = [
    ("Skip-gram", get_embed_skip_gram),
    ("Skip-gram + Negative Sampling", get_embed_neg_sample),
    ("GloVe", get_embed_glove),
]

# Analogies with any word missing from word2index are skipped by analogy_accuracy.
for model_name, embed_fn in models:
    acc = analogy_accuracy(semantic_dataset, embed_fn, word2index)
    print(f"Semantic Accuracy - {model_name}: {acc * 100:.2f}%")

Semantic Accuracy
------------------------------
Semantic Accuracy - Skip-gram: 0.00%
Semantic Accuracy - Skip-gram + Negative Sampling: 0.00%
Semantic Accuracy - GloVe: 0.00%


In [65]:
# Create gensim-compatible semantic file: the original analogies preceded by
# a ": <section name>" header line.
with open("capital-common-countries.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

with open("capital-common-countries_gensim.txt", "w", encoding="utf-8") as f:
    f.writelines([": capital-common-countries\n"] + lines)

# The first element of the returned value is the accuracy score.
analogy_result = model_genism.evaluate_word_analogies("capital-common-countries_gensim.txt")
gensim_acc = analogy_result[0]

print(f"Semantic Accuracy - Gensim (pretrained): {gensim_acc * 100:.2f}%")

Semantic Accuracy - Gensim (pretrained): 93.87%


| Model                    | Window Size | Training Loss (Epoch 5000) | Training Time (sec) | Syntactic Accuracy (%) | Semantic Accuracy (%) |
|--------------------------|-------------|-----------------------------|---------------------|------------------------|-----------------------|
| Skip-gram (Full Softmax) | 2           | 7.77                        | 0.162               | 0.0                    | 0.0                   |
| Skip-gram (Neg Sampling) | 2           | 10.85                       | 0.179               | 0.0                    | 0.0                   |
| GloVe                    | 2           | 0.13                        | 0.078               | 0.0                    | 0.0                   |
| GloVe (Gensim Pretrained)| –           | –                           | –                   | 55.45                  | 93.87                 |

<h1>Similarity Dataset</h1>

Loading dataset

In [66]:
import pandas as pd
# Define the column names
columns = ['Word 1', 'Word 2', 'Similarity Index']
# The file is read as tab-separated text with no header row.
df = pd.read_csv('wordsim_relatedness_goldstandard.txt', sep='\t', header=None, names=columns)

In [67]:
# Score every word pair with each model: the dot product of the two word
# embeddings. The get_embed_* helpers already map out-of-vocabulary words to
# the <UNK> embedding through word2index.get, so no KeyError fallback is needed.
similarity_columns = [
    ('dot_product_neg_samp', get_embed_neg_sample),
    ('dot_product_skip_gram', get_embed_skip_gram),
    ('dot_product_glove', get_embed_glove),
]

for column, embed_fn in similarity_columns:
    # Build each column in one assignment instead of writing cell by cell;
    # float() keeps the column dtype at float64.
    df[column] = [
        float(np.dot(embed_fn(word_1), embed_fn(word_2)))
        for word_1, word_2 in zip(df['Word 1'], df['Word 2'])
    ]

In [68]:
# Show the first 15 rows with the model scores added.
df[:15]

Unnamed: 0,Word 1,Word 2,Similarity Index,dot_product_neg_samp,dot_product_skip_gram,dot_product_glove
0,computer,keyboard,7.62,35.665695,0.083752,1.889597
1,Jerusalem,Israel,8.46,35.665695,0.083752,1.889597
2,planet,galaxy,8.11,35.665695,0.083752,1.889597
3,canyon,landscape,7.53,35.665695,0.083752,1.889597
4,OPEC,country,5.63,-0.913162,0.189415,-0.052364
5,day,summer,3.94,-8.107746,0.664206,-0.390478
6,day,dawn,7.53,6.02797,0.254585,1.39096
7,country,citizen,7.31,-0.913162,0.189415,-0.052364
8,planet,people,5.75,1.996283,0.137011,-0.110307
9,environment,ecology,8.81,35.665695,0.083752,1.889597


In [69]:
from scipy.stats import spearmanr

# Human-annotated scores that every model's dot products are ranked against
human_scores = df['Similarity Index']

# spearmanr returns (correlation, p-value); only the correlation is kept
sg_corr = spearmanr(df['dot_product_skip_gram'], human_scores)[0]
neg_corr = spearmanr(df['dot_product_neg_samp'], human_scores)[0]
glove_corr = spearmanr(df['dot_product_glove'], human_scores)[0]

print(f"Spearman Correlation (Skip-gram): {sg_corr:.4f}")
print(f"Spearman Correlation (Neg Sampling): {neg_corr:.4f}")
print(f"Spearman Correlation (GloVe): {glove_corr:.4f}")

Spearman Correlation (Skip-gram): 0.0396
Spearman Correlation (Neg Sampling): 0.0448
Spearman Correlation (GloVe): 0.0744


In [70]:
# Score the pretrained gensim vectors on the same word-pair file.
# NOTE(review): gensim documents the return value as
# (pearson, spearman, oov_ratio) - confirm against the installed version.
correlation_coefficient = model_genism.evaluate_word_pairs('wordsim_relatedness_goldstandard.txt')
pearson_result, spearman_result, oov_ratio = correlation_coefficient
spearman_rho = spearman_result[0]
print(f"Spearman Correlation (Glove genism): {spearman_rho:.2f}")

Spearman Correlation (Glove genism): 0.50


In [71]:
# Mean human similarity (required by assignment)
y_true = df['Similarity Index'].mean()
print(f"y_true: {y_true:.2f}")

y_true: 5.29


| Metric / Model              | Skip-gram | Skip-gram (Neg) | GloVe  | GloVe (Gensim) | y_true |
|-----------------------------|----------:|----------------:|-------:|---------------:|-------:|
| Spearman correlation | 0.0396    | 0.0448          | 0.0744 | 0.50           | 5.29   |

In [72]:
def build_embedding_dict_skipgram_fullsoftmax(model, vocab, word2index, device):
    """
    Export the Skip-gram (full softmax) embeddings as a word -> vector dict.

    For every word in `vocab`, the vector stored is the average of its rows in
    `model.embedding_center` and `model.embedding_outside`, i.e. (v + u) / 2.
    Words that are not keys of `word2index` use the "<UNK>" index.

    The model is switched to eval mode and no gradients are tracked.

    Returns:
        dict mapping each word to a 1-D numpy array.
    """
    model.eval()
    unk_index = word2index["<UNK>"]

    words = list(vocab)
    indices = [word2index.get(word, unk_index) for word in words]
    index_tensor = torch.LongTensor(indices).to(device)

    with torch.no_grad():
        # One batched lookup for the whole vocabulary instead of one per word
        # (assumes nn.Embedding-style layers, giving one row per index).
        center = model.embedding_center(index_tensor)
        outside = model.embedding_outside(index_tensor)
        averaged = ((center + outside) / 2.0).cpu().numpy()

    # Copy each row so every word owns an independent array.
    return {word: averaged[row].copy() for row, word in enumerate(words)}


def build_embedding_dict_neg_sampling(model_neg, vocab, word2index, device):
    """
    Export the Negative Sampling embeddings as a word -> vector dict.

    For every word in `vocab`, the vector stored is the average of its rows in
    `model_neg.embedding_v` and `model_neg.embedding_u`, i.e. (v + u) / 2.
    Words that are not keys of `word2index` use the "<UNK>" index.

    The model is switched to eval mode and no gradients are tracked.

    Returns:
        dict mapping each word to a 1-D numpy array.
    """
    model_neg.eval()
    unk_index = word2index["<UNK>"]

    words = list(vocab)
    indices = [word2index.get(word, unk_index) for word in words]
    index_tensor = torch.LongTensor(indices).to(device)

    with torch.no_grad():
        # One batched lookup for the whole vocabulary instead of one per word
        # (assumes nn.Embedding-style layers, giving one row per index).
        center = model_neg.embedding_v(index_tensor)
        outside = model_neg.embedding_u(index_tensor)
        averaged = ((center + outside) / 2.0).cpu().numpy()

    # Copy each row so every word owns an independent array.
    return {word: averaged[row].copy() for row, word in enumerate(words)}


def build_embedding_dict_glove(model_glove, vocab, word2index, device):
    """
    Export the GloVe embeddings as a word -> vector dict.

    For every word in `vocab`, the vector stored is the average of its rows in
    `model_glove.embedding_v` and `model_glove.embedding_u`, i.e. (v + u) / 2.
    Words that are not keys of `word2index` use the "<UNK>" index.

    The model is switched to eval mode and no gradients are tracked.

    Returns:
        dict mapping each word to a 1-D numpy array.
    """
    model_glove.eval()
    unk_index = word2index["<UNK>"]

    words = list(vocab)
    indices = [word2index.get(word, unk_index) for word in words]
    index_tensor = torch.LongTensor(indices).to(device)

    with torch.no_grad():
        # One batched lookup for the whole vocabulary instead of one per word
        # (assumes nn.Embedding-style layers, giving one row per index).
        center = model_glove.embedding_v(index_tensor)
        outside = model_glove.embedding_u(index_tensor)
        averaged = ((center + outside) / 2.0).cpu().numpy()

    # Copy each row so every word owns an independent array.
    return {word: averaged[row].copy() for row, word in enumerate(words)}


# -------------------------
# Build one word -> vector lookup table per trained model
# -------------------------

embed_skipgram = build_embedding_dict_skipgram_fullsoftmax(model, vocab, word2index, device)
embed_neg = build_embedding_dict_neg_sampling(model_neg, vocab, word2index, device)
embed_glove = build_embedding_dict_glove(model_glove, vocab, word2index, device)

# Report how many words each dictionary holds
dict_sizes = (
    f"skipgram={len(embed_skipgram)}",
    f"neg={len(embed_neg)}",
    f"glove={len(embed_glove)}",
)
print("Built embedding dicts:", *dict_sizes)

Built embedding dicts: skipgram=2947 neg=2947 glove=2947


In [73]:
import os

# -------------------------
# Save for your web app
# -------------------------
# NOTE(review): relies on `pickle` having been imported in an earlier cell - confirm.

os.makedirs("model", exist_ok=True)

# Output path -> object to pickle, written in this order:
# the gensim pretrained model (optional baseline) first, then the three
# embedding dictionaries (fast lookup in the web app).
pickle_targets = {
    "model/model_gensim.pkl": model_genism,
    "model/embed_skipgram.pkl": embed_skipgram,
    "model/embed_skipgram_neg.pkl": embed_neg,
    "model/embed_glove.pkl": embed_glove,
}

for pickle_path, payload in pickle_targets.items():
    with open(pickle_path, "wb") as f:
        pickle.dump(payload, f)

print("Saved all embedding pickles to ./model/")

Saved all embedding pickles to ./model/


<h1>Conclusion</h1>

In this assignment, Skip-gram (full softmax), Skip-gram with negative sampling, and GloVe models were implemented and evaluated using the news category of the Brown corpus. Skip-gram with full softmax showed higher training loss and inconsistent training time due to the computational cost of computing a full softmax over the vocabulary, while negative sampling significantly improved efficiency by approximating this objective. GloVe achieved the lowest training loss and fastest training time per epoch due to its global co-occurrence–based optimization. However, all models trained from scratch achieved 0% accuracy on both syntactic and semantic analogy tasks, as well as very low correlation with human similarity judgments, mainly due to the limited corpus size and vocabulary coverage. In contrast, the pretrained GloVe model from Gensim performed substantially better, achieving high syntactic and semantic analogy accuracy and a strong Spearman correlation with human similarity scores. Overall, the results highlight that while negative sampling and GloVe improve computational efficiency, large-scale training data is essential for learning meaningful semantic representations, making pretrained embeddings more suitable for real-world NLP applications.


<h1>Screenshots of dash app</h1>

![Dash App Screenshot](images/dash_1.png)

![Dash App Screenshot](images/dash_2.png)