In [1]:
import sys, platform, time
import numpy as np
import torch

print("Python:", sys.version)
print("Platform:", platform.platform())

print("Torch:", torch.__version__)
print("MPS available:", torch.backends.mps.is_available())
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Device:", device)

Python: 3.12.12 (main, Oct 28 2025, 11:52:25) [Clang 20.1.4 ]
Platform: macOS-26.2-arm64-arm-64bit
Torch: 2.9.1
MPS available: True
Device: mps


In [2]:
import nltk
print("NLTK data paths:")
for p in nltk.data.path:
    print(" -", p)

NLTK data paths:
 - /Users/thetsusann/nltk_data
 - /Users/thetsusann/Desktop/NLP/Assignment1/.venv/nltk_data
 - /Users/thetsusann/Desktop/NLP/Assignment1/.venv/share/nltk_data
 - /Users/thetsusann/Desktop/NLP/Assignment1/.venv/lib/nltk_data
 - /usr/share/nltk_data
 - /usr/local/share/nltk_data
 - /usr/lib/nltk_data
 - /usr/local/lib/nltk_data


In [3]:
import os, nltk

target_dir = nltk.data.path[0]
os.makedirs(target_dir, exist_ok=True)
print("Downloading into:", target_dir)

nltk.download("punkt", download_dir=target_dir)
nltk.download("punkt_tab", download_dir=target_dir)
nltk.download("reuters", download_dir=target_dir)

Downloading into: /Users/thetsusann/nltk_data


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/thetsusann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/thetsusann/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package reuters to
[nltk_data]     /Users/thetsusann/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [4]:
import nltk
from nltk.data import find

print("punkt:", find("tokenizers/punkt"))
print("punkt_tab:", find("tokenizers/punkt_tab/english/"))

punkt: /Users/thetsusann/nltk_data/tokenizers/punkt
punkt_tab: /Users/thetsusann/nltk_data/tokenizers/punkt_tab/english


In [5]:
from nltk.corpus import reuters
from nltk.tokenize import sent_tokenize, word_tokenize

fileids = reuters.fileids()
print("Total Reuters docs:", len(fileids))

SANITY_DOCS = 2000
subset_ids = fileids[:SANITY_DOCS]

sentences = []
for fid in subset_ids:
    raw = reuters.raw(fid)
    for sent in sent_tokenize(raw):
        tokens = [w.lower() for w in word_tokenize(sent)]
        tokens = [w for w in tokens if w.isalpha()]
        if len(tokens) >= 3:
            sentences.append(tokens)

print("Sanity sentences:", len(sentences))
print("Example sentence:", sentences[0][:20])

Total Reuters docs: 10788
Sanity sentences: 10114
Example sentence: ['asian', 'exporters', 'fear', 'damage', 'from', 'rift', 'mounting', 'trade', 'friction', 'between', 'the', 'and', 'japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'asia']


In [6]:
from collections import Counter

# Hyperparams for the sanity run (fast + stable)
MIN_COUNT = 5
MAX_VOCAB = 20000   # cap vocab to keep training fast

# Count words
word_counts = Counter(w for sent in sentences for w in sent)

# Keep words above MIN_COUNT
filtered = [(w, c) for w, c in word_counts.items() if c >= MIN_COUNT]
filtered.sort(key=lambda x: x[1], reverse=True)

# Cap vocab
filtered = filtered[:MAX_VOCAB]

# Build mappings (reserve 0 for UNK)
word2id = {"<UNK>": 0}
id2word = {0: "<UNK>"}

for i, (w, c) in enumerate(filtered, start=1):
    word2id[w] = i
    id2word[i] = w

vocab_size = len(word2id)
print("Raw unique words:", len(word_counts))
print("Vocab size (with <UNK>):", vocab_size)

# Encode sentences as IDs
corpus_ids = []
unk_count = 0
total_tokens = 0

for sent in sentences:
    ids = []
    for w in sent:
        total_tokens += 1
        wid = word2id.get(w, 0)
        if wid == 0:
            unk_count += 1
        ids.append(wid)
    corpus_ids.append(ids)

print("Total tokens:", total_tokens)
print("UNK tokens:", unk_count, f"({unk_count/total_tokens:.2%})")
print("Example encoded sentence:", corpus_ids[0][:20])

Raw unique words: 12324
Vocab size (with <UNK>): 4075
Total tokens: 238574
UNK tokens: 14365 (6.02%)
Example encoded sentence: [1892, 604, 2662, 939, 20, 0, 3190, 49, 2663, 142, 1, 6, 65, 29, 455, 1730, 686, 561, 2, 1731]


In [7]:
import numpy as np

WINDOW_SIZE = 2  # default required by assignment

skip_grams = []
for sent in corpus_ids:
    # skip sentences that are too short
    if len(sent) < 2 * WINDOW_SIZE + 1:
        continue
    for center_i in range(WINDOW_SIZE, len(sent) - WINDOW_SIZE):
        center = sent[center_i]
        # context within window (exclude center itself)
        for j in range(center_i - WINDOW_SIZE, center_i + WINDOW_SIZE + 1):
            if j == center_i:
                continue
            context = sent[j]
            skip_grams.append((center, context))

skip_grams = np.array(skip_grams, dtype=np.int64)
print("skip_grams shape:", skip_grams.shape)  # (num_pairs, 2)
print("Example pairs (center, context):", skip_grams[:5])


skip_grams shape: (792604, 2)
Example pairs (center, context): [[2662 1892]
 [2662  604]
 [2662  939]
 [2662   20]
 [ 939  604]]


In [8]:
import torch

BATCH_SIZE = 256

def get_batch(batch_size=BATCH_SIZE, device=device):
    idx = np.random.randint(0, len(skip_grams), size=batch_size)
    batch = skip_grams[idx]
    centers = torch.tensor(batch[:, 0], dtype=torch.long, device=device)
    contexts = torch.tensor(batch[:, 1], dtype=torch.long, device=device)
    return centers, contexts

# quick sanity check
c, x = get_batch()
print("centers:", c.shape, c.dtype, c.device)
print("contexts:", x.shape, x.dtype, x.device)
print("sample:", c[:5].tolist(), x[:5].tolist())

centers: torch.Size([256]) torch.int64 mps:0
contexts: torch.Size([256]) torch.int64 mps:0
sample: [1294, 0, 568, 3369, 40] [0, 31, 120, 0, 26]


In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(42)
np.random.seed(42)

class SkipGramSoftmax(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, emb_dim)
        self.out_linear = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, center_ids):
        v = self.in_embed(center_ids)         # [B, D]
        logits = self.out_linear(v)           # [B, V]
        return logits

EMB_DIM = 50
model_sg = SkipGramSoftmax(vocab_size=vocab_size, emb_dim=EMB_DIM).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_sg.parameters(), lr=0.01)

# Tiny sanity training: 200 steps
model_sg.train()
losses = []
for step in range(200):
    centers, contexts = get_batch()
    logits = model_sg(centers)               # [B, V]
    loss = criterion(logits, contexts)       # contexts: [B]
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if (step + 1) % 50 == 0:
        print(f"step {step+1}/200 loss={loss.item():.4f}")

print("loss first/last:", losses[0], losses[-1])

step 50/200 loss=7.2239
step 100/200 loss=6.6175
step 150/200 loss=6.7080
step 200/200 loss=6.4783
loss first/last: 8.455802917480469 6.478254318237305


In [10]:
import time
import numpy as np
import torch

FINAL_STEPS = 3000   

model_sg.train()
t0 = time.time()

loss_log = []
for step in range(FINAL_STEPS):
    centers, contexts = get_batch()
    logits = model_sg(centers)
    loss = criterion(logits, contexts)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loss_log.append(loss.item())
    if (step + 1) % 500 == 0:
        avg = float(np.mean(loss_log[-500:]))
        print(f"step {step+1}/{FINAL_STEPS} avg_loss(last500)={avg:.4f}")

t1 = time.time()
train_time = t1 - t0
final_avg_loss = float(np.mean(loss_log[-500:])) if len(loss_log) >= 500 else float(np.mean(loss_log))

print("\nSkip-gram Softmax DONE")
print("time_sec:", train_time)
print("final_avg_loss:", final_avg_loss)

# Save embeddings
sg_embeddings = model_sg.in_embed.weight.detach().to("cpu").numpy()
np.save("sg_softmax_embeddings.npy", sg_embeddings)
print("saved: sg_softmax_embeddings.npy", sg_embeddings.shape)


step 500/3000 avg_loss(last500)=6.1983
step 1000/3000 avg_loss(last500)=6.0316
step 1500/3000 avg_loss(last500)=5.9553
step 2000/3000 avg_loss(last500)=5.9014
step 2500/3000 avg_loss(last500)=5.8498
step 3000/3000 avg_loss(last500)=5.7925

Skip-gram Softmax DONE
time_sec: 4.2142112255096436
final_avg_loss: 5.79247337436676
saved: sg_softmax_embeddings.npy (4075, 50)


In [11]:
import numpy as np

# Build unigram distribution with 0.75 power (standard for word2vec NEG)
power = 0.75

# word_counts includes counts for ALL raw words, but NEG should use your final vocab
# Let's reconstruct counts aligned to vocab IDs
id_counts = np.zeros(vocab_size, dtype=np.int64)

# <UNK> count = total unk occurrences
id_counts[0] = 0  # keep <UNK> out of negative sampling (common choice)

for w, wid in word2id.items():
    if wid == 0:
        continue
    id_counts[wid] = word_counts[w]

# Probabilities
p = id_counts.astype(np.float64) ** power
p_sum = p.sum()
p = p / p_sum

# Unigram table (controls speed/quality tradeoff)
TABLE_SIZE = 200000  # good for our small vocab; bigger = smoother but more memory
unigram_table = np.random.choice(np.arange(vocab_size), size=TABLE_SIZE, p=p)

print("unigram_table size:", unigram_table.shape)
print("table id min/max:", unigram_table.min(), unigram_table.max())
print("contains UNK (0)?", (unigram_table == 0).any())

unigram_table size: (200000,)
table id min/max: 1 4074
contains UNK (0)? False


In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

torch.manual_seed(42)
np.random.seed(42)

NEG_K = 5
EMB_DIM = 50
BATCH_SIZE = 256

class SkipGramNEG(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, emb_dim)
        self.out_embed = nn.Embedding(vocab_size, emb_dim)

    def forward(self, center_ids, pos_context_ids, neg_context_ids):
        # center_ids: [B]
        # pos_context_ids: [B]
        # neg_context_ids: [B, K]

        v = self.in_embed(center_ids)                 # [B, D]
        u_pos = self.out_embed(pos_context_ids)       # [B, D]
        u_neg = self.out_embed(neg_context_ids)       # [B, K, D]

        # positive loss: -log sigma(u_pos · v)
        pos_score = torch.sum(u_pos * v, dim=1)       # [B]
        pos_loss = -torch.log(torch.sigmoid(pos_score) + 1e-9)

        # negative loss: -sum log sigma(-u_neg · v)
        neg_score = torch.bmm(u_neg, v.unsqueeze(2)).squeeze(2)  # [B, K]
        neg_loss = -torch.sum(torch.log(torch.sigmoid(-neg_score) + 1e-9), dim=1)

        return (pos_loss + neg_loss).mean()

model_neg = SkipGramNEG(vocab_size=vocab_size, emb_dim=EMB_DIM).to(device)
optimizer_neg = optim.Adam(model_neg.parameters(), lr=0.01)

def get_batch_neg(batch_size=BATCH_SIZE, k=NEG_K, device=device):
    idx = np.random.randint(0, len(skip_grams), size=batch_size)
    batch = skip_grams[idx]
    centers = torch.tensor(batch[:, 0], dtype=torch.long, device=device)
    pos = torch.tensor(batch[:, 1], dtype=torch.long, device=device)

    # sample negatives from unigram table (fast)
    neg_np = unigram_table[np.random.randint(0, len(unigram_table), size=(batch_size, k))]
    # ensure neg != pos (rare, but fix)
    pos_np = batch[:, 1]
    mask = (neg_np == pos_np[:, None])
    while mask.any():
        neg_np[mask] = unigram_table[np.random.randint(0, len(unigram_table), size=mask.sum())]
        mask = (neg_np == pos_np[:, None])

    neg = torch.tensor(neg_np, dtype=torch.long, device=device)
    return centers, pos, neg

# Tiny sanity training: 200 steps
model_neg.train()
losses = []
for step in range(200):
    centers, pos, neg = get_batch_neg()
    loss = model_neg(centers, pos, neg)

    optimizer_neg.zero_grad()
    loss.backward()
    optimizer_neg.step()

    losses.append(loss.item())
    if (step + 1) % 50 == 0:
        print(f"step {step+1}/200 loss={loss.item():.4f}")

print("loss first/last:", losses[0], losses[-1])

step 50/200 loss=14.3470
step 100/200 loss=11.6801
step 150/200 loss=10.4369
step 200/200 loss=9.7720
loss first/last: 18.145038604736328 9.77197265625


In [13]:
import time
import numpy as np

FINAL_STEPS_NEG = 6000   # NEG is cheaper; this is still quick. If needed, drop to 3000.

model_neg.train()
t0 = time.time()

loss_log = []
for step in range(FINAL_STEPS_NEG):
    centers, pos, neg = get_batch_neg()
    loss = model_neg(centers, pos, neg)

    optimizer_neg.zero_grad()
    loss.backward()
    optimizer_neg.step()

    loss_log.append(loss.item())
    if (step + 1) % 1000 == 0:
        avg = float(np.mean(loss_log[-1000:]))
        print(f"step {step+1}/{FINAL_STEPS_NEG} avg_loss(last1000)={avg:.4f}")

t1 = time.time()
train_time_neg = t1 - t0
final_avg_loss_neg = float(np.mean(loss_log[-1000:])) if len(loss_log) >= 1000 else float(np.mean(loss_log))

print("\nSkip-gram NEG DONE")
print("time_sec:", train_time_neg)
print("final_avg_loss:", final_avg_loss_neg)

neg_embeddings = model_neg.in_embed.weight.detach().to("cpu").numpy()
np.save("sg_neg_embeddings.npy", neg_embeddings)
print("saved: sg_neg_embeddings.npy", neg_embeddings.shape)

step 1000/6000 avg_loss(last1000)=4.8793
step 2000/6000 avg_loss(last1000)=2.7656
step 3000/6000 avg_loss(last1000)=2.4342
step 4000/6000 avg_loss(last1000)=2.2937
step 5000/6000 avg_loss(last1000)=2.2100
step 6000/6000 avg_loss(last1000)=2.1561

Skip-gram NEG DONE
time_sec: 8.173359870910645
final_avg_loss: 2.156061161994934
saved: sg_neg_embeddings.npy (4075, 50)


In [None]:
from collections import defaultdict
import time
import numpy as np

GLOVE_WINDOW = 2  # keep consistent with earlier window unless your prof specifies otherwise

t0 = time.time()
cooc = defaultdict(float)

for si, sent in enumerate(corpus_ids):
    n = len(sent)
    for i, wi in enumerate(sent):
        # context window
        start = max(0, i - GLOVE_WINDOW)
        end = min(n, i + GLOVE_WINDOW + 1)
        for j in range(start, end):
            if j == i:
                continue
            wj = sent[j]
            dist = abs(j - i)
            # standard GloVe weighting: 1/dist
            cooc[(wi, wj)] += 1.0 / dist

    if (si + 1) % 1000 == 0:
        print(f"processed sentences: {si+1}/{len(corpus_ids)}; cooc_pairs={len(cooc)}")

t1 = time.time()
print("\nCo-occurrence build DONE")
print("num_pairs:", len(cooc))
print("time_sec:", t1 - t0)

# peek a few entries
sample_items = list(cooc.items())[:5]
print("sample:", sample_items)

processed sentences: 1000/10114; cooc_pairs=45976
processed sentences: 2000/10114; cooc_pairs=77913
processed sentences: 3000/10114; cooc_pairs=102998
processed sentences: 4000/10114; cooc_pairs=129811
processed sentences: 5000/10114; cooc_pairs=150534
processed sentences: 6000/10114; cooc_pairs=171992
processed sentences: 7000/10114; cooc_pairs=195341
processed sentences: 8000/10114; cooc_pairs=217723
processed sentences: 9000/10114; cooc_pairs=237351
processed sentences: 10000/10114; cooc_pairs=257348

Co-occurrence build DONE
num_pairs: 259866
time_sec: 0.22613811492919922
sample: [((1892, 604), 1.0), ((1892, 2662), 0.5), ((604, 1892), 1.0), ((604, 2662), 1.0), ((604, 939), 0.5)]


In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time
import math

# Convert cooc dict to arrays (so training loop is fast)
pairs = np.array(list(cooc.keys()), dtype=np.int64)        # [N, 2]
counts = np.array(list(cooc.values()), dtype=np.float32)   # [N]
N = len(counts)
print("Training pairs N:", N)

# GloVe hyperparams (safe defaults)
EMB_DIM = 50
X_MAX = 100.0
ALPHA = 0.75
BATCH_SIZE_G = 2048
EPOCHS_G = 5
LR_G = 0.05

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

class GloVe(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.w = nn.Embedding(vocab_size, emb_dim)
        self.w_tilde = nn.Embedding(vocab_size, emb_dim)
        self.b = nn.Embedding(vocab_size, 1)
        self.b_tilde = nn.Embedding(vocab_size, 1)

    def forward(self, i_ids, j_ids):
        wi = self.w(i_ids)                  # [B, D]
        wj = self.w_tilde(j_ids)            # [B, D]
        bi = self.b(i_ids).squeeze(1)       # [B]
        bj = self.b_tilde(j_ids).squeeze(1) # [B]
        return (wi * wj).sum(dim=1) + bi + bj  # [B]

glove = GloVe(vocab_size=vocab_size, emb_dim=EMB_DIM).to(device)
opt = optim.Adagrad(glove.parameters(), lr=LR_G)

# Precompute weights f(X)
fx = np.minimum((counts / X_MAX) ** ALPHA, 1.0).astype(np.float32)
logx = np.log(counts + 1e-8).astype(np.float32)

pairs_t = torch.tensor(pairs, dtype=torch.long, device=device)
fx_t = torch.tensor(fx, dtype=torch.float32, device=device)
logx_t = torch.tensor(logx, dtype=torch.float32, device=device)

t0 = time.time()
glove.train()

for epoch in range(EPOCHS_G):
    perm = torch.randperm(N, device=device)
    total_loss = 0.0
    num_batches = 0

    for start in range(0, N, BATCH_SIZE_G):
        idx = perm[start:start+BATCH_SIZE_G]
        ij = pairs_t[idx]
        i_ids = ij[:, 0]
        j_ids = ij[:, 1]

        pred = glove(i_ids, j_ids)                 # [B]
        diff = pred - logx_t[idx]                  # [B]
        loss = (fx_t[idx] * diff * diff).mean()

        opt.zero_grad()
        loss.backward()
        opt.step()

        total_loss += float(loss.item())
        num_batches += 1

    avg_loss = total_loss / max(1, num_batches)
    print(f"epoch {epoch+1}/{EPOCHS_G} avg_loss={avg_loss:.4f}")

t1 = time.time()
train_time_glove = t1 - t0
print("\nGloVe DONE")
print("time_sec:", train_time_glove)

# Common final embedding = w + w_tilde
glove_embeddings = (glove.w.weight.detach() + glove.w_tilde.weight.detach()).to("cpu").numpy()
np.save("glove_embeddings.npy", glove_embeddings)
print("saved: glove_embeddings.npy", glove_embeddings.shape)

Training pairs N: 259866
epoch 1/5 avg_loss=1.9079
epoch 2/5 avg_loss=1.1289
epoch 3/5 avg_loss=0.8688
epoch 4/5 avg_loss=0.7129
epoch 5/5 avg_loss=0.6059

GloVe DONE
time_sec: 1.7182388305664062
saved: glove_embeddings.npy (4075, 50)


In [16]:
import numpy as np

# Load embeddings you already saved
E_softmax = np.load("sg_softmax_embeddings.npy")
E_neg = np.load("sg_neg_embeddings.npy")
E_glove = np.load("glove_embeddings.npy")

print("Loaded shapes:", E_softmax.shape, E_neg.shape, E_glove.shape)

def normalize_rows(M, eps=1e-9):
    norms = np.linalg.norm(M, axis=1, keepdims=True)
    return M / (norms + eps)

# normalized versions for fast cosine
E_softmax_n = normalize_rows(E_softmax)
E_neg_n = normalize_rows(E_neg)
E_glove_n = normalize_rows(E_glove)

def analogy_accuracy(questions, E_n, word2id, topk=1):
    """
    questions: list of (a,b,c,d) where a:b :: c:d
    Return accuracy@topk over questions that are all in vocab.
    """
    correct = 0
    used = 0
    for a,b,c,d in questions:
        if a not in word2id or b not in word2id or c not in word2id or d not in word2id:
            continue
        ia, ib, ic, id_ = word2id[a], word2id[b], word2id[c], word2id[d]
        # v = b - a + c
        v = E_n[ib] - E_n[ia] + E_n[ic]
        v = v / (np.linalg.norm(v) + 1e-9)

        sims = E_n @ v  # cosine because normalized
        # exclude input words
        sims[[ia, ib, ic]] = -1e9
        # topk predictions
        pred_ids = np.argpartition(-sims, topk)[:topk]
        if id_ in pred_ids:
            correct += 1
        used += 1
    acc = correct / used if used > 0 else 0.0
    return acc, used


Loaded shapes: (4075, 50) (4075, 50) (4075, 50)


In [19]:
import gensim
print("gensim version:", gensim.__version__)

gensim version: 4.4.0


In [22]:
from gensim.test.utils import datapath
import pandas as pd

ws_path = datapath("wordsim353.tsv")
df_ws = pd.read_csv(ws_path, sep="\t")

print("WordSim353 path:", ws_path)
print("Columns:", list(df_ws.columns))
print(df_ws.head())

WordSim353 path: /Users/thetsusann/Desktop/NLP/Assignment1/.venv/lib/python3.12/site-packages/gensim/test/test_data/wordsim353.tsv
Columns: ['# The WordSimilarity-353 Test Collection (http://www.cs.technion.ac.il/~gabr/resources/data/wordsim353/)']
                # The WordSimilarity-353 Test Collection (http://www.cs.technion.ac.il/~gabr/resources/data/wordsim353/)
# Word 1 Word 2                                       Human (mean)                                                      
love     sex                                                  6.77                                                      
tiger    cat                                                  7.35                                                      
         tiger                                               10.00                                                      
book     paper                                                7.46                                                      


In [23]:
from gensim.test.utils import datapath

ws_path = datapath("wordsim353.tsv")

wordsim_pairs = []
with open(ws_path, "r") as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        parts = line.split()
        if len(parts) != 3:
            continue
        w1, w2, score = parts
        wordsim_pairs.append((w1.lower(), w2.lower(), float(score)))

print("WordSim pairs ready:", len(wordsim_pairs))
print("Sample:", wordsim_pairs[:5])

WordSim pairs ready: 353
Sample: [('love', 'sex', 6.77), ('tiger', 'cat', 7.35), ('tiger', 'tiger', 10.0), ('book', 'paper', 7.46), ('computer', 'keyboard', 7.62)]


In [25]:
from scipy.stats import spearmanr
import numpy as np

def wordsim_spearman(wordsim_pairs, E_n, word2id):
    human_scores = []
    model_scores = []

    for w1, w2, score in wordsim_pairs:
        if w1 not in word2id or w2 not in word2id:
            continue
        i, j = word2id[w1], word2id[w2]
        sim = float(np.dot(E_n[i], E_n[j]))  # cosine because normalized
        human_scores.append(score)
        model_scores.append(sim)

    corr, _ = spearmanr(human_scores, model_scores)
    return corr, len(human_scores)

# Compute for all 3 models
ws_softmax, n1 = wordsim_spearman(wordsim_pairs, E_softmax_n, word2id)
ws_neg, n2     = wordsim_spearman(wordsim_pairs, E_neg_n, word2id)
ws_glove, n3   = wordsim_spearman(wordsim_pairs, E_glove_n, word2id)

print("WordSim353 Spearman:")
print(f"Skip-gram Softmax: {ws_softmax:.4f} (pairs used={n1})")
print(f"Skip-gram NEG    : {ws_neg:.4f} (pairs used={n2})")
print(f"GloVe            : {ws_glove:.4f} (pairs used={n3})")

WordSim353 Spearman:
Skip-gram Softmax: 0.1183 (pairs used=105)
Skip-gram NEG    : 0.0794 (pairs used=105)
GloVe            : -0.0521 (pairs used=105)


In [26]:
# Load analogy questions
questions = []
with open(analogy_path, "r") as f:
    for line in f:
        line = line.strip().lower()
        if not line or line.startswith(":"):
            continue
        a, b, c, d = line.split()
        questions.append((a, b, c, d))

print("Total analogy questions:", len(questions))

# Evaluate
acc_softmax, used1 = analogy_accuracy(questions, E_softmax_n, word2id)
acc_neg, used2     = analogy_accuracy(questions, E_neg_n, word2id)
acc_glove, used3   = analogy_accuracy(questions, E_glove_n, word2id)

print("Google Analogy Accuracy:")
print(f"Skip-gram Softmax: {acc_softmax:.4f} (used={used1})")
print(f"Skip-gram NEG    : {acc_neg:.4f} (used={used2})")
print(f"GloVe            : {acc_glove:.4f} (used={used3})")

Total analogy questions: 19544
Google Analogy Accuracy:
Skip-gram Softmax: 0.0020 (used=1005)
Skip-gram NEG    : 0.0020 (used=1005)
GloVe            : 0.0000 (used=1005)


In [27]:
import re, json
import numpy as np
from nltk.corpus import reuters

# Choose which embeddings to use for the web app
# Recommended: NEG
E = E_neg  # change to E_softmax or E_glove if you want

# Normalize for cosine similarity later
def normalize_rows(M, eps=1e-9):
    norms = np.linalg.norm(M, axis=1, keepdims=True)
    return M / (norms + eps)

E_n = normalize_rows(E)

def text_to_ids(text, word2id):
    # simple tokenizer consistent with earlier: alphabetic lowercase
    tokens = re.findall(r"[a-zA-Z]+", text.lower())
    return [word2id.get(t, 0) for t in tokens]  # 0 = <UNK>

def paragraph_vector(text, E, word2id):
    ids = text_to_ids(text, word2id)
    ids = [i for i in ids if i != 0]  # drop UNK for paragraph vector
    if len(ids) == 0:
        return None
    return E[ids].mean(axis=0)

# Build paragraph list from Reuters
# Keep it modest to stay fast and keep web app snappy.
MAX_DOCS = 2000   # increase later if you want
MIN_CHARS = 80    # ignore tiny paragraphs

fileids = reuters.fileids()[:MAX_DOCS]

paragraphs = []
for fid in fileids:
    raw = reuters.raw(fid)
    # split on blank lines; Reuters often has short lines, so filter by length
    for p in re.split(r"\n\s*\n", raw):
        p = p.strip()
        if len(p) >= MIN_CHARS:
            paragraphs.append(p)

print("Raw paragraphs:", len(paragraphs))

# Compute vectors
vecs = []
texts = []
for p in paragraphs:
    v = paragraph_vector(p, E, word2id)
    if v is None:
        continue
    vecs.append(v.astype(np.float32))
    texts.append(p)

V = np.vstack(vecs)  # [N, D]
V_n = normalize_rows(V)

print("Kept paragraphs:", len(texts))
print("Paragraph vectors shape:", V_n.shape)

# Save
np.save("paragraph_vectors.npy", V_n)
with open("paragraph_texts.json", "w") as f:
    json.dump(texts, f)

print("Saved: paragraph_vectors.npy and paragraph_texts.json")
print("Example paragraph:", texts[0][:200], "...")


Raw paragraphs: 1998
Kept paragraphs: 1998
Paragraph vectors shape: (1998, 50)
Saved: paragraph_vectors.npy and paragraph_texts.json
Example paragraph: ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reachin ...


In [28]:
import json

with open("word2id.json", "w") as f:
    json.dump(word2id, f)

print("Saved: word2id.json (size:", len(word2id), ")")

Saved: word2id.json (size: 4075 )
