# Word2Vec (Negative Sampling)

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import nltk
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import brown
import time

In [2]:
np.__version__, torch.__version__

('2.0.2', '2.8.0+cpu')

In [3]:
import matplotlib
matplotlib.__version__

'3.9.4'

## 1. Load data

# Load the Brown corpus as a real-world dataset.
# The preprocessing steps (tokenization and lowercasing)
# are kept consistent with the basic Skip-gram model
# to allow fair comparison between models.


In [4]:
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\aashu\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [None]:
corpus = []
sentences = brown.sents(categories="news")
sentences = [[word.lower() for word in sent] for sent in sentences] #changing all words to lower case
corpus = [word for sent in sentences for word in sent] #flattening the list of sentences to a list of words

In [6]:
corpus[0][0]

't'

In [7]:
#1. tokenization
corpus

['the',
 'fulton',
 'county',
 'grand',
 'jury',
 'said',
 'friday',
 'an',
 'investigation',
 'of',
 "atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.',
 'the',
 'jury',
 'further',
 'said',
 'in',
 'term-end',
 'presentments',
 'that',
 'the',
 'city',
 'executive',
 'committee',
 ',',
 'which',
 'had',
 'over-all',
 'charge',
 'of',
 'the',
 'election',
 ',',
 '``',
 'deserves',
 'the',
 'praise',
 'and',
 'thanks',
 'of',
 'the',
 'city',
 'of',
 'atlanta',
 "''",
 'for',
 'the',
 'manner',
 'in',
 'which',
 'the',
 'election',
 'was',
 'conducted',
 '.',
 'the',
 'september-october',
 'term',
 'jury',
 'had',
 'been',
 'charged',
 'by',
 'fulton',
 'superior',
 'court',
 'judge',
 'durwood',
 'pye',
 'to',
 'investigate',
 'reports',
 'of',
 'possible',
 '``',
 'irregularities',
 "''",
 'in',
 'the',
 'hard-fought',
 'primary',
 'which',
 'was',
 'won',
 'by',
 'mayor-nominate'

In [8]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(corpus)) #all the words we have in the system - <UNK>

In [9]:
vocabs

['reno-lake',
 'cards',
 'glaze',
 'request',
 'blast',
 'mill',
 'thus',
 'stimulation',
 '31st',
 'prominently',
 'intelligent',
 'begun',
 'consultation',
 'harbor',
 '5777',
 '228-229',
 'retrospect',
 'voter',
 'reply',
 'seven-iron',
 'quarreling',
 "china's",
 "council's",
 'bondsman',
 'schaefer',
 'fielders',
 'wave',
 'store',
 'negotiations',
 'remarks',
 'rouge',
 'contested',
 'draper',
 'hasty',
 'hitter',
 'scenic',
 'machinist',
 'cemetery',
 'oak',
 'prepayment',
 'clinic',
 'anti-monopoly',
 'purposely',
 'oats',
 'indorsed',
 'harder',
 'sorrentino',
 '4-homer',
 'proceedings',
 'vague',
 'non-farm',
 'fuller',
 'sixth',
 'ruddy',
 'cen-tennial',
 'outside',
 'accidentally',
 'golf',
 'christian',
 'joyce',
 'remote',
 'tank',
 'clements',
 'stereotype',
 'nichols',
 'saute',
 'rationale',
 'stab',
 '1959-60',
 '2:30',
 'caldwell',
 'luncheon-table',
 'chien',
 'untrammeled',
 'reduced',
 'culture',
 "nato's",
 'strikes',
 'stating',
 'versed',
 'displaying',
 'adds'

In [10]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['asian']

2479

In [None]:
vocabs.append('<UNK>')
word2index['<UNK>'] = len(vocabs) - 1 

In [None]:
index2word = {v:k for k, v in word2index.items()} 
index2word[5]

'mill'

## 2. Prepare train data

In [13]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus, window_size=2):
    if window_size < 1:
        raise ValueError("window_size must be >= 1")

    corpus_len = len(corpus)
    if corpus_len <= 2 * window_size:
        raise ValueError("corpus too small for given window_size")

    # valid center indices that have full window on both sides
    low = window_size
    high = corpus_len - window_size

    centers = np.random.randint(low, high, size=batch_size)
    inputs, labels = [], []

    for i in centers:
        center_word = corpus[i]
        # choose a random outside within the window (excluding center)
        offsets = list(range(-window_size, 0)) + list(range(1, window_size + 1))
        offset = np.random.choice(offsets)
        outside_word = corpus[i + offset]

        c_idx = word2index.get(center_word, word2index['<UNK>'])
        o_idx = word2index.get(outside_word, word2index['<UNK>'])

        inputs.append([c_idx])
        labels.append([o_idx])

    return np.array(inputs, dtype=np.int64), np.array(labels, dtype=np.int64)
            
x, y = random_batch(2, corpus)

In [14]:
x.shape  #batch_size, 1

(2, 1)

In [15]:
x

array([[2892],
       [1931]])

In [16]:
y.shape  #batch_size 1

(2, 1)

## 3. Negative Sampling

### Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

In [17]:
z = 0.001

In [18]:
#count
from collections import Counter

word_count = Counter(corpus)
word_count

#get the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words


100554

In [19]:
vocabs

['reno-lake',
 'cards',
 'glaze',
 'request',
 'blast',
 'mill',
 'thus',
 'stimulation',
 '31st',
 'prominently',
 'intelligent',
 'begun',
 'consultation',
 'harbor',
 '5777',
 '228-229',
 'retrospect',
 'voter',
 'reply',
 'seven-iron',
 'quarreling',
 "china's",
 "council's",
 'bondsman',
 'schaefer',
 'fielders',
 'wave',
 'store',
 'negotiations',
 'remarks',
 'rouge',
 'contested',
 'draper',
 'hasty',
 'hitter',
 'scenic',
 'machinist',
 'cemetery',
 'oak',
 'prepayment',
 'clinic',
 'anti-monopoly',
 'purposely',
 'oats',
 'indorsed',
 'harder',
 'sorrentino',
 '4-homer',
 'proceedings',
 'vague',
 'non-farm',
 'fuller',
 'sixth',
 'ruddy',
 'cen-tennial',
 'outside',
 'accidentally',
 'golf',
 'christian',
 'joyce',
 'remote',
 'tank',
 'clements',
 'stereotype',
 'nichols',
 'saute',
 'rationale',
 'stab',
 '1959-60',
 '2:30',
 'caldwell',
 'luncheon-table',
 'chien',
 'untrammeled',
 'reduced',
 'culture',
 "nato's",
 'strikes',
 'stating',
 'versed',
 'displaying',
 'adds'

$$P(w)=U(w)^{3/4}/Z$$

In [20]:
unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)
    
Counter(unigram_table)

Counter({'the': 126,
         ',': 108,
         '.': 89,
         'of': 69,
         'and': 56,
         'a': 55,
         'to': 55,
         'in': 53,
         'for': 30,
         'that': 27,
         'was': 24,
         'is': 24,
         "''": 24,
         '``': 24,
         'on': 23,
         'at': 22,
         'he': 22,
         'with': 20,
         'as': 19,
         'be': 19,
         'it': 18,
         'by': 18,
         'said': 16,
         'his': 16,
         'will': 15,
         'from': 14,
         'this': 13,
         'an': 13,
         'are': 13,
         ';': 13,
         'had': 12,
         'has': 12,
         '--': 12,
         'but': 12,
         'were': 11,
         'not': 11,
         'they': 11,
         'who': 11,
         'would': 11,
         'mrs.': 11,
         'have': 11,
         'which': 10,
         'their': 10,
         'new': 10,
         'been': 9,
         'there': 9,
         'one': 9,
         'when': 8,
         'two': 8,
         'or': 8,
        

## 4. Model

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [None]:
def prepare_sequence(seq, word2index): #convert list of words to list of indices
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index['<UNK>'], seq))
    return torch.LongTensor(idxs)

In [None]:
import random

def negative_sampling(targets, unigram_table, k): 
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

In [None]:
batch_size = 8 #number of samples in one batch
x, y = random_batch(batch_size, corpus)
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)

In [24]:
k = 5
neg_samples = negative_sampling(y_tensor, unigram_table, k)

In [25]:
y_tensor[1]

tensor([6870])

In [26]:
neg_samples[1]

tensor([ 8101,  4257, 11026, 10957,  2061])

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [None]:
class SkipgramNeg(nn.Module): #Skip-gram with Negative Sampling
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)

In [28]:
#test your model
emb_size = 8
voc_size = len(vocabs)
model = SkipgramNeg(voc_size, emb_size)

In [29]:
loss = model(x_tensor, y_tensor, neg_samples)

In [30]:
loss

tensor(3.7916, grad_fn=<NegBackward0>)

## 5. Training

In [31]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [32]:
losses = []
start_time = time.time()

num_epochs = 1000

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k)
    loss = model(input_tensor, label_tensor, neg_samples)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    losses.append(loss.item())
    #update alpha
    optimizer.step()
    
    #print the loss
    if (epoch + 1) % 1000 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

end_time = time.time()
training_time = end_time - start_time




Epoch   1000 | Loss: 2.684480


In [33]:
print("Final loss:", losses[-1])
print("Training time (s):", training_time)

Final loss: 2.6844801902770996
Training time (s): 4.1034016609191895


## 6. Plot the embeddings

In [34]:
vocabs

['reno-lake',
 'cards',
 'glaze',
 'request',
 'blast',
 'mill',
 'thus',
 'stimulation',
 '31st',
 'prominently',
 'intelligent',
 'begun',
 'consultation',
 'harbor',
 '5777',
 '228-229',
 'retrospect',
 'voter',
 'reply',
 'seven-iron',
 'quarreling',
 "china's",
 "council's",
 'bondsman',
 'schaefer',
 'fielders',
 'wave',
 'store',
 'negotiations',
 'remarks',
 'rouge',
 'contested',
 'draper',
 'hasty',
 'hitter',
 'scenic',
 'machinist',
 'cemetery',
 'oak',
 'prepayment',
 'clinic',
 'anti-monopoly',
 'purposely',
 'oats',
 'indorsed',
 'harder',
 'sorrentino',
 '4-homer',
 'proceedings',
 'vague',
 'non-farm',
 'fuller',
 'sixth',
 'ruddy',
 'cen-tennial',
 'outside',
 'accidentally',
 'golf',
 'christian',
 'joyce',
 'remote',
 'tank',
 'clements',
 'stereotype',
 'nichols',
 'saute',
 'rationale',
 'stab',
 '1959-60',
 '2:30',
 'caldwell',
 'luncheon-table',
 'chien',
 'untrammeled',
 'reduced',
 'culture',
 "nato's",
 'strikes',
 'stating',
 'versed',
 'displaying',
 'adds'

In [36]:
def get_embed(word):
    try:
        index = word2index[word]
    except:
        index = word2index['<UNK>']
        
    word = torch.LongTensor([word2index[word]])
    
    embed_c = model.embedding_center(word)
    embed_o = model.embedding_outside(word)
    embed   = (embed_c + embed_o) / 2
    
    return embed[0][0].item(), embed[0][1].item()

In [37]:
get_embed('fruit')

(0.03596661239862442, -1.217374324798584)

In [39]:
get_embed('dog')

(-0.22010044753551483, -0.07657714188098907)

In [None]:
# plt.figure(figsize=(6, 3))
# for i, word in enumerate(vocabs):
#     x, y = get_embed(word)
#     plt.scatter(x, y)
#     plt.annotate(word, xy=(x, y), xytext=(5, 2), textcoords='offset points')
# plt.show()

## 7. Experiments


In [43]:
#more formally is to divide by its norm
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity


In [44]:
# def cosine_similarity(a, b):
#     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


# Step 1: build final embedding matrix
W = (model.embedding_center.weight.detach().cpu().numpy() +
     model.embedding_outside.weight.detach().cpu().numpy()) / 2

# Normalize embeddings for fast cosine similarity
W_norm = W / np.linalg.norm(W, axis=1, keepdims=True)



def get_vector(word):
    if word not in word2index:
        return None
    return W_norm[word2index[word]] 


In [46]:
index2word = {idx: word for word, idx in word2index.items()}
def predict_analogy(a, b, c, W_norm, word2index, index2word):
    if a not in word2index or b not in word2index or c not in word2index:
        return None

    va = W_norm[word2index[a]]
    vb = W_norm[word2index[b]]
    vc = W_norm[word2index[c]]

    # Vector arithmetic: b - a + c
    target = vb - va + vc
    target = target / np.linalg.norm(target)

    # Cosine similarity with ALL words at once
    similarities = np.dot(W_norm, target)

    # Exclude input words
    for w in (a, b, c):
        similarities[word2index[w]] = -1

    best_index = np.argmax(similarities)
    return index2word[best_index]



In [47]:
predict_analogy("banana", "fruit", "apple", W, word2index, index2word)


In [48]:

def evaluate_analogies(file_path, W_norm, word2index, index2word):
    total = 0
    correct = 0

    with open(file_path, "r") as f:
        for line in f:
            words = line.strip().split()
            if len(words) != 4:
                continue

            a, b, c, d = words

            prediction = predict_analogy(
                a, b, c, W_norm, word2index, index2word
            )

            total += 1
            if prediction == d:
                correct += 1

    accuracy = correct / total if total > 0 else 0
    return accuracy, correct, total


Semantic and Syntatic Test

In [50]:
semantic_acc, sem_correct, sem_total = evaluate_analogies(
    "country-capital.txt",
    W,
    word2index,
    index2word
)


In [51]:
syntactic_acc, syn_correct, syn_total = evaluate_analogies(
    "past-tense.txt",
    W,
    word2index,
    index2word
)

print(f"Semantic accuracy (capital-common-countries): {semantic_acc:.4f} ({sem_correct}/{sem_total})")
print(f"Syntactic accuracy (past-tense): {syntactic_acc:.4f} ({syn_correct}/{syn_total})")


Semantic accuracy (capital-common-countries): 0.0000 (0/5030)
Syntactic accuracy (past-tense): 0.0000 (0/1560)


In [52]:
import pandas as pd

similarity_df = pd.read_csv("combined.csv")
similarity_df.head()


Unnamed: 0,Word 1,Word 2,Human (mean)
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.0
3,book,paper,7.46
4,computer,keyboard,7.62


In [53]:
model_scores = []
human_scores = []
skipped = 0

for _, row in similarity_df.iterrows():
    w1 = row["Word 1"]
    w2 = row["Word 2"]
    human_score = row["Human (mean)"]

    if w1 not in word2index or w2 not in word2index:
        skipped += 1
        continue

    v1 = W_norm[word2index[w1]]
    v2 = W_norm[word2index[w2]]

    model_sim = np.dot(v1, v2)  # cosine similarity

    model_scores.append(model_sim)
    human_scores.append(human_score)

print(f"Used pairs: {len(model_scores)}")
print(f"Skipped OOV pairs: {skipped}")


Used pairs: 197
Skipped OOV pairs: 156


Spearman SImilarity Test


In [54]:

from scipy.stats import spearmanr

correlation, p_value = spearmanr(model_scores, human_scores)

print(f"Spearman correlation: {correlation:.4f}")
print(f"P-value: {p_value:.4e}")


Spearman correlation: 0.0444
P-value: 5.3529e-01


In [55]:
print("Example comparisons:")
for i in range(5):
    print(
        similarity_df.iloc[i, 0],
        similarity_df.iloc[i, 1],
        "Human:", human_scores[i],
        "Model:", round(model_scores[i], 3)
    )


Example comparisons:
love sex Human: 6.77 Model: -0.106
tiger cat Human: 7.46 Model: 0.125
tiger tiger Human: 5.77 Model: -0.092
book paper Human: 6.31 Model: -0.536
computer keyboard Human: 7.5 Model: -0.028


In [73]:
import pandas as pd

results = {
    "Model": ["Skipgram (NEG)"],
    "Window Size": ["2"],
    "Training Loss": [loss.item()],
    "Training time": [training_time],
    "Syntactic Accuracy": [syntactic_acc],
    "Semantic accuracy": [semantic_acc]
}

df_skipgram_neg = pd.DataFrame(results)
df_skipgram_neg


Unnamed: 0,Model,Window Size,Training Loss,Training time,Syntactic Accuracy,Semantic accuracy
0,Skipgram (NEG),2,2.68448,4.103402,0.0,0.0


In [None]:
import os
import json

def export_embeddings_and_corpus( #export_embeddings_and_corpus 
    output_dir: str,
    use_normalized: bool = True,
    top_k: int = None,
    include_sentences: bool = False,
    file_prefix: str = "skipgram"
):
    """
    Export embeddings and corpus to JSON files for web consumption.

    Args:
        output_dir: Directory to write output files.
        use_normalized: If True, use W_norm; otherwise use W.
        top_k: If set, export only the top_k most frequent words.
        include_sentences: If True, also export the tokenized sentences.
        file_prefix: Prefix for output file names.

    Writes:
        - {file_prefix}_embeddings.json: list of {word, vector}
        - {file_prefix}_corpus.json: {tokens, vocab_size, emb_size, normalized, stats, ...}
        - {file_prefix}_sentences.json (optional): list of token lists
    Returns:
        dict with written file paths.
    """
    os.makedirs(output_dir, exist_ok=True)

    E = W_norm if use_normalized else W  # uses existing variables
    emb_size_local = E.shape[1]

    # Determine which words to export
    if top_k is not None and top_k > 0 and 'word_count' in globals():
        # Top-k by frequency using existing word_count
        most_common = [w for w, _ in word_count.most_common(top_k)]
        # Ensure words exist in current vocab/index
        selected = [(word2index[w], w) for w in most_common if w in word2index]
    else:
        # Full vocab in index order
        selected = [(i, index2word[i]) for i in range(E.shape[0]) if i in index2word]

    # Build embeddings payload
    embeddings_payload = [
        {"word": w, "vector": E[i].tolist()}
        for i, w in selected
    ]

    embeddings_path = os.path.join(output_dir, f"{file_prefix}_embeddings.json")
    with open(embeddings_path, "w", encoding="utf-8") as f:
        json.dump(embeddings_payload, f, ensure_ascii=False)

    # Corpus/meta payload
    corpus_payload = {
        "tokens": corpus,  # flattened tokens for web search/index
        "vocab_size": len(vocabs),
        "emb_size": emb_size,
        "normalized": use_normalized,
        "selected_count": len(embeddings_payload),
        "stats": {
            "training_time": float(training_time),
            "final_loss": float(loss.item()) if isinstance(loss, (float, int)) else float(loss.item()),
            "spearman_correlation": float(correlation),
            "spearman_p_value": float(p_value),
            "syntactic_accuracy": float(syntactic_acc),
            "semantic_accuracy": float(semantic_acc),
        }
    }

    corpus_path = os.path.join(output_dir, f"{file_prefix}_corpus.json")
    with open(corpus_path, "w", encoding="utf-8") as f:
        json.dump(corpus_payload, f, ensure_ascii=False)

    sentences_path = None
    if include_sentences and 'sentences' in globals():
        sentences_path = os.path.join(output_dir, f"{file_prefix}_sentences.json")
        with open(sentences_path, "w", encoding="utf-8") as f:
            json.dump(sentences, f, ensure_ascii=False)

    return {
        "embeddings": embeddings_path,
        "corpus": corpus_path,
        "sentences": sentences_path
    }

In [None]:
export_embeddings_and_corpus( #export_embeddings_and_corpus
    output_dir="output_skipgram_neg",
    use_normalized=True,
    top_k=5000,
    include_sentences=True,
    file_prefix="skipgram_neg")

{'embeddings': 'output_skipgram_neg\\skipgram_neg_embeddings.json',
 'corpus': 'output_skipgram_neg\\skipgram_neg_corpus.json',
 'sentences': 'output_skipgram_neg\\skipgram_neg_sentences.json'}