# Word2Vec (Skipgram )

# We use the NLTK Brown corpus as a real-world dataset.
# The corpus is tokenized into sentences and words, and all tokens
# are converted to lowercase to reduce vocabulary size.
# This preprocessing step is important to ensure consistent embeddings.


In [None]:
#importing Libraries
import torch
import numpy as np
import matplotlib.pyplot as plt
import nltk
import torch.nn as nn
import torch.optim as optim
import matplotlib
import time
import nltk.corpus 
from nltk.corpus import brown


In [2]:
np.__version__, torch.__version__, matplotlib.__version__

('2.0.2', '2.8.0+cpu', '3.9.4')

## 1. Load data

# Load the Brown corpus from NLTK as a real-world text dataset.
# This corpus contains texts from multiple genres, which helps
# the model learn more diverse word contexts.


In [3]:
# Load Reuters corpus (real-world data for final training)
nltk.download('brown')
nltk.download('punkt')
sentences = brown.sents(categories="news")
sentences = [[word.lower() for word in sent] for sent in sentences]



[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\aashu\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aashu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Dataset

We use the Brown news dataset provided by the NLTK library.  
This dataset contains news articles across multiple topics and is commonly used for NLP research.
Using this dataset allows us to train word embeddings on real-world text instead of toy examples.


In [4]:
# Step 2: Tokenize Reuters corpus

tokenized_sentences = sentences

print("Number of tokenized sentences:", len(tokenized_sentences))
print("Sample tokenized sentence (first 30 words):")
print(tokenized_sentences[0][:30])

Number of tokenized sentences: 4623
Sample tokenized sentence (first 30 words):
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


# Generate (target, context) word pairs for Skip-gram training.
# For each target word, surrounding context words are selected
# using a dynamic window size (default = 2).


In [5]:
def generate_context(sentence, center_idx, window_size=2):
    """
    Generate context words for a given center word index using a dynamic window size.
    """
    start = max(0, center_idx - window_size)
    end = min(len(sentence), center_idx + window_size + 1)
    return [sentence[j] for j in range(start, end) if j != center_idx]


def generate_skipgram_pairs(sentences, window_size=2):
    """
    Generate (center, context) pairs for Skip-gram training.
    """
    pairs = []
    for sentence in sentences:
        for i, center in enumerate(sentence):
            context_words = generate_context(sentence, i, window_size)
            for context in context_words:
                pairs.append((center, context))
    return pairs


# Default window size
window_size = 2

# Generate Skip-gram training pairs
pairs = generate_skipgram_pairs(tokenized_sentences, window_size)
print("Number of (center, context) pairs generated:", len(pairs))
print("Sample (center, context) pairs:", pairs[:10])


Number of (center, context) pairs generated: 374548
Sample (center, context) pairs: [('The', 'Fulton'), ('The', 'County'), ('Fulton', 'The'), ('Fulton', 'County'), ('Fulton', 'Grand'), ('County', 'The'), ('County', 'Fulton'), ('County', 'Grand'), ('County', 'Jury'), ('Grand', 'Fulton')]


## 2. Prepare train data

In [6]:
def random_batch(batch_size, sentences, word2index, window_size=2):
    """
    Generate a random batch of (center, context) word index pairs
    using a dynamic window size.
    """
    inputs = []
    labels = []

    while len(inputs) < batch_size:
        # Randomly choose a sentence
        sentence = sentences[np.random.randint(len(sentences))]
        
        # Randomly choose a center word index
        center_idx = np.random.randint(len(sentence))
        center_word = sentence[center_idx]
        center = word2index.get(center_word, word2index["<UNK>"])

        # Generate context words dynamically
        context_words = generate_context(sentence, center_idx, window_size)

        for context_word in context_words:
            if len(inputs) >= batch_size:
                break
            context = word2index.get(context_word, word2index["<UNK>"])
            inputs.append(center)
            labels.append(context)

    return np.array(inputs), np.array(labels)


## 3. Model

$$J(\theta) = -\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-m \leq j \leq m \\ j \neq 0}}\log P(w_{t+j} | w_t; \theta)$$

where $P(w_{t+j} | w_t; \theta) = $

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

where $o$ is the outside words and $c$ is the center word

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

In [7]:
class Skipgram(nn.Module):
    def __init__(self, voc_size, emb_size):
        super().__init__()
        self.embedding_center = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)

    def forward(self, center, outside, all_vocabs):
        """
        center: (batch_size, 1)
        outside: (batch_size, 1)
        all_vocabs: (batch_size, voc_size)
        """
        v_c = self.embedding_center(center)          # (B, 1, D)
        u_o = self.embedding_outside(outside)        # (B, 1, D)
        u_all = self.embedding_outside(all_vocabs)   # (B, V, D)

        # Positive score
        score = torch.bmm(u_o, v_c.transpose(1, 2)).squeeze(2)

        # All vocabulary scores
        all_scores = torch.bmm(u_all, v_c.transpose(1, 2)).squeeze(2)

        log_probs = torch.log_softmax(all_scores, dim=1)

        loss = -torch.mean(log_probs.gather(1, outside))
        return loss


In [8]:
unique_words = set()
for doc in tokenized_sentences:
    unique_words.update(doc)

vocabs = list(unique_words)
vocabs.append("<UNK>")

word2index = {word: idx for idx, word in enumerate(vocabs)}
tokenized_corpus = tokenized_sentences


In [9]:
voc_size = len(vocabs)
emb_size = 8   # recommended: 50 (fast) or 100 (better)


In [10]:
def prepare_all_vocabs(vocabs, word2index, batch_size):
    """
    Prepare a tensor containing all vocabulary indices,
    expanded for batch-wise full softmax computation.
    """
    idxs = [word2index.get(w, word2index["<UNK>"]) for w in vocabs]
    all_vocab_tensor = torch.LongTensor(idxs).unsqueeze(0)
    return all_vocab_tensor.expand(batch_size, len(vocabs))


## 4. Training

# Train the Skip-gram model over multiple epochs.
# The training loss is tracked to monitor convergence
# and evaluate how well the embeddings are learned.


In [11]:
model = Skipgram(voc_size, emb_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [12]:
model.train()
num_epochs = 1000
batch_size = 8
window_size = 2  # DEFAULT (assignment requirement)




In [15]:
start_time = time.time()
loss_history = []

num_batches = 100  # debug limit (increase later)

for epoch in range(num_epochs):
    epoch_loss = 0.0

    for train_epoch in range(num_batches):
        input_batch, label_batch = random_batch(
            batch_size,
            tokenized_corpus,
            word2index,
            window_size
        )

        input_tensor = torch.LongTensor(input_batch).unsqueeze(1)
        label_tensor = torch.LongTensor(label_batch).unsqueeze(1)

        all_vocabs = prepare_all_vocabs(vocabs, word2index, batch_size)

        loss = model(input_tensor, label_tensor, all_vocabs)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / num_batches
    loss_history.append(avg_loss)

    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {avg_loss:.4f}")

end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")


Epoch [1/1000] - Loss: 7.6487
Epoch [2/1000] - Loss: 7.4810
Epoch [3/1000] - Loss: 7.6563
Epoch [4/1000] - Loss: 7.4523
Epoch [5/1000] - Loss: 7.4586
Epoch [6/1000] - Loss: 7.6114
Epoch [7/1000] - Loss: 7.4958
Epoch [8/1000] - Loss: 7.5064
Epoch [9/1000] - Loss: 7.6241
Epoch [10/1000] - Loss: 7.7356
Epoch [11/1000] - Loss: 7.5463
Epoch [12/1000] - Loss: 7.4844
Epoch [13/1000] - Loss: 7.3391
Epoch [14/1000] - Loss: 7.4598
Epoch [15/1000] - Loss: 7.3833
Epoch [16/1000] - Loss: 7.4574
Epoch [17/1000] - Loss: 7.6435
Epoch [18/1000] - Loss: 7.4926
Epoch [19/1000] - Loss: 7.5898
Epoch [20/1000] - Loss: 7.3970
Epoch [21/1000] - Loss: 7.3083
Epoch [22/1000] - Loss: 7.6136
Epoch [23/1000] - Loss: 7.4591
Epoch [24/1000] - Loss: 7.6034
Epoch [25/1000] - Loss: 7.5899
Epoch [26/1000] - Loss: 7.2284
Epoch [27/1000] - Loss: 7.3611
Epoch [28/1000] - Loss: 7.6273
Epoch [29/1000] - Loss: 7.5318
Epoch [30/1000] - Loss: 7.4335
Epoch [31/1000] - Loss: 7.4373
Epoch [32/1000] - Loss: 7.2315
Epoch [33/1000] -

## 5. Experiments

In [23]:
#more formally is to divide by its norm
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity


In [25]:



# Step 1: build final embedding matrix
W = (model.embedding_center.weight.detach().cpu().numpy() +
     model.embedding_outside.weight.detach().cpu().numpy()) / 2

# Normalize embeddings for fast cosine similarity
W_norm = W / np.linalg.norm(W, axis=1, keepdims=True)



def get_vector(word):
    if word not in word2index:
        return None
    return W_norm[word2index[word]] 


In [26]:
index2word = {idx: word for word, idx in word2index.items()}
def predict_analogy(a, b, c, W_norm, word2index, index2word):
    if a not in word2index or b not in word2index or c not in word2index:
        return None

    va = W_norm[word2index[a]]
    vb = W_norm[word2index[b]]
    vc = W_norm[word2index[c]]

    # Vector arithmetic: b - a + c
    target = vb - va + vc
    target = target / np.linalg.norm(target)

    # Cosine similarity with ALL words at once
    similarities = np.dot(W_norm, target)

    # Exclude input words
    for w in (a, b, c):
        similarities[word2index[w]] = -1

    best_index = np.argmax(similarities)
    return index2word[best_index]



In [27]:

def evaluate_analogies(file_path, W_norm, word2index, index2word):
    total = 0
    correct = 0

    with open(file_path, "r") as f:
        for line in f:
            words = line.strip().split()
            if len(words) != 4:
                continue

            a, b, c, d = words
            prediction = predict_analogy(
                a, b, c, W_norm, word2index, index2word
            )

            if prediction is None:
                continue

            total += 1
            if prediction == d:
                correct += 1

    accuracy = correct / total if total > 0 else 0
    return accuracy, correct, total


Semantic and Syntactic Test

In [28]:
semantic_acc, sem_correct, sem_total = evaluate_analogies(
    "country-capital.txt",
    W,
    word2index,
    index2word
)


In [29]:
syntactic_acc, syn_correct, syn_total = evaluate_analogies(
    "past-tense.txt",
    W,
    word2index,
    index2word
)

print(f"Semantic accuracy (capital-common-countries): {semantic_acc:.4f} ({sem_correct}/{sem_total})")
print(f"Syntactic accuracy (past-tense): {syntactic_acc:.4f} ({syn_correct}/{syn_total})")


Semantic accuracy (capital-common-countries): 0.0000 (0/144)
Syntactic accuracy (past-tense): 0.0000 (0/648)


In [None]:
# !pip install pandas


In [30]:
import pandas as pd

similarity_df = pd.read_csv("combined.csv")
similarity_df.head()


Unnamed: 0,Word 1,Word 2,Human (mean)
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.0
3,book,paper,7.46
4,computer,keyboard,7.62


In [31]:
model_scores = []
human_scores = []
skipped = 0

for _, row in similarity_df.iterrows():
    w1 = row["Word 1"]
    w2 = row["Word 2"]
    human_score = row["Human (mean)"]

    if w1 not in word2index or w2 not in word2index:
        skipped += 1
        continue

    v1 = W_norm[word2index[w1]]
    v2 = W_norm[word2index[w2]]

    model_sim = np.dot(v1, v2)  # cosine similarity

    model_scores.append(model_sim)
    human_scores.append(human_score)

print(f"Used pairs: {len(model_scores)}")
print(f"Skipped OOV pairs: {skipped}")


Used pairs: 191
Skipped OOV pairs: 162


Spearman Similarity Test

In [None]:
# !pip install scipy

In [35]:

from scipy.stats import spearmanr

correlation, p_value = spearmanr(model_scores, human_scores)

print(f"Spearman correlation: {correlation:.4f}")
print(f"P-value: {p_value:.4e}")

Spearman correlation: -0.0602
P-value: 4.0807e-01


In [36]:
print(" comparisons:")
for i in range(5):
    print(
        similarity_df.iloc[i, 0],
        similarity_df.iloc[i, 1],
        "Human:", human_scores[i],
        "Model:", round(model_scores[i], 3)
    )


 comparisons:
love sex Human: 6.77 Model: 0.943
tiger cat Human: 7.46 Model: 0.893
tiger tiger Human: 5.77 Model: 0.919
book paper Human: 6.31 Model: 0.99
computer keyboard Human: 7.5 Model: 0.943


In [45]:
import pandas as pd

results = {
    "Model": ["Skipgram"],
    "Window Size": [window_size],
    "Training Loss": [avg_loss],
    "Training time": [end_time - start_time],
    "Syntactic Accuracy": [syntactic_acc],
    "Semantic accuracy": [semantic_acc]
}

df_skipgram = pd.DataFrame(results)
df_skipgram


Unnamed: 0,Model,Window Size,Training Loss,Training time,Syntactic Accuracy,Semantic accuracy
0,Skipgram,2,7.503079,1494.55755,0.0,0.0
