# 🧠 Word2Vec from Scratch (Skip-gram, PyTorch Version)

This notebook implements a minimal Word2Vec Skip-gram model without `gensim`, using PyTorch. It demonstrates training word embeddings on a small corpus with full visibility into the math.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict
import random

## 📄 Toy Corpus

In [None]:
# Toy training corpus: four short, lower-cased English sentences.
# Each entry is one sentence; words are separated by single spaces.
corpus = [
    "thank you very much",
    "thank you for coming",
    "you are welcome",
    "thank you again",
]

## 🔠 Tokenize and Create Vocabulary

In [None]:
# Whitespace-tokenize every sentence into a list of word strings.
tokenized = [line.split() for line in corpus]

# Unique words across all sentences, sorted so that index assignment is
# deterministic from run to run.
vocab = sorted(set().union(*tokenized))

# Two-way lookup tables between words and their integer ids.
word2idx = dict(zip(vocab, range(len(vocab))))
idx2word = dict(enumerate(vocab))
vocab_size = len(vocab)

print("Vocabulary:", word2idx)

## 🧾 Generate Training Data (Skip-gram pairs)

In [None]:
# Maximum distance (in tokens) between a center word and a context word.
window_size = 2
# Accumulates (center_id, context_id) skip-gram pairs.
training_data = []

for tokens in tokenized:
    ids = [word2idx[tok] for tok in tokens]
    sentence_len = len(ids)
    for center_pos, center_id in enumerate(ids):
        # Clamp the window to the sentence boundaries instead of testing
        # each offset for being out of range.
        window_start = max(0, center_pos - window_size)
        window_stop = min(sentence_len, center_pos + window_size + 1)
        for context_pos in range(window_start, window_stop):
            # A word is never paired with itself.
            if context_pos != center_pos:
                training_data.append((center_id, ids[context_pos]))

print(f"Total training pairs: {len(training_data)}\nExample:", training_data[:5])

## 🧠 Define Word2Vec Model

In [None]:
class Word2Vec(nn.Module):
    """Skip-gram model with separate input (center) and output (context) tables.

    Args:
        vocab_size: Number of rows in each embedding table.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Looked up for center words.
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        # Scored against the center vector for every candidate context word.
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_words):
        """Return the input embeddings for the given center-word indices."""
        return self.in_embed(center_words)

    def predict(self, center_words):
        """Return one score per vocabulary word for each center word.

        The score is the dot product between the center word's input
        embedding and each row of the output embedding table, giving a
        tensor of shape (batch, vocab_size) for a 1-D index tensor.
        """
        hidden = self.in_embed(center_words)   # (batch, embed_dim)
        output_table = self.out_embed.weight   # (vocab_size, embed_dim)
        return hidden @ output_table.T         # (batch, vocab_size)

## 🔧 Train Word2Vec Model

In [None]:
embedding_dim = 10

# Seed both RNGs in play (weight initialisation uses torch, shuffling uses
# `random`) so that a fresh "Restart & Run All" reproduces the same run.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is present, otherwise fall back to the CPU. An
# unconditional .cuda() raises on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Word2Vec(vocab_size, embedding_dim).to(device)
# CrossEntropyLoss applies log-softmax over the vocabulary scores itself,
# so the model hands it raw scores.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Convert each (center, context) id pair to a pair of length-1 tensors,
# created directly on the target device.
training_pairs = [
    (torch.tensor([c], device=device), torch.tensor([t], device=device))
    for c, t in training_data
]

for epoch in range(50):
    # Sum (not mean) of the per-pair losses seen during this epoch.
    total_loss = 0.0
    random.shuffle(training_pairs)
    for center, target in training_pairs:
        logits = model.predict(center)  # shape: (1, vocab_size)
        loss = loss_fn(logits, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

## 🔍 View Trained Embeddings

The learned embedding vectors are printed in the final cell of this notebook, after the PCA plot.

## 📊 Visualize Word Embeddings (2D Projection using PCA)

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Read the input-embedding matrix straight from the model so this cell does
# not depend on a variable that is only assigned in a later cell.
# detach() drops autograd tracking and cpu() makes the data NumPy-compatible.
embedding_matrix = model.in_embed.weight.detach().cpu().numpy()

# Reduce dimensions to 2D using PCA
pca = PCA(n_components=2)
reduced = pca.fit_transform(embedding_matrix)

# Plot one labelled point per vocabulary word.
plt.figure(figsize=(8, 6))
for i, word in idx2word.items():
    x, y = reduced[i]
    plt.scatter(x, y)
    # Small offset so the label does not sit on top of its marker.
    plt.text(x + 0.01, y + 0.01, word, fontsize=12)
plt.title("Word Embeddings (PCA Projection)")
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.grid(True)
plt.show()

In [None]:
# Snapshot the learned input (center-word) embeddings as a CPU tensor.
# detach() is the supported way to get a tensor without autograd tracking;
# it replaces the legacy `.data` attribute.
embeddings = model.in_embed.weight.detach().cpu()

# Print one row per vocabulary word, rounded to 3 decimals for readability.
for i, word in idx2word.items():
    vec = embeddings[i].numpy().round(3)
    print(f"{word:10s}: {vec}")