## Continuous Bag of Words (CBOW) Implementation

This notebook implements the Continuous Bag of Words (CBOW) model for Word2Vec embeddings.

### Overview
- **CBOW**: Predicts a target word from surrounding context words
- **Dataset**: text8 corpus (cleaned Wikipedia articles)
- **Framework**: PyTorch


In [16]:
# Installing the necessary libraries
# Using python -m pip ensures packages are installed for the current kernel's Python version
!pip3 install -U gensim matplotlib scikit-learn torch
# Note: zipfile is a built-in Python module, no installation needed


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [30]:
# Download the text8 corpus (cleaned Wikipedia articles by Matt Mahoney)
# Using curl instead of wget (macOS compatible)
import os
if not os.path.exists('text8'):
    if not os.path.exists('text8.zip'):
        print("Downloading text8 corpus...")
        !curl -O https://s3.amazonaws.com/video.udacity-data.com/topher/2018/October/5bbe6499_text8/text8.zip
    print("Extracting text8 corpus...")
    !unzip -q text8.zip
    print("Download and extraction complete!")
else:
    print("text8 corpus already exists.")

text8 corpus already exists.


In [32]:
# Configure matplotlib for inline plotting
%matplotlib inline
%config InlineBackend.figure_format = "retina"

# Standard library imports
import time
import random
from collections import Counter

# Third-party imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Confirm the imports succeeded and record the library versions in use
print(
    "All libraries imported successfully!",
    f"PyTorch version: {torch.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)

All libraries imported successfully!
PyTorch version: 2.9.1
NumPy version: 2.3.4


### Load and Preprocess the Data


In [33]:
# Load the text8 corpus.
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding.
with open('text8', 'r', encoding='utf-8') as f:
    text = f.read()
# Split on whitespace into a flat list of word tokens
words = text.split()
print(f"Total words in corpus: {len(words):,}")
print(f"First 50 words: {' '.join(words[:50])}")


Total words in corpus: 17,005,207
First 50 words: anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the


### Build Vocabulary

In [34]:
# Count how often each word occurs and keep only those seen at least
# `min_count` times; everything rarer is dropped from the vocabulary
min_count = 5
word_counts = Counter(words)
vocab = {token: freq for token, freq in word_counts.items() if freq >= min_count}

# Two lookup tables over the kept words, numbered in the order `vocab` iterates:
#   word_to_idx: word -> integer id
#   idx_to_word: integer id -> word
word_to_idx = {token: position for position, token in enumerate(vocab)}
idx_to_word = dict(enumerate(vocab))

vocab_size = len(word_to_idx)
print(f"Vocabulary size: {vocab_size:,}")
print(f"Total unique words (before filtering): {len(word_counts):,}")
print(f"Words removed (count < {min_count}): {len(word_counts) - vocab_size:,}")
print(f"\nSample vocabulary words: {list(vocab.keys())[:20]}")


Vocabulary size: 71,290
Total unique words (before filtering): 253,854
Words removed (count < 5): 182,564

Sample vocabulary words: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'english', 'revolution', 'and']


### Prepare Training Data


In [35]:
# Encode the corpus as vocabulary ids. Words that were filtered out of the
# vocabulary have no id and are skipped, so their neighbours become adjacent.
words_idx = [idx for idx in map(word_to_idx.get, words) if idx is not None]

# CBOW maps context words -> target word.
# This many words are taken on each side of the target (2 before + 2 after).
window_size = 2
def create_cbow_data(words_idx, window_size):
    """Build the CBOW training examples from an index-encoded corpus.

    Every position that has `window_size` neighbours on both sides is used as a
    target. Its context is those neighbours in corpus order (left side first),
    with the target itself left out.

    Args:
        words_idx: Word indices in corpus order (a list in this notebook).
        window_size: Number of context words taken from each side of the target.

    Returns:
        List of (context, target) tuples, where context holds 2 * window_size
        indices and target is a single index. Empty if the corpus is too short
        to fit one full window.
    """
    # Targets run from the first position with a full left window up to (but
    # not including) the first position whose right window would be cut short
    stop = len(words_idx) - window_size
    return [
        (
            words_idx[centre - window_size:centre]
            + words_idx[centre + 1:centre + window_size + 1],
            words_idx[centre],
        )
        for centre in range(window_size, stop)
    ]

# Turn the encoded corpus into (context, target) examples
print("Creating training pairs...")
train_data = create_cbow_data(words_idx=words_idx, window_size=window_size)
print(f"Total training pairs: {len(train_data):,}")
# Decode the first pair back into words as a quick sanity check
print(f"Sample pair: context={[idx_to_word[idx] for idx in train_data[0][0]]}, target={idx_to_word[train_data[0][1]]}")


Creating training pairs...
Total training pairs: 16,718,840
Sample pair: context=['anarchism', 'originated', 'a', 'term'], target=as


### Define CBOW Model


In [36]:
class CBOW(nn.Module):
    """Word2Vec CBOW network: averaged context embeddings -> vocabulary logits.

    Attributes:
        embeddings: Lookup table holding one `embedding_dim`-sized vector per
            vocabulary word.
        linear: Projection from the embedding space to one score per
            vocabulary word.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, context):
        """Score every vocabulary word as the target for each context window.

        Args:
            context: Tensor of shape (batch_size, context_size) containing
                context word indices.

        Returns:
            Tensor of shape (batch_size, vocab_size) with unnormalised scores
            (logits); no softmax is applied here.
        """
        # (batch_size, context_size) -> (batch_size, context_size, embedding_dim)
        context_vectors = self.embeddings(context)
        # Collapse the context axis by averaging -> (batch_size, embedding_dim)
        pooled = context_vectors.mean(dim=1)
        # Project onto the vocabulary -> (batch_size, vocab_size)
        return self.linear(pooled)

# Model hyperparameters
embedding_dim = 100

# Seed the global RNGs of every random source this notebook imports before the
# model is built, so the random weight initialisation (and anything later that
# draws from these global generators) is repeatable on a fresh kernel run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

model = CBOW(vocab_size, embedding_dim)
print(f"Model created with {vocab_size:,} vocabulary size and {embedding_dim} embedding dimensions")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")


Model created with 71,290 vocabulary size and 100 embedding dimensions
Total parameters: 14,329,290


### Training Setup


In [38]:
# Knobs for the optimisation run
learning_rate = 1e-3
num_epochs, batch_size = 1, 128

# Cross-entropy over the vocabulary logits, minimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Create data loader
from torch.utils.data import Dataset, DataLoader

class CBOWDataset(Dataset):
    """Map-style dataset over precomputed CBOW (context, target) pairs."""

    def __init__(self, data):
        # Stored by reference (no copy); must support len() and integer
        # indexing, with each item unpacking into (context, target)
        self.data = data

    def __len__(self):
        """Number of (context, target) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at `idx` as two int64 tensors: (context, target)."""
        context_ids, target_id = self.data[idx]
        context_tensor = torch.tensor(context_ids, dtype=torch.long)
        target_tensor = torch.tensor(target_id, dtype=torch.long)
        return context_tensor, target_tensor

# Wrap the training pairs and serve them in shuffled mini-batches
dataset = CBOWDataset(train_data)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=batch_size)

# Summarise the run configuration
print(
    f"Training setup complete!",
    f"Epochs: {num_epochs}",
    f"Batch size: {batch_size}",
    f"Learning rate: {learning_rate}",
    f"Total batches per epoch: {len(dataloader):,}",
    sep="\n",
)


Training setup complete!
Epochs: 1
Batch size: 128
Learning rate: 0.001
Total batches per epoch: 130,616


### Train the Model


In [None]:
# Training loop
# Runs `num_epochs` passes over `dataloader`, updating `model` in place, and
# records one average loss per epoch in `losses`.
# NOTE: re-running this cell resets `losses` but does NOT re-create `model` or
# `optimizer`, so training continues from the current weights.
model.train()  # put the module in training mode
losses = []  # one entry per completed epoch: mean of that epoch's batch losses

print("Starting training...")
for epoch in range(num_epochs):
    total_loss = 0  # running sum of per-batch losses for this epoch
    start_time = time.time()
    
    # Batches are used exactly as the dataloader yields them; nothing here
    # moves tensors to another device
    for batch_idx, (context_batch, target_batch) in enumerate(dataloader):
        # Zero gradients left over from the previous step
        optimizer.zero_grad()
        
        # Forward pass: one row of vocabulary scores per example in the batch
        logits = model(context_batch)
        
        # Calculate loss between the scores and the true target indices
        loss = criterion(logits, target_batch)
        
        # Backward pass, then apply the parameter update
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        # Print progress every 10000 batches
        # (the value shown is the loss of this single batch, not a running average)
        if (batch_idx + 1) % 10000 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx+1}/{len(dataloader)}, Loss: {loss.item():.4f}")
    
    # Epoch summary: mean batch loss and wall-clock duration
    avg_loss = total_loss / len(dataloader)
    losses.append(avg_loss)
    elapsed_time = time.time() - start_time
    
    print(f"Epoch {epoch+1}/{num_epochs} completed in {elapsed_time:.2f}s, Average Loss: {avg_loss:.4f}")
    print("-" * 50)

print("Training complete!")


Starting training...
Epoch 1/1, Batch 10000/130616, Loss: 6.6782


### Visualize Training Loss


In [None]:
# Plot training loss
# The x-axis is built from the losses actually recorded rather than from
# num_epochs, so the cell still works when training was interrupted or
# num_epochs was edited after training (otherwise x and y lengths mismatch).
epochs_run = range(1, len(losses) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epochs_run, losses, marker='o', linewidth=2, markersize=8)
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Average Loss', fontsize=12)
ax.set_title('CBOW Training Loss', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()


### Extract Word Embeddings


In [None]:
# Extract learned word embeddings
model.eval()
# One row per vocabulary word, in word_to_idx order
embeddings = model.embeddings.weight.data.cpu().numpy()

print(f"Embedding matrix shape: {embeddings.shape}")
# Show a sample row under the label of the word it really belongs to: fall back
# to the word at index 0 if 'the' is missing, instead of printing row 0 as 'the'
sample_word = 'the' if 'the' in word_to_idx else idx_to_word[0]
print(f"Sample embedding for '{sample_word}': {embeddings[word_to_idx[sample_word]][:10]}...")


### Visualize Word Embeddings (t-SNE)


In [None]:
# Visualize embeddings using t-SNE
# Select a subset of common words for visualization
common_words = ['the', 'of', 'and', 'in', 'to', 'a', 'is', 'that', 'for', 'it',
                'as', 'was', 'with', 'be', 'on', 'not', 'he', 'i', 'this', 'but',
                'from', 'they', 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would',
                'there', 'their', 'what', 'so', 'up', 'out', 'if', 'about', 'who', 'get']

# Keep only the words present in the vocabulary, together with their vectors
word_list = [word for word in common_words if word in word_to_idx]
embedding_list = [embeddings[word_to_idx[word]] for word in word_list]

# t-SNE needs a perplexity that is positive and smaller than the number of
# points, so at least two words are required (one word would give perplexity 0)
if len(word_list) >= 2:
    embedding_array = np.array(embedding_list)
    
    # Apply t-SNE for dimensionality reduction to 2D
    print("Applying t-SNE...")
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(word_list)-1))
    embeddings_2d = tsne.fit_transform(embedding_array)
    
    # Plot
    plt.figure(figsize=(14, 10))
    plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.6)
    
    # Annotate points with words
    for i, word in enumerate(word_list):
        plt.annotate(word, (embeddings_2d[i, 0], embeddings_2d[i, 1]), 
                    fontsize=9, alpha=0.7)
    
    plt.title('Word Embeddings Visualization (t-SNE)', fontsize=14, fontweight='bold')
    plt.xlabel('t-SNE Dimension 1', fontsize=12)
    plt.ylabel('t-SNE Dimension 2', fontsize=12)
    plt.tight_layout()
    plt.show()
elif word_list:
    print("Only one common word found in vocabulary; t-SNE needs at least two points.")
else:
    print("No common words found in vocabulary for visualization.")


### Find Similar Words


In [None]:
def find_similar_words(word, embeddings, word_to_idx, idx_to_word, top_k=10):
    """Print and return the words closest to `word` by cosine similarity.

    The query word is excluded by its index, not by assuming it ranks first, so
    it is never listed as its own neighbour even when other rows tie with it
    (e.g. duplicate vectors).

    Args:
        word: Query word.
        embeddings: 2-D array with one embedding row per vocabulary index.
        word_to_idx: Mapping from word to its row index in `embeddings`.
        idx_to_word: Mapping from row index back to the word.
        top_k: Maximum number of neighbours to report; values <= 0 give none.

    Returns:
        List of (word, similarity) tuples, most similar first. Empty if `word`
        is not in the vocabulary (a message is printed in that case).
    """
    if word not in word_to_idx:
        print(f"Word '{word}' not in vocabulary.")
        return []

    word_idx = word_to_idx[word]

    # Scale every row to unit length (the epsilon guards all-zero rows), so a
    # plain dot product with the query row is the cosine similarity
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    normalized_embeddings = embeddings / (norms + 1e-8)
    similarities = normalized_embeddings @ normalized_embeddings[word_idx]

    # Rank best-first; the stable sort makes the order of ties deterministic
    ranking = np.argsort(-similarities, kind="stable")
    # Remove the query's own row explicitly, then keep the best top_k
    ranking = ranking[ranking != word_idx][:max(top_k, 0)]
    top_indices = [int(idx) for idx in ranking]

    print(f"Words similar to '{word}':")
    for rank, idx in enumerate(top_indices, 1):
        print(f"{rank}. {idx_to_word[idx]} (similarity: {similarities[idx]:.4f})")

    return [(idx_to_word[idx], float(similarities[idx])) for idx in top_indices]

# Try the lookup on a few hand-picked words; any that are missing from the
# vocabulary are skipped without a message
test_words = ['king', 'queen', 'man', 'woman', 'good', 'bad']
for word in test_words:
    if word not in word_to_idx:
        continue
    find_similar_words(word, embeddings, word_to_idx, idx_to_word)
    print()
