# Day 3: Embeddings - From Tokens to Vectors

In this notebook, we'll explore how to transform discrete tokens into dense vector representations that capture semantic meaning. We'll implement both one-hot and learned embeddings, explore similarity measures, and visualize how embeddings encode relationships between words.

## Setup and Imports

In [None]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import seaborn as sns

# Plot styling: apply Matplotlib's 'seaborn-v0_8' style sheet, then make
# "husl" the default seaborn colour palette for every figure below.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# Matplotlib release — confirm against the environment's pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. One-hot Embeddings

One-hot encoding represents each token as a sparse vector with exactly one 1 and all other elements as 0. Let's implement a simple one-hot embedding class.

In [None]:
class OneHotEmbedding:
    """Simple one-hot embedding implementation.

    Every token ID is represented by a vector of length ``vocab_size`` that
    is 1.0 at the position of the ID and 0.0 everywhere else. IDs outside
    ``[0, vocab_size)`` are encoded as an all-zero vector.
    """

    def __init__(self, vocab_size):
        """Store the vocabulary size, which is also the vector length."""
        self.vocab_size = vocab_size
        self.embedding_dim = vocab_size  # Same as vocab size for one-hot

    def encode(self, token_ids):
        """Convert token IDs to one-hot vectors.

        Args:
            token_ids: A single integer ID (Python ``int`` or NumPy integer
                scalar) or a sized sequence of integer IDs.

        Returns:
            A float array of shape ``(len(token_ids), vocab_size)``. Rows for
            out-of-range IDs are left as all zeros.
        """
        # NumPy integer scalars (e.g. the result of ``argmax``) are not
        # instances of ``int``, so they need to be listed explicitly.
        if isinstance(token_ids, (int, np.integer)):
            token_ids = [token_ids]

        one_hot = np.zeros((len(token_ids), self.vocab_size))
        for row, token_id in enumerate(token_ids):
            if 0 <= token_id < self.vocab_size:
                one_hot[row, token_id] = 1.0

        return one_hot

    def similarity(self, token_id1, token_id2):
        """Compute the cosine similarity between two tokens.

        For valid IDs the result is 1.0 when the IDs are equal and 0.0
        otherwise, because distinct one-hot vectors are orthogonal.

        Returns:
            The cosine similarity, or 0.0 when either ID is out of range
            (its vector is all zeros, so the cosine is undefined).
        """
        vec1, vec2 = self.encode([token_id1, token_id2])

        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if norm_product == 0:
            # Avoid 0/0 -> nan for an all-zero (out-of-range) vector.
            return 0.0
        return np.dot(vec1, vec2) / norm_product

Let's test our one-hot embedding implementation:

In [None]:
# Toy vocabulary: each word is assigned its position in this list as token ID
vocab = {
    word: index
    for index, word in enumerate(
        ['the', 'cat', 'dog', 'sat', 'on', 'mat', 'ran', 'in', 'park', 'house']
    )
}

# One-hot embedding sized to the vocabulary
vocab_size = len(vocab)
one_hot_emb = OneHotEmbedding(vocab_size)

# Encode a handful of tokens in one call
tokens = [vocab[word] for word in ('the', 'cat', 'dog', 'mat')]
embeddings = one_hot_emb.encode(tokens)

print("One-hot embeddings:")
for token_id, vector in zip(tokens, embeddings):
    # Recover the word for this ID (first vocabulary entry with that value)
    token_name = next(k for k, v in vocab.items() if v == token_id)
    print(f"Token '{token_name}' (ID {token_id}): {vector}")

# Similarity of two different tokens, then of a token with itself
sim = one_hot_emb.similarity(vocab['cat'], vocab['dog'])
print(f"\nSimilarity between 'cat' and 'dog': {sim}")

sim_same = one_hot_emb.similarity(vocab['cat'], vocab['cat'])
print(f"Similarity between 'cat' and itself: {sim_same}")

### Visualizing One-hot Embeddings

Let's visualize our one-hot embeddings to understand their structure:

In [None]:
def visualize_one_hot(vocab, one_hot_emb):
    """Plot the one-hot vectors and their pairwise similarity as heatmaps.

    Args:
        vocab: Mapping from word to integer token ID.
        one_hot_emb: Object providing ``encode(token_ids)`` and
            ``similarity(id1, id2)``, e.g. a ``OneHotEmbedding``.

    Shows two figures: the encoding matrix (one row per word) and the
    word-by-word similarity matrix. Returns nothing.
    """
    # Get embeddings for all words
    words = list(vocab.keys())
    token_ids = [vocab[word] for word in words]
    embeddings = one_hot_emb.encode(token_ids)

    # Heatmap of the raw encodings. The x tick labels are derived from the
    # encoded matrix itself rather than from a notebook-level global, so the
    # function works for any embedding passed in.
    plt.figure(figsize=(12, 8))
    sns.heatmap(embeddings, cmap='Blues', cbar=True,
                xticklabels=range(embeddings.shape[1]),
                yticklabels=words)
    plt.title('One-hot Encodings')
    plt.xlabel('Dimension')
    plt.ylabel('Word')
    plt.show()

    # Pairwise similarity between every pair of words
    similarity_matrix = np.zeros((len(words), len(words)))
    for i, word1 in enumerate(words):
        for j, word2 in enumerate(words):
            similarity_matrix[i, j] = one_hot_emb.similarity(vocab[word1], vocab[word2])

    # Plot similarity matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(similarity_matrix, annot=True, cmap='YlGnBu',
                xticklabels=words, yticklabels=words)
    plt.title('One-hot Encoding Similarity Matrix')
    plt.show()

# Visualize one-hot embeddings: shows the encoding heatmap followed by the
# pairwise similarity heatmap for the demo vocabulary
visualize_one_hot(vocab, one_hot_emb)

### Limitations of One-hot Embeddings

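Let's quantify two limitations of one-hot embeddings as the vocabulary grows: the memory needed to store one vector per token (which grows with the square of the vocabulary size) and how sparse those vectors are: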

In [None]:
def analyze_one_hot_limitations(vocab_sizes):
    """Tabulate and plot the cost of one-hot vectors for several vocab sizes.

    For each size the table holds the memory (in MB) of a ``size x size``
    float32 matrix and the percentage of zero entries in a single one-hot
    vector.

    Args:
        vocab_sizes: Iterable of vocabulary sizes to evaluate.

    Returns:
        A pandas DataFrame with the columns ``vocab_size``, ``memory_mb``
        and ``sparsity``. Two line plots (memory and sparsity against
        vocabulary size) are shown as a side effect.
    """
    bytes_per_float32 = 4
    bytes_per_mb = 1024 * 1024

    # One record per vocabulary size
    records = [
        {
            'vocab_size': size,
            'memory_mb': size * size * bytes_per_float32 / bytes_per_mb,
            'sparsity': (size - 1) / size * 100,
        }
        for size in vocab_sizes
    ]

    import pandas as pd
    df = pd.DataFrame(records)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # (axis, column, title, y label, extra line options) for each panel
    panels = (
        (ax1, 'memory_mb', 'Memory Usage vs. Vocabulary Size', 'Memory (MB)', {}),
        (ax2, 'sparsity', 'Sparsity vs. Vocabulary Size', 'Sparsity (%)',
         {'color': 'green'}),
    )
    for axis, column, title, y_label, line_options in panels:
        axis.plot(df['vocab_size'], df[column], marker='o', linewidth=2,
                  **line_options)
        axis.set_title(title)
        axis.set_xlabel('Vocabulary Size')
        axis.set_ylabel(y_label)
        axis.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return df

# Analyze one-hot limitations with different vocabulary sizes (10 up to
# 50,000). The call shows the memory and sparsity plots and returns the
# per-size table, which is printed below.
vocab_sizes = [10, 100, 1000, 10000, 50000]
limitation_results = analyze_one_hot_limitations(vocab_sizes)
print(limitation_results)

## 2. Learned Embeddings

Learned embeddings map tokens to dense, low-dimensional vectors that can capture semantic relationships. Let's implement a learned embedding layer using PyTorch.

In [None]:
class LearnedEmbedding(nn.Module):
    """Learned embedding layer implementation.

    Wraps ``nn.Embedding`` with a small-variance normal initialization and
    adds NumPy-based helpers for inspecting individual embeddings and their
    cosine similarities.
    """

    def __init__(self, vocab_size, embedding_dim, padding_idx=None):
        """Create the embedding table.

        Args:
            vocab_size: Number of rows (tokens) in the embedding table.
            embedding_dim: Length of each embedding vector.
            padding_idx: Optional token ID whose embedding is set to zeros.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        # Initialize embedding matrix
        self.embedding = nn.Embedding(
            vocab_size,
            embedding_dim,
            padding_idx=padding_idx,
        )

        # Initialize with small random values
        nn.init.normal_(self.embedding.weight, mean=0, std=0.1)

        if padding_idx is not None:
            # The normal init above overwrote the padding row, so zero it
            # again. Done under no_grad instead of through ``.data``.
            with torch.no_grad():
                self.embedding.weight[padding_idx].fill_(0)

    def forward(self, token_ids):
        """Look up the embeddings for a tensor of token IDs."""
        return self.embedding(token_ids)

    def get_embedding(self, token_id):
        """Get the embedding for a specific token as a NumPy array.

        The result is an independent copy: modifying it does not change the
        layer's weights, and later weight updates do not change it.
        """
        # .cpu() makes the conversion work when the weights are not on the
        # CPU; .copy() breaks the memory sharing that .numpy() sets up with
        # a CPU tensor.
        return self.embedding.weight[token_id].detach().cpu().numpy().copy()

    def similarity(self, token_id1, token_id2):
        """Compute cosine similarity between two token embeddings."""
        emb1 = self.get_embedding(token_id1)
        emb2 = self.get_embedding(token_id2)

        return cosine_similarity([emb1], [emb2])[0][0]

    def most_similar(self, token_id, top_k=5):
        """Find the tokens most similar to the given token.

        Args:
            token_id: ID of the query token.
            top_k: Maximum number of neighbours to return.

        Returns:
            A list of ``(token_id, cosine_similarity)`` pairs in order of
            decreasing similarity, with the query token itself excluded.
        """
        target_emb = self.get_embedding(token_id)
        # Only read below, so a (possibly memory-sharing) view is sufficient.
        all_embeddings = self.embedding.weight.detach().cpu().numpy()

        similarities = cosine_similarity([target_emb], all_embeddings)[0]

        # Get top-k most similar (excluding the token itself)
        similar_indices = np.argsort(similarities)[::-1]
        similar_indices = similar_indices[similar_indices != token_id][:top_k]

        return [(idx, similarities[idx]) for idx in similar_indices]

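Let's test our learned embedding implementation. Note that the embeddings are only randomly initialized here and have not been trained, so the similarity scores below are arbitrary and will differ between runs unless a random seed is fixed: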

In [None]:
# Build a learned embedding; the dimension is kept small for visualization
embedding_dim = 8
learned_emb = LearnedEmbedding(vocab_size, embedding_dim)

# Run a forward pass on a few token IDs
token_ids = torch.tensor([vocab[word] for word in ('the', 'cat', 'dog', 'mat')])
embeddings = learned_emb(token_ids)

print(f"Learned embeddings shape: {embeddings.shape}")
print("\nEmbedding values:")
for token_id, vector in zip(token_ids, embeddings):
    # Recover the word for this ID (first vocabulary entry with that value)
    token_name = next(k for k, v in vocab.items() if v == token_id.item())
    print(f"Token '{token_name}': {vector.detach().numpy()}")

# Similarity of two different tokens, then of a token with itself
sim = learned_emb.similarity(vocab['cat'], vocab['dog'])
print(f"\nSimilarity between 'cat' and 'dog': {sim:.4f}")

sim_same = learned_emb.similarity(vocab['cat'], vocab['cat'])
print(f"Similarity between 'cat' and itself: {sim_same:.4f}")

### Visualizing Learned Embeddings

Let's visualize our learned embeddings:

In [None]:
def visualize_learned_embeddings(vocab, learned_emb):
    """Plot learned embeddings, their similarity matrix and a 2-D PCA view.

    Args:
        vocab: Mapping from word to integer token ID.
        learned_emb: Object providing ``get_embedding(token_id)`` and an
            ``embedding_dim`` attribute, e.g. a ``LearnedEmbedding``.

    Shows a heatmap of the embedding values, a heatmap of pairwise cosine
    similarities and, when there are more than two embedding dimensions and
    at least two words, a scatter plot of the first two principal
    components. Returns nothing.
    """
    # Get embeddings for all words
    words = list(vocab.keys())
    token_ids = [vocab[word] for word in words]

    # One row per word
    embeddings = np.array([learned_emb.get_embedding(token_id) for token_id in token_ids])

    # Create a heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(embeddings, cmap='coolwarm', cbar=True,
                xticklabels=range(learned_emb.embedding_dim),
                yticklabels=words)
    plt.title('Learned Embeddings')
    plt.xlabel('Dimension')
    plt.ylabel('Word')
    plt.show()

    # All pairwise cosine similarities in a single vectorized call, instead
    # of one similarity lookup per pair of words.
    similarity_matrix = cosine_similarity(embeddings)

    # Plot similarity matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(similarity_matrix, annot=True, fmt='.2f', cmap='YlGnBu',
                xticklabels=words, yticklabels=words)
    plt.title('Learned Embedding Similarity Matrix')
    plt.show()

    # PCA visualization if we have more than 2 dimensions. A two-component
    # PCA also needs at least two samples, so skip it for a one-word vocab.
    if learned_emb.embedding_dim > 2 and len(words) >= 2:
        pca = PCA(n_components=2)
        embeddings_2d = pca.fit_transform(embeddings)

        plt.figure(figsize=(10, 8))
        plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.7)

        # Add labels
        for word, (x, y) in zip(words, embeddings_2d):
            plt.annotate(word, (x, y), fontsize=12, alpha=0.8)

        plt.title('PCA of Learned Embeddings')
        plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
        plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
        plt.grid(True, alpha=0.3)
        plt.show()

# Visualize learned embeddings for the demo vocabulary (value heatmap,
# similarity heatmap and, for more than two dimensions, a PCA scatter plot)
visualize_learned_embeddings(vocab, learned_emb)