# Day 3: Embeddings - Part 4: Pre-trained Embeddings and Practical Exercises

In this notebook, we'll explore pre-trained embeddings and work through practical exercises to deepen our understanding of embeddings.

In [None]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import seaborn as sns
import pandas as pd

# Set style for plots
# Apply the Matplotlib style sheet named 'seaborn-v0_8' to all later figures.
# NOTE(review): this style name appears to exist only in newer Matplotlib
# releases (older ones call it 'seaborn') — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
# Make seaborn's "husl" palette the default color cycle.
sns.set_palette("husl")

## 5. Pre-trained Embeddings

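Let's explore pre-trained embeddings like Word2Vec and GloVe. To keep this notebook self-contained, we'll simulate them with a small vocabulary of hand-crafted 4-dimensional vectors to demonstrate the concepts; real pre-trained embeddings are learned from large corpora and typically have hundreds of dimensions.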

In [None]:
def demonstrate_word2vec_concepts():
    """Demonstrate key Word2Vec concepts with simple examples.

    The function works on a tiny, hand-crafted vocabulary and:

    1. plots a 2-D PCA projection of the word vectors,
    2. solves three analogies with vector arithmetic and prints the results,
    3. plots each word's dot product with a "royal" direction (king - man)
       and a "gender" direction (woman - man).

    Returns:
        dict: mapping from word (str) to its 4-dimensional ``np.ndarray``.
    """

    # Simulate Word2Vec-style relationships
    # In real Word2Vec, these emerge from training
    word_vectors = {
        'king': np.array([0.8, 0.2, 0.9, 0.1]),
        'queen': np.array([0.7, 0.8, 0.9, 0.2]),
        'man': np.array([0.9, 0.1, 0.2, 0.1]),
        'woman': np.array([0.8, 0.9, 0.2, 0.2]),
        'prince': np.array([0.85, 0.15, 0.8, 0.1]),
        'princess': np.array([0.75, 0.85, 0.8, 0.2]),
        'uncle': np.array([0.7, 0.1, 0.3, 0.1]),
        'aunt': np.array([0.6, 0.9, 0.3, 0.2]),
        'cat': np.array([0.4, 0.3, 0.1, 0.8]),
        'dog': np.array([0.5, 0.3, 0.2, 0.7])
    }

    # Visualize embeddings with PCA
    words = list(word_vectors.keys())
    embeddings = np.array([word_vectors[word] for word in words])

    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(embeddings)

    plt.figure(figsize=(10, 8))
    plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.7)

    # Add labels
    for i, word in enumerate(words):
        plt.annotate(word, (embeddings_2d[i, 0], embeddings_2d[i, 1]),
                    fontsize=12, alpha=0.8)

    plt.title('PCA of Word Vectors')
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
    plt.grid(True, alpha=0.3)
    plt.show()

    # Demonstrate vector analogies
    def vector_analogy(a, b, c, word_vectors):
        """Solve analogy: a is to b as c is to ?

        Args:
            a, b, c: words that must be keys of ``word_vectors``.
            word_vectors: mapping from word to vector.

        Returns:
            (best_word, similarity): the candidate whose vector has the
            highest cosine similarity to ``b - a + c``. ``(None, -1)`` is
            returned only when there is no candidate left after excluding
            the three input words.
        """
        # Vector arithmetic: b - a + c
        target_vector = word_vectors[b] - word_vectors[a] + word_vectors[c]

        # Skip input words so the answer is never one of the query terms.
        excluded = {a, b, c}
        candidates = [word for word in word_vectors if word not in excluded]
        if not candidates:
            return None, -1

        # Score every candidate in a single call. argmax returns the first
        # maximum, and (unlike a "-1" sentinel with a strict ">") it still
        # picks a word when the best available similarity is exactly -1.
        candidate_matrix = np.array([word_vectors[word] for word in candidates])
        similarities = cosine_similarity([target_vector], candidate_matrix)[0]
        best = int(np.argmax(similarities))

        return candidates[best], similarities[best]

    # Test analogies
    analogies = [
        ('king', 'queen', 'man'),  # king:queen :: man:?
        ('man', 'woman', 'prince'),  # man:woman :: prince:?
        ('uncle', 'aunt', 'king')  # uncle:aunt :: king:?
    ]

    print("Vector Analogies:")
    for a, b, c in analogies:
        result, similarity = vector_analogy(a, b, c, word_vectors)
        print(f"{a}:{b} :: {c}:{result} (similarity: {similarity:.4f})")

    # Visualize analogies as vectors
    plt.figure(figsize=(12, 10))

    # Directions obtained as differences of word vectors
    gender_direction = word_vectors['woman'] - word_vectors['man']
    royal_direction = word_vectors['king'] - word_vectors['man']

    # Project all words onto these two directions. These are raw dot products
    # with un-normalized difference vectors, so the axes are scaled (not
    # unit-length) projections.
    x_coords = [np.dot(word_vectors[word], royal_direction) for word in words]
    y_coords = [np.dot(word_vectors[word], gender_direction) for word in words]

    plt.scatter(x_coords, y_coords, alpha=0.7)

    # Add labels
    for i, word in enumerate(words):
        plt.annotate(word, (x_coords[i], y_coords[i]),
                    fontsize=12, alpha=0.8)

    plt.title('Word Vectors Projected onto Royal and Gender Directions')
    plt.xlabel('Royal Direction')
    plt.ylabel('Gender Direction')
    plt.grid(True, alpha=0.3)
    plt.axhline(y=0, color='k', linewidth=0.5, alpha=0.5)
    plt.axvline(x=0, color='k', linewidth=0.5, alpha=0.5)
    plt.show()

    return word_vectors

word_vectors = demonstrate_word2vec_concepts()

### Loading Pre-trained Embeddings

In practice, you would load pre-trained embeddings from libraries like gensim. Here's how you would do it:

In [None]:
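# This code is for demonstration purposes.
# Uncomment and run it if you have gensim installed. Note that api.load()
# downloads the model on first use, and these models are large
# (the Google News Word2Vec model is well over 1 GB).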

# !pip install gensim

# import gensim.downloader as api

# # Load pre-trained Word2Vec embeddings
# word2vec_model = api.load('word2vec-google-news-300')

# # Load pre-trained GloVe embeddings
# glove_model = api.load('glove-wiki-gigaword-100')

# # Example usage
# print("Word2Vec similarity between 'king' and 'queen':", word2vec_model.similarity('king', 'queen'))
# print("GloVe similarity between 'king' and 'queen':", glove_model.similarity('king', 'queen'))

# # Word analogy
# result = word2vec_model.most_similar(positive=['woman', 'king'], negative=['man'])
# print("\nWord2Vec analogy (king - man + woman):", result)

# result = glove_model.most_similar(positive=['woman', 'king'], negative=['man'])
# print("GloVe analogy (king - man + woman):", result)

## 6. Practical Exercises

### Exercise 1: Embedding Dimension Analysis

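Let's experiment with different embedding dimensions to understand the trade-offs. Note that the embedding layers below are freshly created and untrained, so the similarity numbers describe randomly initialized vectors rather than learned semantics: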

In [None]:
def embedding_dimension_experiment(vocab_size=100, dimensions=(10, 50, 100, 200, 500)):
    """Experiment with different embedding dimensions.

    For every dimension a fresh, untrained ``nn.Embedding`` layer is created
    and three statistics are collected: approximate weight memory in MB,
    the mean cosine similarity over all pairs of *distinct* rows, and the
    parameter count. The statistics are printed as a table and plotted.

    No random seed is set here, so the similarity column depends on the
    layer's initial weights and may differ between runs.

    Args:
        vocab_size: number of rows in each embedding table.
        dimensions: iterable of embedding dimensions to compare.

    Returns:
        pandas.DataFrame with columns ``dimension``, ``memory_mb``,
        ``avg_similarity`` and ``parameter_count`` (one row per dimension).
    """

    results = {}

    # Boolean mask selecting every (i, j) pair with i != j. Loop-invariant,
    # so it is built once.
    off_diagonal = ~np.eye(vocab_size, dtype=bool)

    for dim in dimensions:
        # Create embedding layer
        embedding = nn.Embedding(vocab_size, dim)
        weight = embedding.weight

        # Compute some statistics
        embeddings = weight.detach().numpy()

        # Memory usage (approximate): element count times the actual
        # per-element size of the weight tensor, instead of assuming 4 bytes.
        memory_mb = (weight.numel() * weight.element_size()) / (1024 * 1024)

        # Average pairwise similarity over distinct pairs only. Zeroing the
        # diagonal and averaging all n*n entries would shrink the result by
        # a factor of (n - 1) / n.
        similarities = cosine_similarity(embeddings)
        if vocab_size > 1:
            avg_similarity = similarities[off_diagonal].mean()
        else:
            avg_similarity = np.nan  # no distinct pairs to average

        results[dim] = {
            'memory_mb': memory_mb,
            'avg_similarity': avg_similarity,
            'parameter_count': vocab_size * dim
        }

    # Create DataFrame
    df = pd.DataFrame([
        {'dimension': dim, **stats}
        for dim, stats in results.items()
    ])

    # Print results
    print("Embedding Dimension Analysis:")
    print("Dim | Memory(MB) | Avg Sim | Parameters")
    print("-" * 40)

    for dim, stats in results.items():
        print(f"{dim:3d} | {stats['memory_mb']:8.2f} | {stats['avg_similarity']:7.4f} | {stats['parameter_count']:10d}")

    # Visualize results
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Memory usage
    ax1.plot(df['dimension'], df['memory_mb'], marker='o', linewidth=2)
    ax1.set_title('Memory Usage vs. Dimension')
    ax1.set_xlabel('Embedding Dimension')
    ax1.set_ylabel('Memory (MB)')
    ax1.grid(True, alpha=0.3)

    # Average similarity
    ax2.plot(df['dimension'], df['avg_similarity'], marker='o', linewidth=2, color='green')
    ax2.set_title('Average Similarity vs. Dimension')
    ax2.set_xlabel('Embedding Dimension')
    ax2.set_ylabel('Average Cosine Similarity')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return df

dimension_results = embedding_dimension_experiment()

### Exercise 2: Semantic Clustering

Let's demonstrate how embeddings can capture semantic clusters:

In [None]:
def semantic_clustering_demo():
    """Demonstrate how embeddings can capture semantic clusters.

    Twenty words in four categories receive simulated 20-dimensional
    embeddings: each category has a random center and each word is that
    center plus small Gaussian noise (NumPy seed fixed to 42). The function
    prints mean/std cosine similarity within and between categories, draws a
    clustered similarity heatmap, and draws a 2-D PCA scatter plot.

    Returns:
        tuple ``(embeddings, within_similarities, between_similarities)``:
        ``embeddings`` maps each word to its vector; the two lists hold the
        cosine similarity of every ordered pair of distinct words that
        share / do not share a category.
    """

    # Create word categories
    categories = {
        'animals': ['cat', 'dog', 'bird', 'fish', 'lion'],
        'colors': ['red', 'blue', 'green', 'yellow', 'purple'],
        'numbers': ['one', 'two', 'three', 'four', 'five'],
        'actions': ['run', 'jump', 'swim', 'fly', 'walk']
    }

    # Build vocabulary (word -> row index) and remember each word's category
    vocab = {}
    category_of = {}
    for category, category_words in categories.items():
        for word in category_words:
            vocab[word] = len(vocab)
            category_of[word] = category

    # Create embeddings (in practice, these would be trained)
    embedding_dim = 20
    embeddings = {}

    # Simulate category-aware embeddings. The order of the random draws below
    # determines the generated values, so it must not be rearranged.
    np.random.seed(42)
    category_centers = {
        'animals': np.random.normal(0, 1, embedding_dim),
        'colors': np.random.normal(2, 1, embedding_dim),
        'numbers': np.random.normal(-2, 1, embedding_dim),
        'actions': np.random.normal(0, 1, embedding_dim) + np.array([0, 3] + [0]*(embedding_dim-2))
    }

    for category, category_words in categories.items():
        center = category_centers[category]
        for word in category_words:
            # Add noise around category center
            embeddings[word] = center + np.random.normal(0, 0.3, embedding_dim)

    # Full similarity matrix from one call instead of one call per word pair
    words = list(vocab.keys())
    all_embeddings = np.array([embeddings[word] for word in words])
    similarity_matrix = cosine_similarity(all_embeddings)

    # Compute within-category vs between-category similarities.
    # Boolean-mask indexing walks the matrix row by row, i.e. over every
    # ordered (word1, word2) pair in vocabulary order.
    labels = np.array([category_of[word] for word in words])
    same_category = labels[:, None] == labels[None, :]
    distinct_words = ~np.eye(len(words), dtype=bool)
    within_similarities = list(similarity_matrix[same_category & distinct_words])
    between_similarities = list(similarity_matrix[~same_category])

    print("Semantic Clustering Analysis:")
    print(f"Within-category similarity: {np.mean(within_similarities):.4f} ± {np.std(within_similarities):.4f}")
    print(f"Between-category similarity: {np.mean(between_similarities):.4f} ± {np.std(between_similarities):.4f}")

    # Reorder words by category for better visualization
    ordered_words = []
    ordered_categories = []
    for category, category_words in categories.items():
        ordered_words.extend(category_words)
        ordered_categories.extend([category] * len(category_words))

    # Reorder similarity matrix (vocab gives the row index directly)
    ordered_indices = [vocab[word] for word in ordered_words]
    ordered_matrix = similarity_matrix[np.ix_(ordered_indices, ordered_indices)]

    # Create color map for categories
    category_colors = {'animals': 'red', 'colors': 'blue', 'numbers': 'green', 'actions': 'orange'}
    row_colors = [category_colors[cat] for cat in ordered_categories]

    # Plot clustered heatmap. clustermap builds its own figure (sized by
    # `figsize`), so no plt.figure() call precedes it; one would only leave
    # an empty extra figure behind.
    g = sns.clustermap(ordered_matrix, cmap='YlGnBu',
                      row_colors=row_colors, col_colors=row_colors,
                      xticklabels=ordered_words, yticklabels=ordered_words,
                      figsize=(14, 12))
    # Title the whole grid figure; plt.title() would instead target whichever
    # axes happens to be current inside the grid.
    g.fig.suptitle('Semantic Similarity Clustering', y=1.02)
    plt.show()

    # Visualize with PCA
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(all_embeddings)

    plt.figure(figsize=(12, 10))

    # Plot points by category
    for category, category_words in categories.items():
        indices = [vocab[word] for word in category_words]
        plt.scatter(embeddings_2d[indices, 0], embeddings_2d[indices, 1],
                   label=category, alpha=0.7)

    # Add labels
    for i, word in enumerate(words):
        plt.annotate(word, (embeddings_2d[i, 0], embeddings_2d[i, 1]),
                    fontsize=12, alpha=0.8)

    plt.title('PCA of Semantic Clusters')
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    return embeddings, within_similarities, between_similarities

clustering_results = semantic_clustering_demo()

## Key Takeaways

1. **One-hot vs. Learned**: Learned embeddings capture semantic relationships; one-hot vectors cannot
2. **Dimensionality**: Balance between expressiveness and computational efficiency
3. **Cosine Similarity**: Compares the direction of two vectors, so it measures semantic similarity regardless of vector magnitude
4. **Context Matters**: Embeddings become meaningful when trained with contextual information
5. **Pre-trained Embeddings**: Leverage knowledge from large corpora for downstream tasks
6. **Semantic Clustering**: Similar concepts naturally cluster together in embedding space

## Next Steps

In Day 4, we'll explore positional encodings — how transformers understand the order of tokens in sequences. We'll implement sinusoidal encodings and explore modern alternatives like RoPE.