# Day 3: Embeddings - Part 2: Cosine Similarity

In this notebook, we'll explore cosine similarity in depth and how it's used to measure semantic relationships between word embeddings.

In [None]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import seaborn as sns

# Set style for plots
# NOTE(review): the 'seaborn-v0_8' style-sheet name exists only in newer
# Matplotlib releases (3.6+, where the bundled seaborn styles were renamed);
# confirm the installed version if this line raises.
plt.style.use('seaborn-v0_8')
# Make seaborn's "husl" palette the default color cycle for later plots.
sns.set_palette("husl")

## 3. Cosine Similarity Deep Dive

Cosine similarity measures the cosine of the angle between two vectors, focusing on direction rather than magnitude. It's a key metric for comparing embeddings.

In [None]:
def visualize_cosine_similarity():
    """Print, plot, and return the pairwise cosine similarities of five 2D vectors.

    The demo vectors are fixed inside the function. Vector E is twice vector A,
    so the pair illustrates that cosine similarity ignores magnitude.

    Side effects:
        Prints the similarity matrix as text and shows two Matplotlib figures:
        the vectors with angle annotations, and a heatmap of the matrix.

    Returns:
        dict mapping ``(name1, name2)`` to the cosine similarity of the two
        named vectors, for every ordered pair (including ``name1 == name2``).
    """
    # Create some 2D vectors for visualization
    vectors = {
        'A': np.array([1, 0]),
        'B': np.array([1, 1]),
        'C': np.array([0, 1]),
        'D': np.array([-1, 0]),
        'E': np.array([2, 0]),  # Same direction as A, different magnitude
    }
    names = list(vectors)

    # A single pairwise call yields the whole matrix; rows and columns follow
    # the order of `names`. The same matrix feeds the printout, the returned
    # dict, and the heatmap.
    sim_matrix = cosine_similarity(np.array([vectors[name] for name in names]))
    similarities = {
        (name1, name2): sim_matrix[i, j]
        for i, name1 in enumerate(names)
        for j, name2 in enumerate(names)
    }

    # Print similarity matrix
    print("Cosine Similarity Matrix:")
    print("    ", end="")
    for name in names:
        print(f"{name:6s}", end="")
    print()

    for name1, row in zip(names, sim_matrix):
        print(f"{name1}: ", end="")
        for sim in row:
            print(f"{sim:6.2f}", end="")
        print()

    # Visualize vectors
    plt.figure(figsize=(10, 8))
    colors = ['red', 'blue', 'green', 'orange', 'purple']

    for color, (name, vec) in zip(colors, vectors.items()):
        # length_includes_head=True makes the arrow tip land exactly on the
        # vector's endpoint instead of overshooting it by the head length.
        plt.arrow(0, 0, vec[0], vec[1], head_width=0.1, head_length=0.1,
                  length_includes_head=True,
                  fc=color, ec=color, label=f'Vector {name}')
        plt.text(vec[0]*1.1, vec[1]*1.1, name, fontsize=12, fontweight='bold')

    def add_angle_annotation(vec1_name, vec2_name, radius=0.5):
        """Draw a dashed arc between two vectors and label it with their angle.

        Returns the angle between the two vectors in degrees.
        """
        vec1 = vectors[vec1_name]
        vec2 = vectors[vec2_name]

        # The angle is the arccos of the cosine similarity; clipping guards
        # against rounding that lands just outside [-1, 1].
        cos_angle = similarities[(vec1_name, vec2_name)]
        angle_deg = np.degrees(np.arccos(np.clip(cos_angle, -1.0, 1.0)))

        # Polar angle of each vector, used as the arc's endpoints
        theta1 = np.arctan2(vec1[1], vec1[0])
        theta2 = np.arctan2(vec2[1], vec2[0])

        # Ensure we draw the smaller angle: if the endpoints are more than
        # half a turn apart, shift the smaller one forward by a full turn.
        if abs(theta2 - theta1) > np.pi:
            if theta1 < theta2:
                theta1 += 2 * np.pi
            else:
                theta2 += 2 * np.pi

        # Draw arc
        theta = np.linspace(min(theta1, theta2), max(theta1, theta2), 100)
        plt.plot(radius * np.cos(theta), radius * np.sin(theta), 'k--', alpha=0.3)

        # Add angle text at the arc's midpoint, slightly outside the arc
        mid_theta = (theta1 + theta2) / 2
        plt.text(radius * 1.2 * np.cos(mid_theta),
                 radius * 1.2 * np.sin(mid_theta),
                 f"{angle_deg:.1f}°", fontsize=10)

        return angle_deg

    # Add some angle annotations
    add_angle_annotation('A', 'B')
    add_angle_annotation('B', 'C')
    add_angle_annotation('A', 'C')
    add_angle_annotation('A', 'D')

    plt.xlim(-1.5, 2.5)
    plt.ylim(-1.5, 1.5)
    plt.grid(True, alpha=0.3)
    plt.axhline(y=0, color='k', linewidth=0.5)
    plt.axvline(x=0, color='k', linewidth=0.5)
    plt.title('Vector Visualization for Cosine Similarity')
    plt.legend(loc='upper left')
    plt.axis('equal')  # Equal scale
    plt.show()

    # Create a heatmap of similarities
    plt.figure(figsize=(8, 6))
    sns.heatmap(sim_matrix, annot=True, fmt='.2f', cmap='YlGnBu',
                xticklabels=names, yticklabels=names)
    plt.title('Cosine Similarity Matrix')
    plt.show()

    return similarities

# Run visualization; keep the returned {(name1, name2): similarity} dict
similarities = visualize_cosine_similarity()

### Advanced Similarity Measures

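Let's compare cosine similarity with other common measures: Euclidean distance, Manhattan distance, and the unnormalized dot product. Note that the two distances are *dissimilarity* measures (lower means more similar), whereas for cosine similarity and the dot product higher means more similar: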

In [None]:
def compute_similarity_measures(emb1, emb2):
    """Compute several similarity and distance measures between two embeddings.

    Args:
        emb1: 1-D embedding vector, as a NumPy array or any array-like
            (for example a plain list of numbers).
        emb2: 1-D embedding vector of the same length as ``emb1``.

    Returns:
        dict with the keys:
            'cosine_similarity': cosine of the angle between the vectors.
            'euclidean_distance': L2 norm of ``emb1 - emb2`` (lower = more similar).
            'manhattan_distance': sum of absolute element-wise differences.
            'dot_product': unnormalized dot product of the two vectors.
    """
    # Coerce to arrays so that plain Python sequences support the element-wise
    # arithmetic below; arrays passed in are used as-is (no copy).
    emb1 = np.asarray(emb1)
    emb2 = np.asarray(emb2)

    # Both distances are functions of the same difference vector
    diff = emb1 - emb2

    # cosine_similarity works on 2-D inputs, hence the one-row wrappers;
    # [0][0] extracts the single resulting value.
    cos_sim = cosine_similarity([emb1], [emb2])[0][0]

    return {
        'cosine_similarity': cos_sim,
        'euclidean_distance': np.linalg.norm(diff),
        'manhattan_distance': np.sum(np.abs(diff)),
        'dot_product': np.dot(emb1, emb2),
    }

# Build four demo embeddings from a fixed seed so the printed numbers repeat
np.random.seed(42)
emb_dim = 50

# emb1 and emb2 are independent draws; emb3 and emb4 are emb1 and its negation,
# each perturbed by small Gaussian noise.
emb1 = np.random.normal(loc=0, scale=1, size=emb_dim)
emb2 = np.random.normal(loc=0, scale=1, size=emb_dim)
emb3 = emb1 + 0.1 * np.random.normal(loc=0, scale=1, size=emb_dim)
emb4 = -emb1 + 0.1 * np.random.normal(loc=0, scale=1, size=emb_dim)

# Measure emb1 against each of the other three embeddings
sim_random = compute_similarity_measures(emb1, emb2)
sim_similar = compute_similarity_measures(emb1, emb3)
sim_opposite = compute_similarity_measures(emb1, emb4)

# Report every comparison in the same "name: value" layout
for heading, measures in (
    ("Random vs Random:", sim_random),
    ("\nSimilar Embeddings:", sim_similar),
    ("\nOpposite Embeddings:", sim_opposite),
):
    print(heading)
    for measure, value in measures.items():
        print(f"{measure}: {value:.4f}")

### Visualizing Similarity in High Dimensions

Let's create a visualization to understand how similarity behaves in high-dimensional spaces:

In [None]:
def visualize_high_dim_similarity(dimensions, num_vectors=1000):
    """Show how cosine similarity between random vectors changes with dimension.

    For each dimension, draws ``num_vectors`` standard-normal vectors, scales
    them to unit length, and computes the pairwise cosine similarities among
    the first ``min(100, num_vectors)`` of them, excluding self-pairs. It then
    shows one figure with two panels: the mean similarity (with a band of one
    standard deviation) against dimension, and the similarity histograms for
    the first and last dimensions given.

    Args:
        dimensions: Iterable of vector dimensions to sample, in plotting order.
        num_vectors: Number of random vectors drawn per dimension. Must be at
            least 2 so that at least one pair exists.

    Returns:
        list of dicts, one per entry of ``dimensions`` in the same order, with
        keys 'dimension', 'mean_similarity', 'std_similarity',
        'min_similarity', 'max_similarity', and 'similarities' (1-D array of
        all off-diagonal entries of the similarity matrix, so each unordered
        pair appears twice). An empty list is returned, and nothing is
        plotted, when ``dimensions`` is empty.

    Raises:
        ValueError: If ``num_vectors`` is less than 2.
    """
    if num_vectors < 2:
        raise ValueError("num_vectors must be at least 2 to form a pair of vectors")

    # Materialize once so that any iterable (not only a list) is accepted
    dimensions = list(dimensions)
    if not dimensions:
        return []

    # Limit the number of compared vectors to bound the pairwise computation
    subset_size = min(100, num_vectors)
    # Mask selecting every off-diagonal entry, i.e. dropping self-similarities
    off_diagonal = ~np.eye(subset_size, dtype=bool)

    results = []
    for dim in dimensions:
        # Generate random vectors and normalize each row to unit length
        vectors = np.random.normal(0, 1, (num_vectors, dim))
        vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)

        # Pairwise similarities of the subset, flattened in row-major order
        similarities = cosine_similarity(vectors[:subset_size])[off_diagonal]

        results.append({
            'dimension': dim,
            'mean_similarity': np.mean(similarities),
            'std_similarity': np.std(similarities),
            'min_similarity': np.min(similarities),
            'max_similarity': np.max(similarities),
            'similarities': similarities,
        })

    dims = [r['dimension'] for r in results]
    means = np.array([r['mean_similarity'] for r in results])
    stds = np.array([r['std_similarity'] for r in results])

    # Plot results
    plt.figure(figsize=(12, 5))

    # Left panel: mean similarity vs dimension with a one-std band
    plt.subplot(1, 2, 1)
    plt.plot(dims, means, 'o-', linewidth=2)
    plt.fill_between(dims, means - stds, means + stds, alpha=0.2)
    plt.title('Mean Cosine Similarity vs Dimension')
    plt.xlabel('Dimension')
    plt.ylabel('Mean Similarity')
    plt.grid(True, alpha=0.3)

    # Right panel: distributions for the first and last dimensions. Index
    # `results` directly so repeated dimension values still pick the right
    # entry, and plot a lone dimension only once.
    plt.subplot(1, 2, 2)
    endpoints = results[:1] if len(results) == 1 else [results[0], results[-1]]
    for result in endpoints:
        sns.histplot(result['similarities'], kde=True,
                     label=f"Dim={result['dimension']}", alpha=0.5)

    plt.title('Distribution of Cosine Similarities')
    plt.xlabel('Cosine Similarity')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return results

# Visualize similarity in different dimensions, sweeping from 2-D up to 1000-D
# (num_vectors is left at the function's default)
dimensions = [2, 5, 10, 50, 100, 300, 1000]
high_dim_results = visualize_high_dim_similarity(dimensions)

## Key Insights about Cosine Similarity

1. **Direction vs. Magnitude**: Cosine similarity focuses on the angle between vectors, ignoring their magnitudes
2. **Range**: Values range from -1 (opposite directions) to 1 (same direction), with 0 indicating orthogonality
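3. **High Dimensions**: In high-dimensional spaces, random vectors tend to be nearly orthogonal: their cosine similarity concentrates around 0, with a spread that shrinks roughly as $1/\sqrt{d}$ for dimension $d$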
4. **Semantic Meaning**: In word embeddings, cosine similarity captures semantic relationships
5. **Efficiency**: Computationally efficient for sparse high-dimensional vectors