# Embeddings Analysis

Examples for loading and analyzing embedding runs, including similarity calculations and visualizations.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from bedrock_benchmark.core import BenchmarkCore
from bedrock_benchmark.storage import StorageManager

# Initialize
# StorageManager is constructed with './experiments', a path relative to the
# notebook's working directory (presumably where experiment data lives — TODO confirm).
storage_manager = StorageManager('./experiments')
# BenchmarkCore is built on top of the storage manager; the cells below use it
# to list experiments/runs and to export a run as a DataFrame.
benchmark_core = BenchmarkCore(storage_manager)

## List Embedding Experiments and Runs

In [None]:
# List all experiments with their run counts
experiments = benchmark_core.list_experiments()
for exp in experiments:
    print(f"{exp.id}: {exp.name} ({len(benchmark_core.list_runs(exp.id))} runs)")

# Focus on embedding experiments: match on the experiment id or its name
embedding_experiments = [exp for exp in experiments if 'embedding' in exp.id.lower() or 'embed' in exp.name.lower()]

# Bind these unconditionally so later cells can test `runs` without hitting a
# NameError when no embedding experiment exists.
experiment_id = None
runs = []

if embedding_experiments:
    # Only the first matching experiment is inspected
    experiment_id = embedding_experiments[0].id
    runs = benchmark_core.list_runs(experiment_id)
    print(f"\nRuns in {experiment_id}:")
    for run_id in runs:
        summary = benchmark_core.get_run_summary(run_id)
        print(f"  {run_id}: {summary['model_id']} - {summary['total_responses']} embeddings")
else:
    print("\nNo embedding experiments found")

## Load Embedding Run

In [None]:
# Load first embedding run.
# The `'runs' in locals()` guard keeps this cell from raising NameError when the
# listing cell did not bind `runs` (e.g. it was skipped or found nothing).
if 'runs' in locals() and runs:
    run_id = runs[0]

    # Get run configuration to extract dataset path
    run_config = storage_manager.get_run_config(run_id)

    if run_config and run_config.dataset_path:
        dataset_path = run_config.dataset_path
        print(f"Dataset path from config: {dataset_path}")
        print(f"Model: {run_config.model_id}")
        print(f"Model params: {run_config.model_params}")

    # Export run to get embeddings
    df = benchmark_core.export_run_to_dataframe(run_id)

    print(f"\nLoaded run {run_id}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Show first few rows (without showing full embeddings)
    display(df[['run_id', 'item_id', 'model_id', 'latency_ms']].head())

    # Basic stats
    print(f"\nAverage latency: {df['latency_ms'].mean():.2f}ms")
    # `.iloc[0]` needs at least one row, so an empty export reports 'N/A'
    # instead of raising IndexError.
    has_embeddings = 'embedding' in df.columns and len(df) > 0
    print(f"Embedding dimension: {len(df['embedding'].iloc[0]) if has_embeddings else 'N/A'}")
else:
    print("No embedding runs available to load")

## Extract Embeddings as NumPy Array

In [None]:
# Convert embeddings to numpy array for analysis.
# Guards: `df` must exist (the load cell may not have run), must carry an
# 'embedding' column, and must have at least one row — an empty frame would
# produce a 1-D array and make `shape[1]` raise IndexError.
if 'df' in locals() and 'embedding' in df.columns and len(df) > 0:
    embeddings = np.array(df['embedding'].tolist())
    print(f"Embeddings shape: {embeddings.shape}")
    print(f"Embedding dimension: {embeddings.shape[1]}")
    print(f"Number of embeddings: {embeddings.shape[0]}")
else:
    print("No embeddings available - load a run with an 'embedding' column first")

## Calculate Cosine Similarity Matrix

In [None]:
# Calculate pairwise cosine similarity
if 'embeddings' in locals():
    similarity_matrix = cosine_similarity(embeddings)

    print(f"Similarity matrix shape: {similarity_matrix.shape}")

    # Summarize distinct item pairs only. The diagonal holds each item's
    # similarity with itself, which would pin the max at ~1.0 and inflate the
    # mean. The strict upper triangle (k=1) lists every unordered pair once.
    pair_rows, pair_cols = np.triu_indices(similarity_matrix.shape[0], k=1)
    pair_similarities = similarity_matrix[pair_rows, pair_cols]

    if pair_similarities.size:
        print("\nSimilarity statistics (excluding self-similarity):")
        print(f"  Mean similarity: {pair_similarities.mean():.4f}")
        print(f"  Std similarity: {pair_similarities.std():.4f}")
        print(f"  Min similarity: {pair_similarities.min():.4f}")
        print(f"  Max similarity: {pair_similarities.max():.4f}")
    else:
        # A single embedding has no pairs; skip the statistics rather than
        # reducing over an empty array.
        print("\nNeed at least 2 embeddings for similarity statistics")

    # Create a DataFrame labelled by item_id on both axes for readable display
    similarity_df = pd.DataFrame(
        similarity_matrix,
        index=df['item_id'].values,
        columns=df['item_id'].values
    )

    print("\nSimilarity matrix (first 5x5):")
    display(similarity_df.iloc[:5, :5].round(3))

## Visualize Similarity Matrix

In [None]:
# Draw the labelled similarity matrix as a heatmap
if 'similarity_matrix' in locals():
    # Rendering options gathered in one place: the colour scale is anchored to
    # [0, 1] with its midpoint at 0.5, cells are square with thin separators,
    # and the colour bar is shrunk to 80%.
    heatmap_options = dict(
        cmap='coolwarm',
        center=0.5,
        vmin=0,
        vmax=1,
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": 0.8},
    )

    plt.figure(figsize=(10, 8))
    sns.heatmap(similarity_df, **heatmap_options)
    plt.title('Cosine Similarity Heatmap')
    plt.tight_layout()
    plt.show()

## Find Most Similar Items

In [None]:
# Find most similar pairs (excluding self-similarity)
if 'similarity_matrix' in locals():
    top_k = 5

    # The strict upper triangle (k=1) lists every unordered pair exactly once
    # and leaves out the diagonal, so no sentinel values are needed and a
    # genuine similarity of -1 is never confused with "already reported".
    pair_rows, pair_cols = np.triu_indices(similarity_matrix.shape[0], k=1)
    pair_similarities = similarity_matrix[pair_rows, pair_cols]

    # Stable sort on the negated scores gives descending order; ties keep
    # their row-major order. Slicing caps the result at the number of pairs
    # that actually exist, so fewer than 5 pairs prints fewer lines.
    ranked = np.argsort(-pair_similarities, kind='stable')[:top_k]

    print("Top 5 most similar item pairs:\n")
    if ranked.size == 0:
        print("Need at least 2 items to form a pair")

    for rank, pair_idx in enumerate(ranked, 1):
        item1 = df['item_id'].iloc[pair_rows[pair_idx]]
        item2 = df['item_id'].iloc[pair_cols[pair_idx]]
        similarity = pair_similarities[pair_idx]

        print(f"{rank}. {item1} <-> {item2}: {similarity:.4f}")

## 2D Visualization with t-SNE

In [None]:
# Project the embeddings onto two dimensions and plot them
if 'embeddings' in locals() and len(embeddings) > 3:
    n_points = len(embeddings)

    # t-SNE projection; perplexity is 30 unless there are too few points, in
    # which case it drops to n_points - 1. Fixed random_state for repeatability.
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, n_points - 1))
    embeddings_2d = tsne.fit_transform(embeddings)

    # Scatter plot, coloured by each item's position in the array
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        embeddings_2d[:, 0],
        embeddings_2d[:, 1],
        c=range(n_points),
        cmap='viridis',
        s=100,
        alpha=0.6
    )

    # Label every point with its item_id, nudged 5pt up and to the right
    label_style = dict(xytext=(5, 5), textcoords='offset points', fontsize=8, alpha=0.7)
    for row, item_id in enumerate(df['item_id']):
        x, y = embeddings_2d[row]
        plt.annotate(item_id, (x, y), **label_style)

    plt.colorbar(scatter, label='Item Index')
    plt.title('Embedding Space Visualization (t-SNE)')
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print("Need at least 4 embeddings for t-SNE visualization")

## Compare Multiple Embedding Models

In [None]:
# Load multiple runs for comparison.
# `'runs' in locals()` avoids a NameError when the listing cell never bound `runs`.
if 'runs' in locals() and len(runs) >= 2:
    comparison_data = []

    # Dedicated loop names: reusing `run_id` / `run_config` here would silently
    # overwrite the values set by the "Load Embedding Run" cell.
    for compared_run_id in runs[:3]:  # Compare up to 3 runs
        df_run = benchmark_core.export_run_to_dataframe(compared_run_id)
        compared_config = storage_manager.get_run_config(compared_run_id)

        # `.iloc[0]` needs at least one row; an empty run reports dimension 0.
        has_embeddings = 'embedding' in df_run.columns and len(df_run) > 0

        comparison_data.append({
            'run_id': compared_run_id,
            'model': compared_config.model_id if compared_config else 'Unknown',
            'num_embeddings': len(df_run),
            'avg_latency_ms': df_run['latency_ms'].mean(),
            'embedding_dim': len(df_run['embedding'].iloc[0]) if has_embeddings else 0
        })

    comparison_df = pd.DataFrame(comparison_data)
    print("\nModel Comparison:")
    display(comparison_df)

    # Visualize latency comparison: one bar per run, labelled with its model
    plt.figure(figsize=(10, 5))
    plt.bar(range(len(comparison_df)), comparison_df['avg_latency_ms'])
    plt.xticks(range(len(comparison_df)), comparison_df['model'], rotation=45, ha='right')
    plt.ylabel('Average Latency (ms)')
    plt.title('Embedding Generation Latency by Model')
    plt.tight_layout()
    plt.show()
else:
    available_runs = len(runs) if 'runs' in locals() else 0
    print(f"Only {available_runs} run available - need 2+ for comparison")

## Semantic Search Example

In [None]:
# Example: Find most similar items to a query item
if 'similarity_matrix' in locals() and len(df) > 0:
    query_idx = 0  # Use first item as query
    query_item = df['item_id'].iloc[query_idx]

    # Get similarities to query (one row of the pairwise matrix)
    similarities = similarity_matrix[query_idx]

    # Rank all items by descending similarity, then drop the query by index.
    # Dropping "whatever sorts first" is not safe: a duplicate embedding or a
    # floating-point tie can place another item ahead of the query itself.
    ranked = np.argsort(-similarities, kind='stable')
    similar_indices = ranked[ranked != query_idx][:5]  # Top 5, excluding self

    print(f"Query item: {query_item}")
    print(f"\nTop 5 most similar items:\n")

    for rank, idx in enumerate(similar_indices, 1):
        similar_item = df['item_id'].iloc[idx]
        similarity = similarities[idx]
        print(f"{rank}. {similar_item}: {similarity:.4f}")

## Export Results

In [None]:
# Write out whichever artifacts the earlier cells produced.
# Each entry: (variable that must exist, how to write it, confirmation text).
# The writers are lambdas so nothing is touched unless its variable exists.
export_steps = (
    (
        'df',
        lambda: df.to_csv('embeddings_data.csv', index=False),
        "Saved embeddings to: embeddings_data.csv",
    ),
    (
        'similarity_df',
        lambda: similarity_df.to_csv('similarity_matrix.csv'),
        "Saved similarity matrix to: similarity_matrix.csv",
    ),
    (
        'embeddings',
        lambda: np.save('embeddings.npy', embeddings),
        "Saved embeddings array to: embeddings.npy",
    ),
)

for required_name, write_artifact, confirmation in export_steps:
    if required_name in locals():
        write_artifact()
        print(confirmation)