In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import umap
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

In [None]:
def get_embeddings_from_vectorstore():
    """
    Extract embeddings, metadata and documents from the Bengali vector store.

    Opens the Chroma store persisted at ``./bengali_chroma_db`` and reads every
    record from its underlying collection.

    Returns:
        tuple: ``(embeddings, metadatas, documents)`` as returned by the
        collection, or ``(None, None, None)`` when the store cannot be read
        or contains no embeddings.

    NOTE(review): ``FastEmbedEmbeddings`` and ``Chroma`` are not imported in
    the cells visible here -- confirm they are imported earlier in the notebook.
    """
    try:
        # Load the embedding model
        embedding = FastEmbedEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

        # Load vector store
        vector_store = Chroma(
            persist_directory="./bengali_chroma_db",
            embedding_function=embedding
        )

        # Get all documents from the collection.
        # NOTE(review): `_collection` is a private attribute of the wrapper and
        # may change between library versions.
        collection = vector_store._collection

        # Retrieve all embeddings and metadata
        results = collection.get(include=['embeddings', 'metadatas', 'documents'])

        embeddings = results['embeddings']
        metadatas = results['metadatas']
        documents = results['documents']

        # Use len() instead of truthiness: the embeddings may come back as a
        # numpy array (presumably on newer chromadb releases), and
        # `if array:` raises ValueError for more than one element.
        count = 0 if embeddings is None else len(embeddings)

        print(f"📊 Retrieved {count} embeddings")
        print(f"📏 Embedding dimension: {len(embeddings[0]) if count else 0}")

        if count == 0:
            # Callers only test for None, so report an empty store the same
            # way as an unreadable one instead of handing back empty lists.
            return None, None, None

        return embeddings, metadatas, documents

    except Exception as e:
        # Best-effort loader: report the problem and let callers bail out.
        print(f"❌ Error retrieving embeddings: {e}")
        return None, None, None

def visualize_embeddings_2d(method='umap', sample_size=None, random_state=42):
    """
    Visualize embeddings in 2D using different dimensionality reduction techniques.

    Args:
        method: 'pca', 'tsne', or 'umap' (case-insensitive)
        sample_size: Number of embeddings to sample (None for all)
        random_state: Seed shared by the sampling step and the reducer so that
            repeated runs plot the same points

    Returns:
        ``(df, embeddings_2d)`` -- the plotting DataFrame and the 2D
        coordinates -- or None when there are too few embeddings to plot.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    titles = {
        'pca': "PCA Visualization of Bengali Text Embeddings",
        'tsne': "t-SNE Visualization of Bengali Text Embeddings",
        'umap': "UMAP Visualization of Bengali Text Embeddings",
    }
    method_key = method.lower()
    # Fail fast: an unknown method previously surfaced as an UnboundLocalError
    # only after the embeddings had already been loaded.
    if method_key not in titles:
        raise ValueError(
            f"Unknown method {method!r}; expected one of 'pca', 'tsne', 'umap'"
        )

    embeddings, metadatas, documents = get_embeddings_from_vectorstore()

    if embeddings is None or len(embeddings) == 0:
        print("❌ No embeddings found. Please run document ingestion first.")
        return

    embeddings_array = np.asarray(embeddings)

    # Sample if requested (seeded, so the plotted subset is reproducible)
    if sample_size and len(embeddings_array) > sample_size:
        rng = np.random.default_rng(random_state)
        indices = rng.choice(len(embeddings_array), sample_size, replace=False)
        embeddings_array = embeddings_array[indices]
        metadatas = [metadatas[i] for i in indices]
        documents = [documents[i] for i in indices]

    n_samples = len(embeddings_array)
    if n_samples < 3:
        # perplexity / n_neighbors below are derived from n_samples - 1 and
        # become degenerate for one or two points.
        print("❌ Need at least 3 embeddings to build a 2D projection.")
        return

    print(f"📈 Visualizing {n_samples} embeddings using {method.upper()}")

    # Apply dimensionality reduction
    if method_key == 'pca':
        reducer = PCA(n_components=2, random_state=random_state)
    elif method_key == 'tsne':
        reducer = TSNE(
            n_components=2,
            random_state=random_state,
            perplexity=min(30, n_samples - 1),
        )
    else:
        reducer = umap.UMAP(
            n_components=2,
            random_state=random_state,
            n_neighbors=min(15, n_samples - 1),
        )
    embeddings_2d = reducer.fit_transform(embeddings_array)

    # Tolerate records stored without metadata or text.
    metadatas = [meta or {} for meta in metadatas]
    documents = [doc or '' for doc in documents]

    # Create DataFrame for plotting
    df = pd.DataFrame({
        'x': embeddings_2d[:, 0],
        'y': embeddings_2d[:, 1],
        'page': [meta.get('page', 'Unknown') for meta in metadatas],
        'source': [str(meta.get('source', 'Unknown')).split('/')[-1] for meta in metadatas],
        'text_preview': [doc[:100] + '...' if len(doc) > 100 else doc for doc in documents],
        'text_length': [len(doc) for doc in documents]
    })

    # Create interactive plot with Plotly
    fig = px.scatter(
        df,
        x='x',
        y='y',
        color='page',
        size='text_length',
        # Everything the hover box shows travels in customdata, so the
        # template works whether `page` is coloured as numeric or categorical.
        custom_data=['page', 'source', 'text_preview', 'text_length'],
        title=titles[method_key],
        labels={'x': f'{method.upper()} Component 1', 'y': f'{method.upper()} Component 2'},
        color_continuous_scale='viridis'
    )

    fig.update_traces(
        hovertemplate='<b>Page:</b> %{customdata[0]}<br>' +
                      '<b>Source:</b> %{customdata[1]}<br>' +
                      '<b>Text:</b> %{customdata[2]}<br>' +
                      '<b>Length:</b> %{customdata[3]} chars<br>' +
                      '<extra></extra>'
    )

    fig.update_layout(
        width=800,
        height=600,
        showlegend=True
    )

    fig.show()

    return df, embeddings_2d

def analyze_embedding_clusters():
    """
    Print summary statistics for the stored chunks and plot their distributions.

    Builds one row per stored chunk (page, source file name, character count,
    word count), prints collection-level totals and per-page chunk counts, and
    draws a 2x2 matplotlib figure: text-length histogram, word-count histogram,
    chunks per page, and text length vs. word count.

    Returns:
        pandas.DataFrame with columns ``page``, ``source``, ``text_length`` and
        ``word_count``, or None when no embeddings are available.
    """
    embeddings, metadatas, documents = get_embeddings_from_vectorstore()

    if embeddings is None or len(embeddings) == 0:
        return

    # Tolerate records stored without metadata or text.
    metadatas = [meta or {} for meta in metadatas]
    documents = [doc or '' for doc in documents]

    # Convert to DataFrame for analysis
    df = pd.DataFrame({
        'page': [meta.get('page', 'Unknown') for meta in metadatas],
        'source': [str(meta.get('source', 'Unknown')).split('/')[-1] for meta in metadatas],
        'text_length': [len(doc) for doc in documents],
        'word_count': [len(doc.split()) for doc in documents]
    })

    print("📊 Embedding Collection Statistics:")
    print("=" * 50)
    print(f"Total chunks: {len(df)}")
    print(f"Pages covered: {df['page'].nunique()}")
    print(f"Sources: {df['source'].nunique()}")
    print(f"Average text length: {df['text_length'].mean():.0f} characters")
    print(f"Average word count: {df['word_count'].mean():.0f} words")

    print("\n📄 Chunks per page:")
    page_counts = df['page'].value_counts()
    # Pages can mix numbers with the 'Unknown' fallback string; a plain
    # sort_index() raises TypeError on such an index. Order numeric pages
    # first, then string labels.
    ordered_pages = sorted(page_counts.index, key=lambda page: (isinstance(page, str), page))
    page_counts = page_counts.reindex(ordered_pages)
    for page, count in page_counts.items():
        print(f"  Page {page}: {count} chunks")

    # Plot distributions
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Text length distribution
    axes[0, 0].hist(df['text_length'], bins=20, alpha=0.7, color='skyblue')
    axes[0, 0].set_title('Distribution of Text Length')
    axes[0, 0].set_xlabel('Characters')
    axes[0, 0].set_ylabel('Frequency')

    # Word count distribution
    axes[0, 1].hist(df['word_count'], bins=20, alpha=0.7, color='lightgreen')
    axes[0, 1].set_title('Distribution of Word Count')
    axes[0, 1].set_xlabel('Words')
    axes[0, 1].set_ylabel('Frequency')

    # Chunks per page (string labels keep the bars categorical and in the
    # order computed above)
    axes[1, 0].bar(
        [str(page) for page in page_counts.index],
        page_counts.values,
        color='orange',
        alpha=0.7,
    )
    axes[1, 0].set_title('Chunks per Page')
    axes[1, 0].set_xlabel('Page Number')
    axes[1, 0].set_ylabel('Number of Chunks')
    axes[1, 0].tick_params(axis='x', rotation=45)

    # Text length vs word count scatter
    axes[1, 1].scatter(df['word_count'], df['text_length'], alpha=0.6, color='red')
    axes[1, 1].set_title('Text Length vs Word Count')
    axes[1, 1].set_xlabel('Word Count')
    axes[1, 1].set_ylabel('Character Count')

    plt.tight_layout()
    plt.show()

    return df

In [None]:
def visualize_query_similarity(query: str, top_k: int = 10):
    """
    Visualize how a query relates to document embeddings.

    Embeds ``query`` with the multilingual MiniLM model, computes its cosine
    similarity against every stored embedding, prints the ``top_k`` closest
    chunks, and draws two matplotlib panels: the similarity histogram and the
    best similarity per page (top 10 pages).

    Args:
        query: Text to compare against the stored chunks.
        top_k: Number of most similar chunks to print and return.

    Returns:
        pandas.DataFrame holding the ``top_k`` most similar chunks sorted by
        descending similarity, or None when no embeddings are available.
    """
    embeddings, metadatas, documents = get_embeddings_from_vectorstore()

    if embeddings is None or len(embeddings) == 0:
        return

    # Load embedding model
    embedding_model = FastEmbedEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

    # Get query embedding
    query_embedding = embedding_model.embed_query(query)

    # Calculate similarities (one row for the query vs. all stored chunks)
    embeddings_array = np.asarray(embeddings)
    query_array = np.asarray(query_embedding).reshape(1, -1)
    similarities = cosine_similarity(query_array, embeddings_array)[0]

    # Tolerate records stored without metadata or text.
    metadatas = [meta or {} for meta in metadatas]
    documents = [doc or '' for doc in documents]

    # Create DataFrame with similarities, most similar first
    df = pd.DataFrame({
        'similarity': similarities,
        'page': [meta.get('page', 'Unknown') for meta in metadatas],
        'source': [str(meta.get('source', 'Unknown')).split('/')[-1] for meta in metadatas],
        'text_preview': [doc[:150] + '...' if len(doc) > 150 else doc for doc in documents],
        'text_length': [len(doc) for doc in documents]
    }).sort_values('similarity', ascending=False)
    top_matches = df.head(top_k)

    print(f"🔍 Query: '{query}'")
    print("=" * 60)
    print(f"📊 Top {top_k} most similar chunks:")
    print("-" * 60)

    # Number by rank: after sorting, the row label is the chunk's original
    # position in the store, not its place in the ranking.
    for rank, (_, row) in enumerate(top_matches.iterrows(), start=1):
        print(f"{rank}. Similarity: {row['similarity']:.4f} | Page: {row['page']}")
        print(f"   Text: {row['text_preview']}")
        print()

    # Create visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Similarity distribution
    ax1.hist(similarities, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    ax1.axvline(similarities.mean(), color='red', linestyle='--', label=f'Mean: {similarities.mean():.3f}')
    ax1.axvline(np.percentile(similarities, 95), color='orange', linestyle='--', label='95th percentile')
    ax1.set_title(f'Similarity Distribution for Query: "{query}"')
    ax1.set_xlabel('Cosine Similarity')
    ax1.set_ylabel('Frequency')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Top similarities by page. Group keys need no sorting because the
    # result is re-ordered by similarity right away.
    page_max_sim = (
        df.groupby('page', sort=False)['similarity']
        .max()
        .sort_values(ascending=False)
        .head(10)
    )
    ax2.bar(
        [str(page) for page in page_max_sim.index],
        page_max_sim.values,
        color='lightgreen',
        alpha=0.7,
    )
    ax2.set_title('Highest Similarity by Page')
    ax2.set_xlabel('Page Number')
    ax2.set_ylabel('Max Similarity Score')
    ax2.tick_params(axis='x', rotation=45)
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return top_matches

def compare_embedding_methods(sample_size=200, random_state=42):
    """
    Compare different dimensionality reduction methods side by side.

    Projects the same (optionally down-sampled) set of embeddings with PCA,
    t-SNE and UMAP and shows the three 2D scatters in one Plotly figure,
    coloured by page.

    Args:
        sample_size: Maximum number of embeddings to project; larger
            collections are randomly down-sampled to this size. Pass None
            to use every embedding.
        random_state: Seed shared by the sampling step and the reducers so
            that repeated runs compare the same points.

    Returns:
        None. The figure is displayed as a side effect.
    """
    embeddings, metadatas, documents = get_embeddings_from_vectorstore()

    if embeddings is None or len(embeddings) == 0:
        return

    embeddings_array = np.asarray(embeddings)

    # Sample if too many embeddings (seeded, so the subset is reproducible)
    if sample_size and len(embeddings_array) > sample_size:
        rng = np.random.default_rng(random_state)
        indices = rng.choice(len(embeddings_array), sample_size, replace=False)
        embeddings_array = embeddings_array[indices]
        metadatas = [metadatas[i] for i in indices]
        documents = [documents[i] for i in indices]

    n_samples = len(embeddings_array)
    if n_samples < 3:
        # perplexity / n_neighbors below are derived from n_samples - 1 and
        # become degenerate for one or two points.
        print("❌ Need at least 3 embeddings to compare projections.")
        return

    # Tolerate records stored without metadata or text.
    metadatas = [meta or {} for meta in metadatas]
    documents = [doc or '' for doc in documents]

    # Apply different reduction methods
    methods = {
        'PCA': PCA(n_components=2, random_state=random_state),
        't-SNE': TSNE(n_components=2, random_state=random_state, perplexity=min(30, n_samples - 1)),
        'UMAP': umap.UMAP(n_components=2, random_state=random_state, n_neighbors=min(15, n_samples - 1))
    }

    fig = make_subplots(
        rows=1, cols=3,
        subplot_titles=list(methods.keys()),
        specs=[[{'type': 'scatter'}, {'type': 'scatter'}, {'type': 'scatter'}]]
    )

    # Marker colours and hover labels are identical for all three panels.
    colors = [meta.get('page', 0) for meta in metadatas]
    hover_text = [
        f"Page: {meta.get('page', 'Unknown')}<br>Text: "
        + (doc[:100] + '...' if len(doc) > 100 else doc)
        for meta, doc in zip(metadatas, documents)
    ]

    for col, (name, reducer) in enumerate(methods.items(), 1):
        print(f"Computing {name}...")
        reduced = reducer.fit_transform(embeddings_array)

        fig.add_trace(
            go.Scatter(
                x=reduced[:, 0],
                y=reduced[:, 1],
                mode='markers',
                marker=dict(
                    color=colors,
                    colorscale='viridis',
                    size=8,
                    opacity=0.7
                ),
                text=hover_text,
                hovertemplate='%{text}<extra></extra>',
                name=name
            ),
            row=1, col=col
        )

    fig.update_layout(
        title_text="Comparison of Dimensionality Reduction Methods",
        showlegend=False,
        height=500,
        width=1500
    )

    fig.show()

    print("✅ Comparison visualization complete!")

In [None]:
# 📊 Analyze embedding collection statistics
# Bind the result so the whole per-chunk DataFrame is not echoed as cell output;
# the function already prints the summary and shows the plots.
chunk_stats_df = analyze_embedding_clusters()

In [None]:
# 🎯 Visualize embeddings in 2D using UMAP (recommended for large datasets)
# Bind the result so the returned (DataFrame, array) pair is not echoed as
# cell output; the interactive figure is shown by the function itself.
umap_projection = visualize_embeddings_2d(method='umap', sample_size=100)

In [None]:
# 🔍 Rank the stored chunks against one sample question and plot the similarity scores
sample_query = "বিয়ের সময় কল্যাণীর বয়স"
visualize_query_similarity(query=sample_query, top_k=5)

In [None]:
# 📈 Compare the PCA, t-SNE and UMAP projections of the same embeddings side by side
compare_embedding_methods()