# Plant Document Embeddings Visualization

This notebook visualizes the plant document embeddings from the Chroma database.


In [None]:
# Import libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.decomposition import PCA
import umap

# Plot defaults for the notebook: ggplot theme, 12x8 default figure size
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (12, 8)})


In [None]:
# Load Chroma database
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Initialize the embedding model that is handed to Chroma as its embedding function
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Find Chroma path: try <cwd>/PlantRagChatbot/chroma first
chroma_path = os.path.join(os.getcwd(), "PlantRagChatbot", "chroma")
if not os.path.exists(chroma_path):
    # Maybe we're already in the PlantRagChatbot directory
    chroma_path = os.path.join(os.getcwd(), "chroma")

database_exists = os.path.exists(chroma_path)
print(f"Loading database from: {chroma_path}")
print(f"Database exists: {database_exists}")

# Fail fast when nothing is on disk. Opening a missing persist directory is
# expected to yield a brand-new, empty store (NOTE(review): confirm for the
# installed chromadb version), which would only surface later as a
# confusing "no embeddings" message.
if not database_exists:
    raise FileNotFoundError(
        f"Chroma database not found at {chroma_path}; "
        "run the notebook from the project root or from the PlantRagChatbot directory"
    )

# Load the database
db = Chroma(persist_directory=chroma_path, embedding_function=embeddings)

# CRITICAL: We must explicitly include embeddings in the get request
documents = db.get(include=['embeddings', 'documents', 'metadatas'])

# The key can be present with a None value, so test the value rather than
# key membership before calling len() on it.
stored_embeddings = documents.get('embeddings')

print(f"Loaded {len(documents['ids'])} documents")
print(f"Available keys: {list(documents.keys())}")
print(f"Embeddings included: {stored_embeddings is not None}")
# Debug: Check if embeddings are actually there (samples the first five only)
if stored_embeddings is not None:
    print(f"Embeddings count: {len(stored_embeddings)}")
    has_values = any(e is not None and len(e) > 0 for e in stored_embeddings[:5])
    print(f"Embeddings have values: {has_values}")


In [None]:
# Extract data from documents
# Create a DataFrame with one row per stored document id
df = pd.DataFrame({
    'id': documents['ids']
})

# Add text content. The key may be missing or hold None, so check the value
# instead of key membership before assigning the column.
doc_texts = documents.get('documents')
if doc_texts is not None:
    df['text'] = doc_texts
else:
    df['text'] = ['No content available'] * len(documents['ids'])

# Add metadata - especially plant types
metadatas = documents.get('metadatas')
if metadatas:
    # Individual entries can be None (document stored without metadata);
    # treat those as empty dicts so the defaults below apply.
    records = [m or {} for m in metadatas]

    # Extract plant types and sources
    df['plant_type'] = [m.get('plant_type', 'unknown') for m in records]
    df['source'] = [m.get('source', '') for m in records]

    # Show plant type distribution
    plant_counts = df['plant_type'].value_counts()
    print(f"\nPlant type distribution ({len(plant_counts)} types):")
    print(plant_counts)

    # Show a bar chart
    plt.figure(figsize=(12, 6))
    sns.barplot(x=plant_counts.index, y=plant_counts.values)
    plt.title('Number of Documents by Plant Type')
    plt.xlabel('Plant Type')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
# Process embeddings for visualization: project the stored vectors to 2D
# (PCA to at most 50 dimensions, then UMAP) and store the result in df['x'] / df['y'].
embeddings_raw = documents.get('embeddings')

# Test for None / length explicitly: when the vectors arrive as a NumPy array,
# `array and ...` raises "truth value of an array ... is ambiguous".
if embeddings_raw is not None and len(embeddings_raw) > 0:
    print("Processing embeddings...")

    # Check if embeddings are None or empty (only the first five are sampled)
    if all(e is None or len(e) == 0 for e in embeddings_raw[:5]):
        print("ERROR: Embeddings are present but contain empty values")

        # Try recreating embeddings using the loaded data
        print("Recreating embeddings from documents...")
        texts = documents['documents']

        # Embed in batches so the model is called once per 100 texts instead
        # of once per text; progress output matches the per-100 cadence.
        batch_size = 100
        new_embeddings = []
        for start in range(0, len(texts), batch_size):
            print(f"Processing {start}/{len(texts)}...")
            new_embeddings.extend(
                embeddings.embed_documents(texts[start:start + batch_size])
            )

        # Replace the empty embeddings
        documents['embeddings'] = new_embeddings
        embeddings_raw = new_embeddings

    # Check first embedding to understand format
    print(f"First embedding type: {type(embeddings_raw[0])}")
    print(f"First embedding length: {len(embeddings_raw[0])}")

    # Convert to proper numpy array
    embeddings_array = np.array(embeddings_raw)

    # Check if we need to convert from 1D array of arrays to 2D array
    if embeddings_array.ndim == 1:
        print("Converting to 2D array...")
        embeddings_array = np.vstack(embeddings_array)

    print(f"Embeddings shape: {embeddings_array.shape}")

    n_samples, n_features = embeddings_array.shape

    if n_samples < 3:
        # With fewer than 3 points n_neighbors would drop below 2, which UMAP
        # rejects; skip the projection so the plotting cell reports it cleanly.
        print(f"ERROR: Need at least 3 embeddings for a UMAP projection, found {n_samples}")
    else:
        # Reduce dimensions with UMAP for visualization
        # First use PCA if dimension is too high
        if n_features > 50:
            # PCA cannot produce more components than there are samples
            n_components = min(50, n_samples)
            # Seeded like the UMAP step so repeated runs give the same layout
            pca = PCA(n_components=n_components, random_state=42)
            embeddings_reduced = pca.fit_transform(embeddings_array)
            print(f"Reduced with PCA from {n_features} to {n_components} dimensions")
        else:
            embeddings_reduced = embeddings_array

        # Apply UMAP for final 2D visualization
        n_neighbors = min(30, n_samples - 1)
        reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=0.1, n_components=2, random_state=42)
        embedding_2d = reducer.fit_transform(embeddings_reduced)

        # Add coordinates to DataFrame
        df['x'] = embedding_2d[:, 0]
        df['y'] = embedding_2d[:, 1]

        print("Visualization coordinates created successfully")
else:
    print("\nERROR: No embeddings found in documents!")


In [None]:
# Create interactive visualization: scatter plot of the 2D coordinates
if 'x' in df.columns and 'y' in df.columns:
    # The colour/hover columns come from document metadata and are only added
    # to df when metadata was available. Pass only the ones that exist so
    # px.scatter does not fail on a missing column name.
    optional_args = {}
    if 'plant_type' in df.columns:
        optional_args['color'] = 'plant_type'
        optional_args['hover_name'] = 'plant_type'
    if 'source' in df.columns:
        optional_args['hover_data'] = ['source']

    # Create plotly visualization
    fig = px.scatter(
        df, x='x', y='y',
        title='Plant Document Embeddings',
        labels={'x': 'UMAP Dimension 1', 'y': 'UMAP Dimension 2'},
        height=800, width=1000,
        **optional_args,
    )

    # Update layout: horizontal legend centred below the plot area
    fig.update_layout(
        legend_title_text='Plant Type',
        title_font_size=20,
        legend=dict(orientation="h", yanchor="bottom", y=-0.1, xanchor="center", x=0.5)
    )

    # Show the plot
    fig.show()
else:
    print("Cannot create visualization - coordinates not available")


In [None]:
# Create widget to explore documents by plant type
from IPython.display import display
import ipywidgets as widgets

# The explorer is only built when the DataFrame carries a plant_type column
if 'plant_type' in df.columns:
    # Output area that receives the text for the selected plant type
    output = widgets.Output()

    # Dropdown listing every distinct plant type in sorted order
    plant_selector = widgets.Dropdown(
        options=sorted(df['plant_type'].unique()),
        description='Plant Type:',
        disabled=False,
    )

    def show_plant_docs(plant):
        """Print a summary for one plant type into the output widget.

        Shows the number of matching rows, up to five source file names
        (base names only), and up to three text samples, each cut to
        300 characters followed by "..." when longer.
        """
        with output:
            output.clear_output()

            # Rows belonging to the requested plant type
            matching = df.loc[df['plant_type'] == plant]
            print(f"\nFound {len(matching)} documents for plant type: {plant}")

            # Distinct sources, listing at most the first five
            distinct_sources = matching['source'].unique()
            print(f"\nSources ({len(distinct_sources)}):")
            for rank, src in enumerate(distinct_sources[:5], start=1):
                print(f"  {rank}. {os.path.basename(src)}")

            hidden = len(distinct_sources) - 5
            if hidden > 0:
                print(f"  ... and {hidden} more")

            # Nothing more to show when no rows matched
            if matching.empty:
                return

            print("\nSample text:")
            for rank, body in enumerate(matching['text'].iloc[:3], start=1):
                print(f"\nDocument {rank}:")
                if len(body) > 300:
                    print(body[:300] + "...")
                else:
                    print(body)

    def on_change(change):
        """Refresh the summary whenever the dropdown value changes."""
        show_plant_docs(change.new)

    # Re-render on every change of the dropdown's value trait
    plant_selector.observe(on_change, names='value')

    # Display the controls
    display(plant_selector)
    display(output)

    # Render the first plant type right away
    show_plant_docs(plant_selector.options[0])
