# Cluster chunks of AI conversations

In [3]:
import duckdb

conn = duckdb.connect("augmcp_v0.duckdb")



In [4]:
# Step 1: Pull embeddings from DuckDB
import numpy as np
import pandas as pd
import faiss
import math
import plotly.express as px
import plotly.graph_objects as go
from umap import UMAP
import hdbscan
from datetime import datetime

# Get the full dataset with embeddings
query = """
SELECT
  doc_id,
  thread_id,
  thread_name,
  role,
  content,
  embedding,
  ts_ingest
FROM vw_chunks_with_name
"""

df = conn.execute(query).df()
print(f"Total chunks: {len(df)}")
print(f"Sample of data:")
print(df[['doc_id', 'thread_name', 'role', 'ts_ingest']].head())


Total chunks: 11284
Sample of data:
                                              doc_id  \
0  b4fa5efd-9e1c-4e4b-a8fe-ae1dba7ef312-003-17169...   
1  b4fa5efd-9e1c-4e4b-a8fe-ae1dba7ef312-000-17169...   
2  b4fa5efd-9e1c-4e4b-a8fe-ae1dba7ef312-004-17169...   
3  b4fa5efd-9e1c-4e4b-a8fe-ae1dba7ef312-002-17169...   
4  b4fa5efd-9e1c-4e4b-a8fe-ae1dba7ef312-001-17169...   

                                 thread_name       role  \
0  Monetizing Creativity with Blockchain AGI  assistant   
1  Monetizing Creativity with Blockchain AGI      human   
2  Monetizing Creativity with Blockchain AGI      human   
3  Monetizing Creativity with Blockchain AGI  assistant   
4  Monetizing Creativity with Blockchain AGI      human   

                   ts_ingest  
0 2024-05-28 09:07:32.757438  
1 2024-05-28 09:07:32.757438  
2 2024-05-28 09:11:21.515193  
3 2024-05-28 09:06:00.009748  
4 2024-05-28 09:06:00.009748  


In [5]:
# Extract embeddings matrix and IDs
E = np.stack(df['embedding'].values)  # (N, 1536) float32
ids = df['doc_id'].values
print(f"Embeddings shape: {E.shape}")
print(f"Embeddings dtype: {E.dtype}")

# Create a metadata dataframe for later use
metadata_df = df[['doc_id', 'thread_id', 'thread_name', 'role', 'content', 'ts_ingest']].copy()
metadata_df['content_preview'] = metadata_df['content'].str[:200] + '...'
print(f"Metadata shape: {metadata_df.shape}")


Embeddings shape: (11284, 1536)
Embeddings dtype: float64
Metadata shape: (11284, 7)


In [6]:
# Step 2: Build a FAISS k-NN index
d = E.shape[1]
print(f"Embedding dimension: {d}")

# Normalize for cosine similarity
E_norm = E.copy().astype('float32')
faiss.normalize_L2(E_norm)

# Create HNSW index for fast nearest neighbor search
index = faiss.IndexHNSWFlat(d, 32)  # 32 is the number of connections
index.add(E_norm)

print(f"FAISS index built with {index.ntotal} vectors")

# Test the index with a query
distances, indices = index.search(E_norm[:1], 5)  # Find 5 nearest neighbors of first vector
print(f"Sample nearest neighbors for first vector: {indices[0]}")
print(f"Sample distances: {distances[0]}")


Embedding dimension: 1536
FAISS index built with 11284 vectors
Sample nearest neighbors for first vector: [  0 918 761 922 763]
Sample distances: [0.         0.43833473 0.44525683 0.45389473 0.49467307]


In [7]:
# Step 3: Derive clusters

# Option A: FAISS k-means
k = round(math.sqrt(len(E)))  # √N heuristic
print(f"Using k={k} clusters (√N heuristic)")

# Run k-means clustering
km = faiss.Kmeans(d, k, niter=25, nredo=2, verbose=True)
km.train(E_norm)
_, labels_kmeans = km.index.search(E_norm, 1)
labels_kmeans = labels_kmeans.flatten()

print(f"K-means clustering complete")
print(f"Number of clusters: {len(np.unique(labels_kmeans))}")
print(f"Cluster distribution: {np.bincount(labels_kmeans)}")

# Option B: HDBSCAN (for comparison)
print("\nRunning HDBSCAN for comparison...")
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean')
labels_hdbscan = clusterer.fit_predict(E_norm)

print(f"HDBSCAN clustering complete")
print(f"Number of clusters (including noise): {len(np.unique(labels_hdbscan))}")
print(f"Number of noise points: {np.sum(labels_hdbscan == -1)}")

# Use k-means labels as the primary clustering approach
labels = labels_kmeans
print(f"\nUsing k-means labels as primary clustering approach")


Using k=106 clusters (√N heuristic)
Clustering 11284 points in 1536D to 106 clusters, redo 2 times, 25 iterations
  Preprocessing in 0.01 s
Outer iteration 0 / 2
  Iteration 24 (1.01 s, search 0.93 s): objective=5681.04 imbalance=1.294 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 2
  Iteration 24 (2.04 s, search 1.88 s): objective=5685.63 imbalance=1.246 nsplit=0       
K-means clustering complete
Number of clusters: 106
Cluster distribution: [120  80 210 122 121 179 141  79 131 123  50 145  78 182 248 106  56  83
  17 170  43 124  92  80  76  17  75 176 164 150  38  44  45  86 147  59
  79 190  25  77  92  58 121 112 129 184 258  15  37  92  93  81  21 141
  68 240  59 204  98  29 226 122  51  64 112  59 113 101 154 264 133  51
 115  48  64  96 120 164 138 136  87  70 147  56  30 103  21 123 110  68
  95  46  92 109  82  41 199 143  63  69 220 189  98 197  52  83]

Running HDBSCAN for comparison...




HDBSCAN clustering complete
Number of clusters (including noise): 89
Number of noise points: 9772

Using k-means labels as primary clustering approach


In [10]:
# Step 4: Persist cluster labels in DuckDB (both k-means and HDBSCAN)
# **FIX: Convert numpy int64 to Python native integers for DuckDB**

# Drop existing table if it exists
conn.execute("DROP TABLE IF EXISTS chunk_cluster")

# Create the cluster table with cluster_type column
conn.execute("""
CREATE TABLE chunk_cluster (
    doc_id TEXT,
    cluster INT,
    cluster_type TEXT
)
""")

# Insert k-means cluster labels - FIXED: Convert numpy int64 to Python int
kmeans_data = list(zip(ids, [int(x) for x in labels_kmeans], ['kmeans'] * len(ids)))
conn.executemany(
    "INSERT INTO chunk_cluster VALUES (?, ?, ?)",
    kmeans_data
)

# Insert HDBSCAN cluster labels - FIXED: Convert numpy int64 to Python int  
hdbscan_data = list(zip(ids, [int(x) for x in labels_hdbscan], ['hdbscan'] * len(ids)))
conn.executemany(
    "INSERT INTO chunk_cluster VALUES (?, ?, ?)",
    hdbscan_data
)

print(f"Inserted {len(kmeans_data)} k-means cluster labels into DuckDB")
print(f"Inserted {len(hdbscan_data)} HDBSCAN cluster labels into DuckDB")

# Verify the insertion
verification = conn.execute("""
SELECT cluster_type, cluster, COUNT(*) as count 
FROM chunk_cluster 
GROUP BY cluster_type, cluster 
ORDER BY cluster_type, cluster
""").df()

print("\nCluster distribution in DuckDB:")
print(verification.head(15))

# Summary by cluster type
summary = conn.execute("""
SELECT 
    cluster_type,
    COUNT(DISTINCT cluster) as num_clusters,
    COUNT(*) as total_chunks
FROM chunk_cluster 
GROUP BY cluster_type
""").df()

print("\nClustering method comparison:")
print(summary)


Inserted 11284 k-means cluster labels into DuckDB
Inserted 11284 HDBSCAN cluster labels into DuckDB

Cluster distribution in DuckDB:
   cluster_type  cluster  count
0       hdbscan       -1   9772
1       hdbscan        0      5
2       hdbscan        1      7
3       hdbscan        2     12
4       hdbscan        3     12
5       hdbscan        4      7
6       hdbscan        5      5
7       hdbscan        6      5
8       hdbscan        7      7
9       hdbscan        8      8
10      hdbscan        9      7
11      hdbscan       10      7
12      hdbscan       11     28
13      hdbscan       12      6
14      hdbscan       13     14

Clustering method comparison:
  cluster_type  num_clusters  total_chunks
0      hdbscan            89         11284
1       kmeans           106         11284


In [11]:
# Step 6: Create 2D visualization with UMAP and plotly

print("Computing UMAP 2D projection...")
# Create UMAP projection for visualization
umap_reducer = UMAP(
    n_components=2, 
    metric='cosine', 
    n_neighbors=15, 
    min_dist=0.1, 
    random_state=42
)

# Fit UMAP on the normalized embeddings
xy = umap_reducer.fit_transform(E_norm)

print(f"UMAP projection shape: {xy.shape}")

# Create visualization dataframes for both clustering methods
viz_df_kmeans = metadata_df.copy()
viz_df_kmeans['cluster'] = labels_kmeans
viz_df_kmeans['cluster_type'] = 'kmeans'
viz_df_kmeans['x'] = xy[:, 0]
viz_df_kmeans['y'] = xy[:, 1]

viz_df_hdbscan = metadata_df.copy()
viz_df_hdbscan['cluster'] = labels_hdbscan
viz_df_hdbscan['cluster_type'] = 'hdbscan'
viz_df_hdbscan['x'] = xy[:, 0]
viz_df_hdbscan['y'] = xy[:, 1]

# Add simple cluster labels (no LLM-generated labels)
viz_df_kmeans['cluster_label'] = viz_df_kmeans['cluster'].map(
    lambda x: f"K-means Cluster {x}"
)

# Add cluster labels for HDBSCAN
viz_df_hdbscan['cluster_label'] = viz_df_hdbscan['cluster'].map(
    lambda x: "Noise" if x == -1 else f"HDBSCAN Cluster {x}"
)

# Format the timestamp for better display
viz_df_kmeans['formatted_date'] = pd.to_datetime(viz_df_kmeans['ts_ingest']).dt.strftime('%Y-%m-%d %H:%M')
viz_df_hdbscan['formatted_date'] = pd.to_datetime(viz_df_hdbscan['ts_ingest']).dt.strftime('%Y-%m-%d %H:%M')

print(f"K-means visualization dataframe shape: {viz_df_kmeans.shape}")
print(f"HDBSCAN visualization dataframe shape: {viz_df_hdbscan.shape}")

# Set default visualization (you can change this to 'hdbscan' if preferred)
CLUSTER_TYPE_TO_VISUALIZE = 'kmeans'  # Change to 'hdbscan' to visualize HDBSCAN results

if CLUSTER_TYPE_TO_VISUALIZE == 'kmeans':
    viz_df = viz_df_kmeans
    print(f"\nUsing K-means clustering for visualization")
else:
    viz_df = viz_df_hdbscan
    print(f"\nUsing HDBSCAN clustering for visualization")

print("Sample of visualization data:")
print(viz_df[['thread_name', 'cluster', 'cluster_label', 'formatted_date']].head())


Computing UMAP 2D projection...


  warn(


UMAP projection shape: (11284, 2)
K-means visualization dataframe shape: (11284, 13)
HDBSCAN visualization dataframe shape: (11284, 13)

Using K-means clustering for visualization
Sample of visualization data:
                                 thread_name  cluster       cluster_label  \
0  Monetizing Creativity with Blockchain AGI       60  K-means Cluster 60   
1  Monetizing Creativity with Blockchain AGI       11  K-means Cluster 11   
2  Monetizing Creativity with Blockchain AGI       11  K-means Cluster 11   
3  Monetizing Creativity with Blockchain AGI       96  K-means Cluster 96   
4  Monetizing Creativity with Blockchain AGI       85  K-means Cluster 85   

     formatted_date  
0  2024-05-28 09:07  
1  2024-05-28 09:07  
2  2024-05-28 09:11  
3  2024-05-28 09:06  
4  2024-05-28 09:06  


In [12]:
# Create interactive plotly visualization

# Create the scatter plot
fig = px.scatter(
    viz_df, 
    x='x', 
    y='y', 
    color='cluster',
    hover_data={
        'thread_name': True,
        'formatted_date': True,
        'role': True,
        'content_preview': True,
        'cluster_label': True,
        'cluster_type': True,
        'x': False,  # Hide coordinates from hover
        'y': False,
        'cluster': False  # Hide raw cluster number
    },
    color_continuous_scale='viridis',
    title=f'AI Conversation Chunks - {CLUSTER_TYPE_TO_VISUALIZE.upper()} Clustering (2D UMAP)',
    labels={
        'x': 'UMAP Dimension 1',
        'y': 'UMAP Dimension 2',
        'cluster': 'Cluster ID'
    },
    width=1000,
    height=700
)

# Customize the hover template
fig.update_traces(
    hovertemplate='<b>%{hovertext}</b><br>' +
                  'Thread: %{customdata[0]}<br>' +
                  'Date: %{customdata[1]}<br>' +
                  'Role: %{customdata[2]}<br>' +
                  'Method: %{customdata[5]}<br>' +
                  'Cluster: %{customdata[4]}<br>' +
                  'Content: %{customdata[3]}<br>' +
                  '<extra></extra>',
    hovertext=viz_df['thread_name'],
    customdata=viz_df[['thread_name', 'formatted_date', 'role', 'content_preview', 'cluster_label', 'cluster_type']].values
)

# Update layout
fig.update_layout(
    title={
        'text': f'AI Conversation Chunks - {CLUSTER_TYPE_TO_VISUALIZE.upper()} Clustering (2D UMAP)',
        'x': 0.5,
        'xanchor': 'center'
    },
    xaxis_title='UMAP Dimension 1',
    yaxis_title='UMAP Dimension 2',
    showlegend=True,
    hovermode='closest'
)

# Show the plot
fig.show()

print(f"\nVisualization complete!")
print(f"Clustering method: {CLUSTER_TYPE_TO_VISUALIZE}")
print(f"Total points: {len(viz_df)}")
print(f"Total clusters: {len(np.unique(viz_df['cluster']))}")
if CLUSTER_TYPE_TO_VISUALIZE == 'hdbscan':
    noise_points = np.sum(viz_df['cluster'] == -1)
    print(f"Noise points (HDBSCAN): {noise_points}")
print(f"Hover over points to see thread name, date, and content preview (up to 200 characters)")



Visualization complete!
Clustering method: kmeans
Total points: 11284
Total clusters: 106
Hover over points to see thread name, date, and content preview (up to 200 characters)


In [13]:
# Step 7: Analysis and Reporting (without concept_nodes)

# Generate cluster size analysis for both methods
print("=== CLUSTER SIZE ANALYSIS ===")

for cluster_type in ['kmeans', 'hdbscan']:
    print(f"\n--- {cluster_type.upper()} CLUSTERING ---")
    
    cluster_sizes = conn.execute("""
    SELECT 
        cluster,
        COUNT(*) AS size
    FROM chunk_cluster
    WHERE cluster_type = ?
    GROUP BY cluster
    ORDER BY size DESC
    """, [cluster_type]).df()
    
    print(f"Top 10 clusters by size ({cluster_type}):")
    print(cluster_sizes.head(10))
    
    # Show large clusters (potential duplicates)
    large_clusters = cluster_sizes[cluster_sizes['size'] > 10]
    if len(large_clusters) > 0:
        print(f"Large clusters (>10 items) - potential duplicate topics:")
        print(large_clusters)
    else:
        print(f"No large clusters found for {cluster_type}")

# Generate weekly trend data for the selected clustering method
weekly_trends = conn.execute("""
SELECT 
    DATE_TRUNC('week', ts_ingest) AS week,
    cluster,
    cluster_type,
    COUNT(*) AS message_count
FROM vw_chunks_with_name v
JOIN chunk_cluster c ON v.doc_id = c.doc_id
WHERE ts_ingest >= '2024-01-01' AND cluster_type = ?
GROUP BY week, cluster, cluster_type
ORDER BY week DESC, message_count DESC
""", [CLUSTER_TYPE_TO_VISUALIZE]).df()

print(f"\n=== WEEKLY TRENDS ({CLUSTER_TYPE_TO_VISUALIZE.upper()}) ===")
print(f"Found {len(weekly_trends)} week-cluster combinations")
print("Sample weekly trends:")
print(weekly_trends.head(10))

# Summary statistics for both methods
print(f"\n=== SUMMARY STATISTICS ===")
print(f"Total conversation chunks: {len(viz_df)}")

# K-means stats
kmeans_clusters = len(np.unique(labels_kmeans))
kmeans_sizes = np.bincount(labels_kmeans)
print(f"\nK-means clustering:")
print(f"  Total clusters: {kmeans_clusters}")
print(f"  Average cluster size: {len(viz_df) / kmeans_clusters:.1f}")
print(f"  Largest cluster size: {np.max(kmeans_sizes)}")
print(f"  Smallest cluster size: {np.min(kmeans_sizes)}")

# HDBSCAN stats
hdbscan_clusters = len(np.unique(labels_hdbscan))
hdbscan_sizes = np.bincount(labels_hdbscan[labels_hdbscan >= 0])  # Exclude noise
noise_count = np.sum(labels_hdbscan == -1)
print(f"\nHDBSCAN clustering:")
print(f"  Total clusters (excluding noise): {hdbscan_clusters - (1 if -1 in labels_hdbscan else 0)}")
print(f"  Noise points: {noise_count}")
if len(hdbscan_sizes) > 0:
    print(f"  Average cluster size: {len(hdbscan_sizes) / len(np.unique(labels_hdbscan[labels_hdbscan >= 0])):.1f}")
    print(f"  Largest cluster size: {np.max(hdbscan_sizes)}")
    print(f"  Smallest cluster size: {np.min(hdbscan_sizes)}")

# Show cluster size distribution for selected method
current_labels = labels_kmeans if CLUSTER_TYPE_TO_VISUALIZE == 'kmeans' else labels_hdbscan
cluster_sizes = np.bincount(current_labels[current_labels >= 0])  # Exclude noise for HDBSCAN
print(f"\nCluster size distribution ({CLUSTER_TYPE_TO_VISUALIZE}):")
print(f"  1-5 chunks: {np.sum((cluster_sizes >= 1) & (cluster_sizes <= 5))} clusters")
print(f"  6-10 chunks: {np.sum((cluster_sizes >= 6) & (cluster_sizes <= 10))} clusters")
print(f"  11-20 chunks: {np.sum((cluster_sizes >= 11) & (cluster_sizes <= 20))} clusters")
print(f"  21+ chunks: {np.sum(cluster_sizes >= 21)} clusters")


=== CLUSTER SIZE ANALYSIS ===

--- KMEANS CLUSTERING ---
Top 10 clusters by size (kmeans):
   cluster  size
0       69   264
1       46   258
2       14   248
3       55   240
4       60   226
5      100   220
6        2   210
7       57   204
8       96   199
9      103   197
Large clusters (>10 items) - potential duplicate topics:
     cluster  size
0         69   264
1         46   258
2         14   248
3         55   240
4         60   226
..       ...   ...
101       52    21
102       86    21
103       18    17
104       25    17
105       47    15

[106 rows x 2 columns]

--- HDBSCAN CLUSTERING ---
Top 10 clusters by size (hdbscan):
   cluster  size
0       -1  9772
1       45   410
2       17    65
3       79    50
4       84    46
5       68    41
6       35    38
7       63    32
8       87    31
9       62    29
Large clusters (>10 items) - potential duplicate topics:
    cluster  size
0        -1  9772
1        45   410
2        17    65
3        79    50
4        84    4

In [14]:
# Step 8: Save results

# Save visualization data
viz_df_kmeans.to_csv('data/cluster_visualization_kmeans.csv', index=False)
viz_df_hdbscan.to_csv('data/cluster_visualization_hdbscan.csv', index=False)
print("Saved visualization data to CSV files")

# Save cluster analysis
cluster_analysis = conn.execute("""
SELECT 
    cluster_type,
    cluster,
    COUNT(*) AS size
FROM chunk_cluster
GROUP BY cluster_type, cluster
ORDER BY cluster_type, size DESC
""").df()
cluster_analysis.to_csv('data/cluster_analysis.csv', index=False)
print("Saved cluster analysis to data/cluster_analysis.csv")

print("\n=== CLUSTERING COMPLETE ===")
print("✅ Both K-means and HDBSCAN clustering applied")
print("✅ Cluster labels persisted to DuckDB (both methods)")
print("✅ 2D visualization created with hover details")
print("✅ Ready for LLM-based concept distillation!")

print(f"\nDatabase tables created:")
print("- chunk_cluster: maps each doc_id to its cluster (with cluster_type)")
print("- Ready for LLM-based concept generation in next notebook!")


Saved visualization data to CSV files
Saved cluster analysis to data/cluster_analysis.csv

=== CLUSTERING COMPLETE ===
✅ Both K-means and HDBSCAN clustering applied
✅ Cluster labels persisted to DuckDB (both methods)
✅ 2D visualization created with hover details
✅ Ready for LLM-based concept distillation!

Database tables created:
- chunk_cluster: maps each doc_id to its cluster (with cluster_type)
- Ready for LLM-based concept generation in next notebook!


# TSNE viz


In [15]:
# Load existing data and confirm CSV recreation capability
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.manifold import TSNE
import duckdb

# Check if the CSV files exist and can recreate the plot
try:
    viz_kmeans = pd.read_csv('data/cluster_visualization_kmeans.csv')
    viz_hdbscan = pd.read_csv('data/cluster_visualization_hdbscan.csv')
    print("✅ CSV files loaded successfully!")
    print(f"K-means data shape: {viz_kmeans.shape}")
    print(f"HDBSCAN data shape: {viz_hdbscan.shape}")
    print(f"Columns: {list(viz_kmeans.columns)}")
    
    # Quick test plot
    print("\n📊 Testing plotly recreation from CSV...")
    fig = px.scatter(viz_kmeans.head(100), x='x', y='y', color='cluster', 
                    title="Test: UMAP visualization from CSV (first 100 points)")
    fig.show()
    print("✅ Plotly visualization works from CSV data!")
    
except FileNotFoundError as e:
    print(f"❌ CSV files not found: {e}")
    print("Need to run clustering first to generate the data.")
    viz_kmeans = None
    viz_hdbscan = None


✅ CSV files loaded successfully!
K-means data shape: (11284, 13)
HDBSCAN data shape: (11284, 13)
Columns: ['doc_id', 'thread_id', 'thread_name', 'role', 'content', 'ts_ingest', 'content_preview', 'cluster', 'cluster_type', 'x', 'y', 'cluster_label', 'formatted_date']

📊 Testing plotly recreation from CSV...


✅ Plotly visualization works from CSV data!


In [16]:
# Generate t-SNE 2D projections and save to CSV

if viz_kmeans is not None:
    print("🔄 Generating t-SNE projections...")
    
    # Connect to DuckDB and load embeddings
    conn = duckdb.connect("augmcp_v0.duckdb")
    
    # Get embeddings and metadata
    query = """
    SELECT doc_id, embedding
    FROM vw_chunks_with_name
    ORDER BY doc_id
    """
    embedding_df = conn.execute(query).df()
    
    # Extract embeddings matrix
    E = np.stack(embedding_df['embedding'].values)
    print(f"Loaded embeddings shape: {E.shape}")
    
    # Generate t-SNE projections (this may take a few minutes)
    print("⏳ Computing t-SNE (this may take 2-3 minutes)...")
    tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_iter=1000, verbose=1)
    tsne_coords = tsne.fit_transform(E)
    
    print(f"✅ t-SNE projection complete! Shape: {tsne_coords.shape}")
    
    # Add t-SNE coordinates to existing dataframes
    viz_kmeans_tsne = viz_kmeans.copy()
    viz_hdbscan_tsne = viz_hdbscan.copy()
    
    # Replace UMAP coordinates with t-SNE coordinates
    viz_kmeans_tsne['x'] = tsne_coords[:, 0]
    viz_kmeans_tsne['y'] = tsne_coords[:, 1]
    viz_hdbscan_tsne['x'] = tsne_coords[:, 0] 
    viz_hdbscan_tsne['y'] = tsne_coords[:, 1]
    
    # Update cluster labels to indicate t-SNE
    viz_kmeans_tsne['cluster_label'] = viz_kmeans_tsne['cluster'].map(
        lambda x: f"K-means Cluster {x} (t-SNE)"
    )
    viz_hdbscan_tsne['cluster_label'] = viz_hdbscan_tsne['cluster'].map(
        lambda x: "Noise (t-SNE)" if x == -1 else f"HDBSCAN Cluster {x} (t-SNE)"
    )
    
    # Save t-SNE CSV files
    viz_kmeans_tsne.to_csv('data/cluster_visualization_kmeans_tsne.csv', index=False)
    viz_hdbscan_tsne.to_csv('data/cluster_visualization_hdbscan_tsne.csv', index=False)
    
    print("✅ Saved t-SNE visualization data:")
    print("  - data/cluster_visualization_kmeans_tsne.csv")
    print("  - data/cluster_visualization_hdbscan_tsne.csv")
    
    # Quick test of t-SNE plot
    print("\n📊 Testing t-SNE visualization...")
    fig_tsne = px.scatter(
        viz_kmeans_tsne.head(100), 
        x='x', y='y', 
        color='cluster',
        title="Test: t-SNE visualization from CSV (first 100 points)",
        labels={'x': 't-SNE Dimension 1', 'y': 't-SNE Dimension 2'}
    )
    fig_tsne.show()
    print("✅ t-SNE plotly visualization works!")
    
    conn.close()
    
else:
    print("❌ Cannot generate t-SNE: CSV files not available")
    print("Run the clustering notebook first to generate the base data.")


🔄 Generating t-SNE projections...
Loaded embeddings shape: (11284, 1536)
⏳ Computing t-SNE (this may take 2-3 minutes)...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 11284 samples in 0.011s...



'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



[t-SNE] Computed neighbors for 11284 samples in 4.245s...
[t-SNE] Computed conditional probabilities for sample 1000 / 11284
[t-SNE] Computed conditional probabilities for sample 2000 / 11284
[t-SNE] Computed conditional probabilities for sample 3000 / 11284
[t-SNE] Computed conditional probabilities for sample 4000 / 11284
[t-SNE] Computed conditional probabilities for sample 5000 / 11284
[t-SNE] Computed conditional probabilities for sample 6000 / 11284
[t-SNE] Computed conditional probabilities for sample 7000 / 11284
[t-SNE] Computed conditional probabilities for sample 8000 / 11284
[t-SNE] Computed conditional probabilities for sample 9000 / 11284
[t-SNE] Computed conditional probabilities for sample 10000 / 11284
[t-SNE] Computed conditional probabilities for sample 11000 / 11284
[t-SNE] Computed conditional probabilities for sample 11284 / 11284
[t-SNE] Mean sigma: 0.273469
[t-SNE] KL divergence after 250 iterations with early exaggeration: 91.368408
[t-SNE] KL divergence after 

✅ t-SNE plotly visualization works!
