In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from selector import IntraClusterSimilarityFilter
from storage import KnowledgeStore
from clusterer import UMAPHDBSCANClusterer
import config

# Load documents from LanceDB.
# The database location can be overridden with the OPENAUGI_VECTOR_DB
# environment variable so the notebook is not tied to one machine's home
# directory; when the variable is unset the original path is used.
VECTOR_DB_PATH = os.environ.get(
    "OPENAUGI_VECTOR_DB",
    "/Users/chris/repos/openaugi/data/vector_db",
)
store = KnowledgeStore(VECTOR_DB_PATH)
atomic_notes = store.get_all_atomic_notes()
print(f"Loaded {len(atomic_notes)} atomic notes from LanceDB")



In [None]:
# Group the loaded atomic notes into clusters, using a
# UMAPHDBSCANClusterer constructed with its default arguments.
clusterer = UMAPHDBSCANClusterer()
clusters = clusterer.cluster_documents(atomic_notes)

# Report how many clusters came back.
cluster_count = len(clusters)
print(f"Found {cluster_count} clusters")


In [None]:
# Select a cluster to examine (change the index to examine different clusters)
cluster_idx = 0  # First cluster

if len(clusters) == 0:
    print("No clusters found")
else:
    try:
        selected_cluster = clusters[cluster_idx]
    except (IndexError, KeyError):
        # cluster_idx is edited by hand, so report a bad value clearly instead
        # of letting the cell die with a traceback.
        print(f"cluster_idx {cluster_idx} is not a valid cluster "
              f"({len(clusters)} clusters available)")
    else:
        n_docs = len(selected_cluster)
        print(f"Examining cluster {cluster_idx} with {n_docs} documents")

        # Create a similarity filter
        similarity_filter = IntraClusterSimilarityFilter()

        # Compute similarity matrix for the selected cluster.
        # NOTE(review): this calls an underscore-prefixed (private) method of
        # the filter; it may change without notice — confirm there is no
        # public equivalent.
        similarity_matrix = similarity_filter._compute_similarity_matrix(selected_cluster)

        # Print document titles for reference; the printed index matches the
        # tick labels on the heatmap below.
        print("\nDocuments in this cluster:")
        for i, doc in enumerate(selected_cluster):
            title = doc.metadata.get("idea_title", f"Document {i}")
            print(f"{i}: {title}")

        # Visualize the similarity matrix as a heatmap on an explicit Axes so
        # the title and axis labels are attached to this figure only.
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(similarity_matrix, annot=True, cmap='viridis',
                    xticklabels=range(n_docs),
                    yticklabels=range(n_docs),
                    ax=ax)
        ax.set_title(f"Similarity Matrix for Cluster {cluster_idx}")
        ax.set_xlabel("Document index")
        ax.set_ylabel("Document index")
        plt.show()

        # Find similarity groups within this cluster
        similarity_groups = similarity_filter.find_similarity_groups(selected_cluster)
        print(f"\nFound {len(similarity_groups)} similarity groups in this cluster")

        # Print the similarity groups, one titled bullet per document
        for i, group in enumerate(similarity_groups):
            print(f"\nSimilarity Group {i}:")
            for doc in group:
                title = doc.metadata.get("idea_title", "Untitled")
                print(f"- {title}")