In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display
from scipy.stats import mannwhitneyu

# Global plot styling: seaborn "whitegrid" theme and a default figure size.
# Cells that pass their own `figsize` to plt.subplots override the default.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 8)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load Data

In [None]:
# Load cluster assignments, structural features, PCA coordinates and
# summation-test results. All four are joined on `khipu_id` below.
clusters = pd.read_csv("../data/processed/cluster_assignments_kmeans.csv")
features = pd.read_csv("../data/processed/graph_structural_features.csv")
pca = pd.read_csv("../data/processed/cluster_pca_coordinates.csv")
summation = pd.read_csv("../data/processed/summation_test_results.csv")

# Load provenance from CSV
hierarchy = pd.read_csv("../data/processed/cord_hierarchy.csv")

# Keep exactly ONE provenance row per khipu. De-duplicating on the
# (KHIPU_ID, PROVENANCE) pair is not enough: a khipu whose rows carry both a
# value and a missing value (or two spellings) would survive as several rows,
# and the left merge below would then duplicate that khipu in `data`.
# Missing values are dropped first so a real provenance wins over NaN; khipus
# without any provenance simply get no match and are labelled 'Unknown' later.
provenance = (
    hierarchy[['KHIPU_ID', 'PROVENANCE']]
    .dropna(subset=['PROVENANCE'])
    .drop_duplicates(subset='KHIPU_ID')
)

# Merge all data. Clusters/features/PCA use inner joins (a khipu must appear
# in all three); provenance and summation results are optional (left joins).
data = clusters.merge(features, on='khipu_id').merge(
    pca, on='khipu_id'
).merge(
    provenance, left_on='khipu_id', right_on='KHIPU_ID', how='left'
).merge(
    summation, on='khipu_id', how='left'
)

data['PROVENANCE'] = data['PROVENANCE'].fillna('Unknown')

num_khipus = len(data)
# NOTE: this counts every column of the merged frame, including the ID and
# cluster columns, not only the structural feature columns.
num_features = len(data.columns)
clusters_list = sorted(data['cluster'].unique())
num_provenances = data['PROVENANCE'].nunique()

print(f"✓ Loaded {num_khipus} khipus with {num_features} features")
print(f"✓ Clusters: {clusters_list}")
print(f"✓ Provenances: {num_provenances}")

## 2. Cluster Overview

In [None]:
# Cluster summary statistics.
# Named aggregation ties each output label to its source column explicitly,
# instead of renaming the aggregated columns by position afterwards.
cluster_summary_raw = data.groupby('cluster').agg(**{
    'Count': ('khipu_id', 'count'),
    'Avg Size': ('num_nodes', 'mean'),
    'Avg Depth': ('depth', 'mean'),
    'Avg Branch': ('avg_branching', 'mean'),
    'Numeric %': ('has_numeric', 'mean'),
    'Summation %': ('has_pendant_summation', 'mean'),
})

percent_columns = ['Numeric %', 'Summation %']

cluster_summary = cluster_summary_raw.round(2)
# Convert the fractions to percentages from the UNROUNDED means. Rounding to
# two decimals first and then multiplying by 100 would leave only whole-percent
# resolution, making the one-decimal display misleading (0.456 -> 46.0).
cluster_summary[percent_columns] = (cluster_summary_raw[percent_columns] * 100).round(1)

print("\nCluster Summary:")
display(cluster_summary)

## 3. Interactive Cluster Filter

In [None]:
# Create interactive widgets.
# Default selections are derived from the loaded data rather than hard-coded:
# a selection widget rejects a `value` entry that is not among its `options`,
# so fixed defaults break as soon as the clustering or provenance list changes.
cluster_options = sorted(data['cluster'].unique())
cluster_select = widgets.SelectMultiple(
    options=cluster_options,
    value=tuple(cluster_options),  # start with every cluster selected
    description='Clusters:',
    disabled=False
)

provenance_options = sorted(data['PROVENANCE'].unique())
# Preferred initial selection; only entries that actually occur are used.
preferred_provenances = ('Incahuasi', 'Pachacamac', 'Unknown')
default_provenances = tuple(
    name for name in preferred_provenances if name in provenance_options
)
if not default_provenances:
    # None of the preferred names exist in this dataset: select everything.
    default_provenances = tuple(provenance_options)

provenance_select = widgets.SelectMultiple(
    options=provenance_options,
    value=default_provenances,
    description='Provenance:',
    disabled=False,
    rows=8
)

max_nodes = int(data['num_nodes'].max())
# Initial upper bound of the size filter. Khipus larger than this stay hidden
# until the slider is widened; it is capped so it never exceeds the slider max.
default_max_nodes = min(1200, max_nodes)

size_slider = widgets.IntRangeSlider(
    value=[0, default_max_nodes],
    min=0,
    max=max_nodes,
    step=10,
    description='Size (nodes):',
    continuous_update=False
)

# `has_numeric` is filtered as a fraction in [0, 1].
numeric_slider = widgets.FloatRangeSlider(
    value=[0.0, 1.0],
    min=0.0,
    max=1.0,
    step=0.1,
    description='Numeric %:',
    continuous_update=False
)

def filter_data(selected_clusters, selected_provenances, size_range, numeric_range):
    """Filter the notebook-level `data` frame and visualise the selection.

    Parameters
    ----------
    selected_clusters : iterable
        Cluster labels to keep.
    selected_provenances : iterable of str
        PROVENANCE values to keep.
    size_range : (low, high)
        Inclusive bounds on `num_nodes`.
    numeric_range : (low, high)
        Inclusive bounds on `has_numeric`.

    Returns
    -------
    pandas.DataFrame
        The rows of `data` matching all four filters. Rows with a missing
        `num_nodes` or `has_numeric` never match.

    Side effects: prints the match count, draws a two-panel figure (PCA
    scatter and size-vs-depth scatter) and displays per-cluster statistics.
    When nothing matches, only a message is printed.
    """
    mask = (
        data['cluster'].isin(selected_clusters)
        & data['PROVENANCE'].isin(selected_provenances)
        & data['num_nodes'].between(size_range[0], size_range[1])
        & data['has_numeric'].between(numeric_range[0], numeric_range[1])
    )
    filtered = data[mask]

    # Guard the percentage against an empty source frame.
    filter_pct = len(filtered) / len(data) * 100 if len(data) else 0.0
    print(f"\n{len(filtered)} khipus match filters ({filter_pct:.1f}%)")

    if filtered.empty:
        # Nothing to plot: skip the empty axes, the legend warning and the
        # colorbar over an empty collection.
        print("No khipus match the current filters; widen the selection to see plots.")
        return filtered

    # One colour per cluster, normalised over ALL clusters in `data` so a
    # cluster keeps its colour whatever the current selection is, and both
    # panels agree with each other.
    cmap = plt.get_cmap('tab10')
    norm = plt.Normalize(vmin=data['cluster'].min(), vmax=data['cluster'].max())

    # PCA scatter plot
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
    fig.suptitle(f"Filtered Khipus Analysis ({filter_pct:.1f}% of dataset)", fontsize=15, fontweight='bold', y=1.02)

    # By cluster
    for cluster in sorted(filtered['cluster'].unique()):
        cluster_data = filtered[filtered['cluster'] == cluster]
        cluster_count = len(cluster_data)
        ax1.scatter(cluster_data['pc1'], cluster_data['pc2'],
                    label=f'Cluster {cluster} (n={cluster_count})',
                    color=cmap(float(norm(cluster))),
                    alpha=0.6, s=50)

    # NOTE: the explained-variance percentages in the axis labels are
    # hard-coded; update them if the PCA is recomputed.
    ax1.set_xlabel('PC1 (45.7% variance)', fontsize=12)
    ax1.set_ylabel('PC2 (16.1% variance)', fontsize=12)
    ax1.set_title('Filtered Khipus by Cluster', fontsize=14, fontweight='bold')
    ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax1.grid(True, alpha=0.3)

    # Size vs depth
    scatter = ax2.scatter(filtered['num_nodes'], filtered['depth'],
                          c=filtered['cluster'], cmap=cmap, norm=norm,
                          alpha=0.6, s=50)
    ax2.set_xlabel('Size (nodes)', fontsize=12)
    ax2.set_ylabel('Hierarchy Depth', fontsize=12)
    ax2.set_title('Size vs Depth', fontsize=14, fontweight='bold')
    ax2.grid(True, alpha=0.3)
    plt.colorbar(scatter, ax=ax2, label='Cluster')

    plt.tight_layout()
    plt.show()

    # Summary statistics
    summary = filtered.groupby('cluster').agg({
        'khipu_id': 'count',
        'num_nodes': ['mean', 'std'],
        'depth': ['mean', 'std'],
        'has_pendant_summation': 'mean'
    }).round(2)

    print("\nFiltered Cluster Statistics:")
    display(summary)

    return filtered

# Bind each filter_data argument to the widget that supplies it, then lay the
# controls out above the output area that receives the plots and tables.
filter_controls = {
    'selected_clusters': cluster_select,
    'selected_provenances': provenance_select,
    'size_range': size_slider,
    'numeric_range': numeric_slider,
}
output = widgets.interactive_output(filter_data, filter_controls)

selector_row = widgets.HBox([cluster_select, provenance_select])
dashboard = widgets.VBox([selector_row, size_slider, numeric_slider, output])
display(dashboard)

## 4. Cluster Comparison

In [None]:
# Compare two clusters side-by-side.
# The initial selections prefer clusters 0 and 3 but fall back to the first /
# last available cluster, because a Dropdown rejects a `value` that is not
# among its `options`.
comparison_cluster_options = sorted(data['cluster'].unique())
default_cluster_a = 0 if 0 in comparison_cluster_options else comparison_cluster_options[0]
default_cluster_b = 3 if 3 in comparison_cluster_options else comparison_cluster_options[-1]

cluster_a = widgets.Dropdown(
    options=comparison_cluster_options,
    value=default_cluster_a,
    description='Cluster A:'
)

cluster_b = widgets.Dropdown(
    options=comparison_cluster_options,
    value=default_cluster_b,
    description='Cluster B:'
)

def compare_clusters(cluster_a_val, cluster_b_val):
    """Plot and test the feature distributions of two clusters.

    Parameters
    ----------
    cluster_a_val, cluster_b_val
        Cluster labels to compare; rows are taken from the notebook-level
        `data` frame.

    Draws a 2x3 grid of violin plots (one per feature) and prints a two-sided
    Mann-Whitney U p-value for each feature. Missing values are dropped per
    feature; a feature with no remaining values in either cluster is reported
    as "n/a" instead of raising. The p-values are not corrected for multiple
    comparisons.
    """
    data_a = data[data['cluster'] == cluster_a_val]
    data_b = data[data['cluster'] == cluster_b_val]

    features_to_compare = [
        ('num_nodes', 'Size (nodes)'),
        ('depth', 'Hierarchy Depth'),
        ('avg_branching', 'Avg Branching'),
        ('has_numeric', 'Numeric Coverage'),
        ('pendant_match_rate', 'Summation Match Rate'),
        ('num_white_boundaries', 'White Boundaries')
    ]

    # Drop missing values once per feature; reused by the plots and the tests.
    samples = {
        feature: (data_a[feature].dropna(), data_b[feature].dropna())
        for feature, _ in features_to_compare
    }

    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    fig.suptitle(f'Cluster {cluster_a_val} (n={len(data_a)}) vs Cluster {cluster_b_val} (n={len(data_b)})',
                 fontsize=16, fontweight='bold')

    for ax, (feature, label) in zip(axes.flat, features_to_compare):
        sample_a, sample_b = samples[feature]

        # violinplot cannot draw an empty dataset, so only pass the
        # non-empty samples and keep their x positions (1 = A, 2 = B).
        drawable = [(pos, s) for pos, s in ((1, sample_a), (2, sample_b)) if len(s) > 0]
        if drawable:
            ax.violinplot([s for _, s in drawable],
                          positions=[pos for pos, _ in drawable],
                          showmeans=True, showmedians=True)
        else:
            ax.text(0.5, 0.5, 'No data', ha='center', va='center',
                    transform=ax.transAxes)

        # Fixed limits keep both tick labels visible even if one side is missing.
        ax.set_xlim(0.5, 2.5)
        ax.set_xticks([1, 2])
        ax.set_xticklabels([f'C{cluster_a_val}', f'C{cluster_b_val}'])
        ax.set_ylabel(label, fontsize=11)
        ax.grid(True, alpha=0.3)

        # Add statistics
        mean_a = data_a[feature].mean()
        mean_b = data_b[feature].mean()
        ax.set_title(f'Mean: {mean_a:.2f} vs {mean_b:.2f}', fontsize=10)

    plt.tight_layout()
    plt.show()

    # Statistical comparison
    print("\nStatistical Comparison (Mann-Whitney U test):")
    for feature, label in features_to_compare:
        sample_a, sample_b = samples[feature]
        if sample_a.empty or sample_b.empty:
            print(f"  {label}: n/a (no data in at least one cluster)")
            continue
        try:
            # `alternative` is explicit because its default differs between
            # SciPy versions.
            _, pval = mannwhitneyu(sample_a, sample_b, alternative='two-sided')
        except ValueError as exc:
            # Some SciPy versions refuse degenerate input (e.g. all values identical).
            print(f"  {label}: n/a ({exc})")
            continue
        sig = "***" if pval < 0.001 else "**" if pval < 0.01 else "*" if pval < 0.05 else "ns"
        print(f"  {label}: p={pval:.4f} {sig}")

# Connect the two dropdowns to compare_clusters and show them above the
# output area that receives the figure and the printed test results.
comparison_controls = {'cluster_a_val': cluster_a, 'cluster_b_val': cluster_b}
output2 = widgets.interactive_output(compare_clusters, comparison_controls)

cluster_pickers = widgets.HBox([cluster_a, cluster_b])
display(cluster_pickers)
display(output2)

## 5. Export Filtered Data

In [None]:
# Helper for saving a (filtered) khipu table next to the other processed data
def export_filtered_data(filtered_data, filename="filtered_khipus.csv"):
    """Write `filtered_data` to ../data/processed/<filename> as CSV.

    The index is not written. Prints a confirmation line with the number of
    rows exported and the destination path.
    """
    destination = "../data/processed/{}".format(filename)
    row_count = len(filtered_data)
    filtered_data.to_csv(destination, index=False)
    print(f"✓ Exported {row_count} khipus to {destination}")

# Example: Export all Cluster 0 khipus
# cluster_0 = data[data['cluster'] == 0]
# export_filtered_data(cluster_0, "cluster_0_incahuasi.csv")

## 6. Explore Specific Khipus

In [None]:
# Text box holding the ID of the khipu to inspect (pre-filled with an example)
khipu_id_input = widgets.Text(
    description='Khipu ID:',
    placeholder='Enter khipu ID',
    value='1000606',
)

def show_khipu_details(khipu_id):
    """Print cluster, provenance, structure, summation and PCA details for one khipu.

    Parameters
    ----------
    khipu_id : str or int
        ID to look up in the notebook-level `data` frame. The text widget
        always supplies a string, while `khipu_id` values read from CSV are
        typically numeric, so both sides are compared as strings; a plain
        `==` between an integer column and a string never matches.

    Prints a "not found" message when no row matches. If several rows match,
    the first one is shown.
    """
    lookup_id = str(khipu_id).strip()
    if not lookup_id:
        print("Enter a khipu ID")
        return

    # NOTE(review): assumes IDs render without a decimal part (e.g. '1000606');
    # a float-typed ID column would render as '1000606.0' -- confirm dtype.
    matches = data[data['khipu_id'].astype(str) == lookup_id]

    if matches.empty:
        print(f"Khipu {khipu_id} not found")
        return

    khipu = matches.iloc[0]

    print("=" * 60)
    print(f"KHIPU {khipu_id}")
    print("=" * 60)
    print(f"Cluster: {khipu['cluster']}")
    print(f"Provenance: {khipu['PROVENANCE']}")
    print("\nStructure:")
    print(f"  Size: {khipu['num_nodes']} nodes")
    print(f"  Depth: {khipu['depth']} levels")
    print(f"  Avg Branching: {khipu['avg_branching']:.2f}")
    print(f"  Numeric Coverage: {khipu['has_numeric']*100:.1f}%")
    print("\nSummation:")
    print(f"  Has Summation: {khipu['has_pendant_summation']}")
    print(f"  Match Rate: {khipu['pendant_match_rate']:.3f}")
    print(f"  White Boundaries: {khipu['num_white_boundaries']}")
    print("\nPCA Coordinates:")
    print(f"  PC1: {khipu['pc1']:.3f}")
    print(f"  PC2: {khipu['pc2']:.3f}")

# Feed the text box value to show_khipu_details and render its printout
# directly below the input field.
output3 = widgets.interactive_output(
    show_khipu_details,
    {'khipu_id': khipu_id_input},
)
display(khipu_id_input)
display(output3)