# HDBSCAN Parameter Testing

This notebook helps you find optimal HDBSCAN parameters by testing different combinations on a sample of your data.

**Purpose**: Before running full clustering on 27K speeches, test parameters on a smaller sample (5K speeches) to find the best settings.

**What it tests**:
- Different `min_cluster_size` values (30, 50, 75, 100, 150)
- Different `min_samples` values (5, 10, 15, 20)
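- Metrics reported for each combination: number of clusters, outlier percentage, silhouette score (computed on clustered points only; outliers are excluded), and cluster sizes (average, smallest, largest)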

## 1. Setup and Configuration

In [None]:
import os
import warnings
from itertools import product

import hdbscan
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import silhouette_score
# Hide only deprecation-style notices. Other categories (for example a
# RuntimeWarning from an invalid division) stay visible so real problems in
# the analysis are not silently masked.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Set style for better looking plots
sns.set_style('whitegrid')
plt.rcParams['figure.dpi'] = 100

print("✅ Imports complete")

In [None]:
# Configuration
SAMPLE_SIZE = 5000  # Number of speeches to test on (adjust as needed)
KEYWORDS_CSV = "../data/speech_keywords.csv"
EMBEDDINGS_FILE = "../data/keyword_embeddings.npy"
OUTPUT_RESULTS = "../data/hdbscan_parameter_results.csv"
OUTPUT_PLOT = "../data/hdbscan_parameter_test.png"

# Parameters to test
MIN_CLUSTER_SIZES = [30, 50, 75, 100, 150]
MIN_SAMPLES_LIST = [5, 10, 15, 20]
METRICS = ['euclidean']

# Size of the full grid: one run per (min_cluster_size, min_samples, metric).
n_combinations = len(MIN_CLUSTER_SIZES) * len(MIN_SAMPLES_LIST) * len(METRICS)

print("📊 Configuration:")
print(f"   Sample size: {SAMPLE_SIZE:,}")
print(f"   Testing {len(MIN_CLUSTER_SIZES)} × {len(MIN_SAMPLES_LIST)} × {len(METRICS)} = {n_combinations} combinations")

## 2. Load Sample Data

In [None]:
print(f"📥 Loading sample data (n={SAMPLE_SIZE:,})...\n")

# Load CSV. read_csv is called without index_col, so the frame has the default
# 0..n-1 index and its index labels double as row positions in the embeddings.
df = pd.read_csv(KEYWORDS_CSV)
print(f"✅ Loaded CSV: {len(df):,} total speeches")

# Sample randomly (fixed random_state so re-runs pick the same speeches)
if SAMPLE_SIZE < len(df):
    df_sample = df.sample(n=SAMPLE_SIZE, random_state=42)
    indices = df_sample.index.tolist()
    print(f"   Sampled {len(df_sample):,} speeches for testing")
else:
    df_sample = df
    indices = list(range(len(df)))
    print(f"   Using all {len(df_sample):,} speeches")

# Load embeddings. Only the file read is guarded, so unrelated errors further
# down are not mistaken for a missing file.
try:
    embeddings = np.load(EMBEDDINGS_FILE)
except FileNotFoundError:
    print(f"\n❌ Error: Embeddings file not found at {EMBEDDINGS_FILE}")
    print(f"   Please run the main clustering notebook first to generate embeddings.")
    raise

# Rows are matched purely by position, so a row-count mismatch means the
# sample would be paired with the wrong vectors. Fail loudly instead.
if len(embeddings) != len(df):
    raise ValueError(
        f"Embeddings have {len(embeddings):,} rows but the CSV has {len(df):,}; "
        f"regenerate {EMBEDDINGS_FILE} so it lines up with {KEYWORDS_CSV}."
    )

embeddings_sample = embeddings[indices]
print(f"\n✅ Loaded embeddings: {embeddings_sample.shape}")
print(f"   Dimension: {embeddings_sample.shape[1]}")
print(f"   Memory: {embeddings_sample.nbytes / 1e6:.2f} MB")

## 3. Test HDBSCAN Parameters

In [None]:
def test_single_configuration(embeddings, min_cluster_size, min_samples, metric):
    """
    Test a single HDBSCAN parameter configuration.
    
    Returns dict with results.
    """
    try:
        # Run HDBSCAN
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric=metric
        )
        labels = clusterer.fit_predict(embeddings)
        
        # Calculate metrics
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_outliers = np.sum(labels == -1)
        outlier_pct = n_outliers / len(labels) * 100
        
        # Silhouette score (only if we have at least 2 clusters and not all outliers)
        silhouette = None
        if n_clusters >= 2 and n_outliers < len(labels):
            mask = labels != -1
            if mask.sum() > 0:
                try:
                    silhouette = silhouette_score(embeddings[mask], labels[mask])
                except:
                    silhouette = None
        
        # Cluster sizes
        cluster_sizes = pd.Series(labels[labels != -1]).value_counts()
        avg_cluster_size = cluster_sizes.mean() if len(cluster_sizes) > 0 else 0
        min_size = cluster_sizes.min() if len(cluster_sizes) > 0 else 0
        max_size = cluster_sizes.max() if len(cluster_sizes) > 0 else 0
        
        return {
            'min_cluster_size': min_cluster_size,
            'min_samples': min_samples,
            'metric': metric,
            'n_clusters': n_clusters,
            'n_outliers': n_outliers,
            'outlier_pct': outlier_pct,
            'silhouette': silhouette,
            'avg_cluster_size': avg_cluster_size,
            'min_size': min_size,
            'max_size': max_size,
            'status': 'success'
        }
        
    except Exception as e:
        return {
            'min_cluster_size': min_cluster_size,
            'min_samples': min_samples,
            'metric': metric,
            'status': f'error: {str(e)}'
        }

print("‚úÖ Test function defined")

In [None]:
# Run all parameter combinations
param_grid = list(product(MIN_CLUSTER_SIZES, MIN_SAMPLES_LIST, METRICS))
total_tests = len(param_grid)

print(f"🔬 Testing {total_tests} parameter combinations...")
print(f"   min_cluster_size: {MIN_CLUSTER_SIZES}")
print(f"   min_samples: {MIN_SAMPLES_LIST}")
print(f"   metrics: {METRICS}")
print(f"\n{'='*80}\n")

results = []

for test_num, (min_cluster_size, min_samples, metric) in enumerate(param_grid, start=1):
    print(f"[{test_num:2d}/{total_tests}] Testing: min_cluster_size={min_cluster_size:3d}, min_samples={min_samples:2d}, metric={metric}")

    result = test_single_configuration(embeddings_sample, min_cluster_size, min_samples, metric)
    results.append(result)

    if result.get('status') == 'success':
        # Compare against None explicitly: a score of exactly 0.0 is a valid
        # silhouette and must not be reported as N/A.
        silhouette_str = f"{result['silhouette']:.3f}" if result['silhouette'] is not None else 'N/A'
        print(f"         → {result['n_clusters']:3d} clusters, {result['n_outliers']:5,} outliers ({result['outlier_pct']:5.1f}%), silhouette={silhouette_str}")
    else:
        print(f"         ❌ {result['status']}")

# Convert to DataFrame, keeping only the runs that completed
results_df = pd.DataFrame(results)
results_df = results_df[results_df['status'] == 'success'].drop(columns='status')

print(f"\n{'='*80}")
print(f"✅ Testing complete! {len(results_df)} successful tests")

## 4. Analyze Results

In [None]:
# Display full results table
print("📊 All Results:\n")

# DataFrame.round with a per-column mapping returns a new frame (results_df is
# left untouched) and ignores any listed column that is not present.
display_df = results_df.round({
    'silhouette': 3,
    'outlier_pct': 1,
    'avg_cluster_size': 1,
})

display_df

In [None]:
# Top configurations by different metrics
def _ratio_to_max(values):
    """Scale a Series by its maximum; all zeros if that maximum is missing or not positive."""
    peak = values.max()
    if pd.isna(peak) or peak <= 0:
        # Dividing by zero (or a negative peak) would yield NaN / inverted
        # rankings and break the later idxmax on balance_score.
        return pd.Series(0.0, index=values.index)
    return values / peak


# Columns shown in every ranking table below.
summary_cols = ['min_cluster_size', 'min_samples', 'n_clusters', 'outlier_pct', 'silhouette', 'avg_cluster_size']

print("="*80)
print("TOP CONFIGURATIONS BY DIFFERENT METRICS")
print("="*80)

print("\n📈 Top 5 by Silhouette Score (higher is better):")
top_silhouette = results_df.nlargest(5, 'silhouette')[summary_cols]
print(top_silhouette.to_string(index=False))

print("\n\n📊 Top 5 by Number of Clusters (more granular topics):")
top_clusters = results_df.nlargest(5, 'n_clusters')[summary_cols]
print(top_clusters.to_string(index=False))

print("\n\n🎯 Lowest Outlier Percentage (more speeches clustered):")
low_outliers = results_df.nsmallest(5, 'outlier_pct')[summary_cols]
print(low_outliers.to_string(index=False))

print("\n\n⚖️  Balanced Configurations (moderate clusters, low outliers, good silhouette):")
# Weighted score on max-normalised metrics: 40% silhouette, 40% share of
# points clustered (1 - relative outlier rate), 20% cluster count.
# NOTE: the column is written back onto results_df because the recommendation
# and save cells read 'balance_score' from it.
results_df['balance_score'] = (
    _ratio_to_max(results_df['silhouette'].fillna(0)) * 0.4 +
    (1 - _ratio_to_max(results_df['outlier_pct'])) * 0.4 +
    _ratio_to_max(results_df['n_clusters']) * 0.2
)
balanced = results_df.nlargest(5, 'balance_score')[summary_cols]
print(balanced.to_string(index=False))

## 5. Visualize Results

In [None]:
# Create comprehensive visualization: one panel per metric, each plotted
# against min_cluster_size with one line per min_samples value.
# (results_df column, y-axis label, panel title), in row-major panel order.
panels = [
    ('n_clusters', 'Number of Clusters', 'Number of Clusters vs min_cluster_size'),
    ('outlier_pct', 'Outlier Percentage (%)', 'Outlier Percentage vs min_cluster_size'),
    ('silhouette', 'Silhouette Score', 'Silhouette Score vs min_cluster_size (higher is better)'),
    ('avg_cluster_size', 'Average Cluster Size', 'Average Cluster Size vs min_cluster_size'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

for ax, (column, ylabel, title) in zip(axes.flat, panels):
    # Skip rows with no value for this metric (silhouette is undefined for
    # configurations that produced fewer than two clusters).
    plottable = results_df[results_df[column].notna()]
    for min_samples in sorted(plottable['min_samples'].unique()):
        data = plottable[plottable['min_samples'] == min_samples].sort_values('min_cluster_size')
        ax.plot(data['min_cluster_size'], data[column], marker='o', linewidth=2, label=f'min_samples={min_samples}')
    ax.set_xlabel('min_cluster_size', fontsize=11)
    ax.set_ylabel(ylabel, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(OUTPUT_PLOT, dpi=150, bbox_inches='tight')
print(f"💾 Saved visualization to: {OUTPUT_PLOT}")
plt.show()

## 6. Parameter Recommendations

In [None]:
print("="*80)
print("PARAMETER RECOMMENDATIONS")
print("="*80)

# Find best overall configuration: the row with the highest balance_score
# (that column is added to results_df by the "top configurations" cell).
best_idx = results_df['balance_score'].idxmax()
best_config = results_df.loc[best_idx]

print(f"\n🏆 RECOMMENDED CONFIGURATION (Best Balanced):")
print(f"   min_cluster_size = {int(best_config['min_cluster_size'])}")
print(f"   min_samples = {int(best_config['min_samples'])}")
print(f"   metric = '{best_config['metric']}'")
print(f"\n   Expected results:")
print(f"   - Number of clusters: {int(best_config['n_clusters'])}")
print(f"   - Outlier percentage: {best_config['outlier_pct']:.1f}%")
# pd.notna instead of truthiness: a missing score shows up as NaN (which is
# truthy and would print "nan"), while a real score of 0.0 is falsy.
if pd.notna(best_config['silhouette']):
    print(f"   - Silhouette score: {best_config['silhouette']:.3f}")
else:
    print("   - Silhouette score: N/A")
print(f"   - Average cluster size: {best_config['avg_cluster_size']:.0f}")

print("\n\n💡 GUIDELINES FOR ADJUSTING:")
print("\n   For MORE fine-grained topics:")
print("   → Use lower min_cluster_size (30-50)")
print("   → This creates more, smaller clusters")

print("\n   For FEWER, larger topics:")
print("   → Use higher min_cluster_size (100-150)")
print("   → This merges similar speeches into bigger clusters")

print("\n   For FEWER outliers:")
print("   → Decrease min_cluster_size AND min_samples")
print("   → More speeches will be assigned to clusters")

print("\n   For BETTER cluster quality:")
print("   → Choose parameters with higher silhouette score")
print("   → This indicates tighter, more separated clusters")

print("\n\n📝 NEXT STEPS:")
print("   1. Use the recommended parameters in the main clustering notebook")
print("   2. Run clustering on full dataset (27K speeches)")
print("   3. Inspect sample speeches from different clusters")
print("   4. Adjust parameters if needed and re-run")
print("\n" + "="*80)

## 7. Save Results

In [None]:
# Save detailed results to CSV
results_df.to_csv(OUTPUT_RESULTS, index=False)
print(f"💾 Saved detailed results to: {OUTPUT_RESULTS}")
print(f"\n📊 Results summary:")
print(f"   Total configurations tested: {len(results_df)}")
print(f"   Configurations with valid silhouette: {results_df['silhouette'].notna().sum()}")
# Report the size of the file on disk. Re-reading the CSV and summing
# memory_usage() measured the in-memory DataFrame, not the file.
print(f"   File size: {os.path.getsize(OUTPUT_RESULTS) / 1024:.2f} KB")