# Advanced Clustering and Semantic Naming

This notebook performs sophisticated clustering and generates meaningful cluster names:
1. **HDBSCAN-kNN Clustering**: Advanced clustering with noise refinement
2. **Density Analysis**: Classify points by local density patterns
3. **LLM-Based Naming**: Generate descriptive cluster names using GPT-4o-mini
4. **Hierarchical Organization**: Create multi-level cluster taxonomy

**Input**: Papers with embeddings and initial clustering  
**Output**: Named clusters organized in hierarchical structure

## 1. Setup and Data Loading

In [None]:
import json
import os
import time

import hdbscan
import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from tqdm import tqdm

# Configuration
COORD_FILE = '/content/drive/MyDrive/100-5D_UMAP_Coordinates.pkl'
MAPPING_FILE = '/content/drive/MyDrive/research_map_coordinates_for_mapping_africa.csv'
TARGET_DIM = 'coords_30d'  # column of df_coords holding one coordinate vector per paper
MIN_CLUSTER_SIZE = 10
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# written into the notebook; the placeholder below is kept as the fallback so the
# "key not set" check in the naming step keeps working.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "**input your api key")
SEED = 42

# Load datasets and join them on the paper identifier (inner join: only papers
# present in both files are kept)
df_coords = pd.read_pickle(COORD_FILE)
df_map = pd.read_csv(MAPPING_FILE)
df = pd.merge(df_coords, df_map, on='EID')

# Prepare coordinates: stack the per-row vectors into one (n_papers, n_dims) matrix
X_30d = np.vstack(df[TARGET_DIM].values).astype('float32')
df['umapX'] = df['umap_2d_x']
df['umapY'] = df['umap_2d_y']

print(f"üìä Dataset loaded: {df.shape}")
print(f"üßÆ Coordinate matrix: {X_30d.shape}")

## 2. HDBSCAN-kNN Clustering

Run HDBSCAN, then reassign the points it labels as noise to an existing cluster using a distance-weighted k-nearest-neighbors classifier fitted on the clustered points.

In [None]:
def perform_hdbscan_knn_clustering(X, min_cluster_size=10):
    """Cluster X with HDBSCAN, then reassign its noise points with kNN.

    HDBSCAN is run first. Every point it labels as noise (-1) is then assigned
    to one of the discovered clusters by a distance-weighted k-nearest-neighbors
    classifier fitted on the clustered ("core") points. If HDBSCAN finds no
    cluster at all, or no noise at all, the labels are returned unchanged.

    Args:
        X: Feature matrix with one row per point. It is indexed with boolean
            masks, so a NumPy array is expected.
        min_cluster_size: Forwarded to ``hdbscan.HDBSCAN``.

    Returns:
        A tuple ``(final_labels, cluster_spectrum, initial_labels)``:
        ``final_labels`` are the labels after kNN refinement,
        ``cluster_spectrum`` is a list with one string per point
        (``"Clust<label>(100%)"`` for clustered points; for reassigned noise
        points, up to three ``"Clust<label>(<pct>%)"`` entries with
        probability >= 5%; ``""`` for noise points that were not refined), and
        ``initial_labels`` are the raw HDBSCAN labels (noise is -1).
    """
    max_neighbors = 15

    print("üîÑ Running initial HDBSCAN clustering...")
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        gen_min_span_tree=True
    )
    initial_labels = clusterer.fit_predict(X)

    # Analyze initial results
    noise_mask = (initial_labels == -1)
    core_mask = ~noise_mask
    n_clusters = len(set(initial_labels)) - (1 if noise_mask.any() else 0)
    noise_ratio = noise_mask.mean()

    print(f"üìä Initial clustering results:")
    print(f"   üè∑Ô∏è  Clusters found: {n_clusters}")
    print(f"   üîá Noise ratio: {noise_ratio:.1%}")

    # Clustered points keep their label with full confidence; noise points
    # start with an empty spectrum and are filled in below when refined.
    final_labels = initial_labels.copy()
    cluster_spectrum = [
        f"Clust{label}(100%)" if label != -1 else ''
        for label in initial_labels
    ]

    if noise_mask.any() and core_mask.any():
        print("üîÑ Refining noise points with k-nearest neighbors...")

        # The classifier cannot query more neighbours than it was fitted on,
        # so cap k by the number of clustered points.
        n_neighbors = min(max_neighbors, int(core_mask.sum()))
        knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
        knn.fit(X[core_mask], initial_labels[core_mask])

        # Predict cluster assignments (and class probabilities) for noise points
        noise_points = X[noise_mask]
        final_labels[noise_mask] = knn.predict(noise_points)
        prediction_probs = knn.predict_proba(noise_points)

        # Confidence spectrum: the three most probable clusters, most probable
        # first, dropping candidates below 5%.
        classes = knn.classes_
        noise_indices = np.flatnonzero(noise_mask)
        for point_idx, probs in zip(noise_indices, prediction_probs):
            top_3 = probs.argsort()[-3:][::-1]
            cluster_spectrum[point_idx] = ", ".join(
                f"Clust{classes[idx]}({probs[idx]*100:.0f}%)"
                for idx in top_3 if probs[idx] >= 0.05
            )

    refined_noise_ratio = (final_labels == -1).mean()
    print(f"‚úÖ Refinement complete. Noise reduced to: {refined_noise_ratio:.1%}")

    return final_labels, cluster_spectrum, initial_labels

# Run HDBSCAN + kNN refinement once, then attach its three per-point outputs
clustering_output = perform_hdbscan_knn_clustering(X_30d, MIN_CLUSTER_SIZE)
cluster_labels, spectrum, original_labels = clustering_output

for column_name, column_values in (
    ('cluster_label', cluster_labels),
    ('cluster_spectrum', spectrum),
    ('original_label', original_labels),
):
    df[column_name] = column_values

## 3. Density Analysis and Classification

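Score each point's local density from its k-nearest-neighbor distances, then use two knee-point thresholds on the noise points' densities to split them into `core`, `intermediate`, and `outlier` density classes.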

In [None]:
def analyze_density_patterns(X, cluster_labels, k=16):
    """Score local density for every point and grade the noise points.

    Each point's density score is the sum of inverse distances to its nearest
    neighbours. Points labelled -1 in ``cluster_labels`` are then graded with
    two thresholds taken from knee points of their sorted log-density curve:
    ``'outlier'`` (density <= T2), ``'intermediate'`` (density <= T1) or
    ``'core'`` (above T1). Points with any other label are always ``'core'``.
    When noise points exist, the threshold curve is also plotted.

    Args:
        X: Feature matrix with one row per point.
        cluster_labels: One label per row of ``X``; -1 marks noise.
        k: Number of neighbours requested per point (the first returned
            neighbour is skipped). Capped at the number of rows in ``X``.

    Returns:
        A tuple ``(density_scores, density_classes)``: a NumPy array of scores
        and a list of class names, both aligned with the rows of ``X``.
    """

    print("üîÑ Calculating local density scores...")

    # kneighbors raises when asked for more neighbours than fitted samples,
    # so cap k for small inputs.
    n_neighbors = min(k, len(X))
    nn = NearestNeighbors(n_neighbors=n_neighbors).fit(X)
    distances, _ = nn.kneighbors(X)

    # Density score: sum of inverse neighbour distances (epsilon avoids division
    # by zero). The first column is skipped; when querying the fitted points
    # themselves it is normally the point itself at distance 0.
    density_scores = np.sum(1.0 / (distances[:, 1:] + 1e-5), axis=1)

    def find_knee_point(values):
        """Return the index of the point farthest from the first-to-last chord."""
        # A single point has no chord (zero-length direction vector would
        # produce NaNs); index 0 is the only possible answer.
        if len(values) < 2:
            return 0

        x = np.arange(len(values))
        y = values

        # Unit vector from first to last point
        line_vec = np.array([x[-1] - x[0], y[-1] - y[0]])
        line_vec = line_vec / np.linalg.norm(line_vec)

        # Perpendicular distance of every point to that line
        points = np.vstack((x - x[0], y - y[0])).T
        projections = np.outer(np.dot(points, line_vec), line_vec)
        distances_to_line = np.linalg.norm(points - projections, axis=1)

        return np.argmax(distances_to_line)

    # Only noise points are graded; everything else stays 'core'
    noise_mask = (np.asarray(cluster_labels) == -1)
    density_classes = ['core'] * len(cluster_labels)

    if noise_mask.any():
        noise_densities = density_scores[noise_mask]
        sorted_densities = np.sort(noise_densities)[::-1]  # densest first
        log_densities = np.log(sorted_densities + 1e-9)

        # First knee on the whole curve, second knee on the tail after it
        knee1_idx = find_knee_point(log_densities)
        knee2_idx = knee1_idx + find_knee_point(log_densities[knee1_idx:])

        threshold1 = np.exp(log_densities[knee1_idx])
        threshold2 = np.exp(log_densities[knee2_idx])

        print(f"üìà Density thresholds: T1={threshold1:.2f}, T2={threshold2:.2f}")

        # Classify density levels. 'intermediate' is written first so that the
        # stricter 'outlier' rule (T2 <= T1) overrides it where both apply.
        graded = np.array(density_classes, dtype=object)
        graded[noise_mask & (density_scores <= threshold1)] = 'intermediate'
        graded[noise_mask & (density_scores <= threshold2)] = 'outlier'
        density_classes = graded.tolist()

        # Visualization
        plt.figure(figsize=(10, 4))
        plt.plot(log_densities, label='Log Density')
        plt.axhline(np.log(threshold1), color='orange', linestyle='--', label=f'T1: {threshold1:.2f}')
        plt.axhline(np.log(threshold2), color='red', linestyle='--', label=f'T2: {threshold2:.2f}')
        plt.title('Density Threshold Detection')
        plt.xlabel('Sorted Points')
        plt.ylabel('Log Density')
        plt.legend()
        plt.show()

    return density_scores, density_classes

# Perform density analysis.
# Use the raw HDBSCAN labels here: the kNN refinement reassigns every noise
# point whenever at least one cluster exists, so the refined `cluster_labels`
# normally contain no -1 entries and the noise-only density classification
# (and its plot) would never run.
density_scores, density_classes = analyze_density_patterns(X_30d, original_labels)

df['density_score'] = density_scores
df['density_class'] = density_classes

print("‚úÖ Density analysis complete")

## 4. Merge Paper Metadata

In [None]:
# Load cleaned paper data with titles and abstracts
paper_files = [
    "/content/MUSearch/Cleaned Data/2021_cleaned_data.csv",
    "/content/MUSearch/Cleaned Data/2022_cleaned_data.csv",
    "/content/MUSearch/Cleaned Data/2023_cleaned_data.csv",
    "/content/MUSearch/Cleaned Data/2024_cleaned_data.csv",
    "/content/MUSearch/Cleaned Data/2025_cleaned_data.csv"
]

# Missing yearly files are skipped rather than treated as an error
paper_dataframes = [
    pd.read_csv(file_path)
    for file_path in paper_files
    if os.path.exists(file_path)
]

if paper_dataframes:
    merged_papers = pd.concat(paper_dataframes, ignore_index=True)

    # Keep one metadata row per EID: a paper listed in more than one yearly
    # file would otherwise duplicate its row in the left merge below.
    paper_metadata = merged_papers[
        ['EID', 'Title', 'Year', 'Abstract', 'Authors']
    ].drop_duplicates(subset='EID')

    # Merge with clustering results (left join keeps every clustered paper)
    df = df.merge(paper_metadata, on='EID', how='left')

    print(f"üìÑ Merged paper metadata: {len(merged_papers)} papers")
else:
    print("‚ö†Ô∏è  Paper metadata files not found")

# Save intermediate results
df.to_csv("clustering_intermediate_results.csv", index=False)
print("üíæ Saved intermediate clustering results")

## 5. Generate Cluster Names with LLM

Use GPT-4o-mini to generate descriptive names for each cluster based on representative papers.

In [None]:
def get_representative_papers(cluster_df, n_papers=20):
    """Return the EIDs of the most representative papers of one cluster.

    A paper's score is ``(1 - normalized distance to the cluster centroid) +
    normalized density_score``, both min-max scaled within the cluster, so
    papers that are close to the centroid and in dense regions rank highest.

    Args:
        cluster_df: DataFrame holding one cluster, with columns ``EID``,
            ``umap_3d_x``, ``umap_3d_y``, ``umap_3d_z`` and ``density_score``.
        n_papers: Maximum number of EIDs to return.

    Returns:
        A list of at most ``n_papers`` EIDs, best first. Empty when the cluster
        is empty or ``n_papers`` is not positive.
    """
    # Guard n_papers <= 0 explicitly: a `[-0:]` slice would select every row.
    if len(cluster_df) == 0 or n_papers <= 0:
        return []

    def _min_max(values):
        """Scale to [0, 1]; constant input maps to all zeros."""
        low, high = values.min(), values.max()
        if high > low:
            return (values - low) / (high - low)
        return np.zeros_like(values, dtype=float)

    # Distance of every paper to the centroid of the 3D coordinates
    coords = cluster_df[['umap_3d_x', 'umap_3d_y', 'umap_3d_z']].to_numpy()
    centroid = coords.mean(axis=0, keepdims=True)
    distances = cdist(coords, centroid, metric='euclidean').ravel()

    # Work on plain arrays so scores are positional, independent of the
    # DataFrame's index labels.
    densities = cluster_df['density_score'].to_numpy(dtype=float)

    # Closer to centroid + higher density = better representative
    representative_scores = (1.0 - _min_max(distances)) + _min_max(densities)

    # Highest scores first
    top_positions = np.argsort(representative_scores)[-n_papers:][::-1]
    return cluster_df['EID'].iloc[top_positions].tolist()

def generate_cluster_name_description(abstracts, client):
    """Ask the LLM for a name and one-sentence description of a cluster.

    Args:
        abstracts: List of abstract strings; only the first 20 are sent.
        client: Object exposing ``chat.completions.create(...)`` (an OpenAI
            client is passed by the caller in this notebook).

    Returns:
        A dict that always contains non-empty string values for ``"name"`` and
        ``"description"``. If the request or JSON parsing fails, or the reply
        is not a JSON object, both are fallbacks; if the reply is an object
        that lacks one of the keys (or has an empty/non-string value), only
        that key falls back. Any other keys in the reply are kept.
    """
    fallback = {"name": "Unknown Cluster", "description": "Description not available."}

    context = "\n---\n".join(abstracts[:20])

    prompt = f"""
    Analyze these research paper abstracts from a single cluster:
    
    {context}
    
    Generate:
    1. A concise, technical cluster name (max 9 words)
    2. A one-sentence description of the research scope/methodology
    
    Return only a JSON object: {{"name": "...", "description": "..."}}
    """

    # Best-effort by design: any API or parsing failure is reported and
    # replaced by the fallback so one bad cluster does not stop the run.
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
            temperature=0.3
        )
        parsed = json.loads(response.choices[0].message.content)
    except Exception as e:
        print(f"LLM error: {e}")
        return dict(fallback)

    if not isinstance(parsed, dict):
        print(f"LLM error: expected a JSON object, got {type(parsed).__name__}")
        return dict(fallback)

    # Callers index result['name'] / result['description'] directly, so make
    # sure both are present and usable.
    result = dict(parsed)
    for key, default in fallback.items():
        value = result.get(key)
        if not isinstance(value, str) or not value.strip():
            result[key] = default
    return result

def process_cluster_naming(df):
    """Generate an LLM name and description for every cluster in ``df``.

    Clusters are the groups of the ``'5L_Layer_5_Final'`` column. For each one,
    the abstracts of its most representative papers are sent to the LLM and
    the result is written to two new columns, ``cluster_name`` and
    ``cluster_description``. Clusters without any usable abstract get NaN in
    both columns.

    Args:
        df: DataFrame with columns ``'5L_Layer_5_Final'``, ``EID``,
            ``Abstract`` and those required by ``get_representative_papers``.

    Returns:
        The same DataFrame object, modified in place. When no API key is
        configured, it is returned unchanged (without the two new columns).
    """
    if not OPENAI_API_KEY or OPENAI_API_KEY == "**input your api key":
        print("‚ö†Ô∏è  Please set your OpenAI API key")
        return df

    client = openai.OpenAI(api_key=OPENAI_API_KEY)

    # Group by cluster
    cluster_groups = df.groupby('5L_Layer_5_Final')
    # Plain dicts keyed by cluster id: these stay valid even when no cluster
    # could be named (an empty DataFrame would have no 'cluster_id' column).
    name_mapping = {}
    desc_mapping = {}

    print(f"üè∑Ô∏è  Processing {len(cluster_groups)} clusters for naming...")

    for cluster_id, cluster_group in tqdm(cluster_groups, desc="Naming clusters"):
        if pd.isna(cluster_id):
            continue

        # Get representative papers
        representative_eids = get_representative_papers(cluster_group)

        # Extract abstracts from this cluster's own rows; scanning the full
        # frame per cluster is unnecessary and could pull in rows from other
        # clusters if an EID were duplicated.
        is_representative = cluster_group['EID'].isin(representative_eids)
        abstracts = cluster_group.loc[is_representative, 'Abstract'].dropna().tolist()

        if not abstracts:
            continue

        # Generate name and description
        metadata = generate_cluster_name_description(abstracts, client)
        name_mapping[cluster_id] = metadata.get('name', 'Unknown Cluster')
        desc_mapping[cluster_id] = metadata.get('description', 'Description not available.')

        time.sleep(1.0)  # Rate limiting: only needed after an actual API call

    # Apply names to dataframe
    df['cluster_name'] = df['5L_Layer_5_Final'].map(name_mapping)
    df['cluster_description'] = df['5L_Layer_5_Final'].map(desc_mapping)

    print(f"‚úÖ Generated names for {len(name_mapping)} clusters")
    return df

# Attach LLM-generated cluster names and descriptions to the papers table
df = process_cluster_naming(df=df)

## 6. Save Final Results

In [None]:
# Save complete results
df.to_csv('final_clustered_papers_with_names.csv', index=False)
df.to_pickle('final_clustered_papers_with_names.pkl')
output_files = [
    'final_clustered_papers_with_names.csv',
    'final_clustered_papers_with_names.pkl',
]

print("üìä Final Clustering Summary:")
print(f"   üìÑ Total papers processed: {len(df)}")

# The naming step returns the frame without these columns when no API key is
# configured; skip the per-name statistics instead of failing on the groupby.
if {'cluster_name', 'cluster_description'}.issubset(df.columns):
    # Generate summary statistics (one row per distinct cluster name)
    cluster_stats = (
        df.groupby('cluster_name')
        .agg(
            paper_count=('EID', 'count'),
            density_score=('density_score', 'mean'),
            cluster_description=('cluster_description', 'first'),
        )
        .sort_values('paper_count', ascending=False)
    )

    print(f"   üè∑Ô∏è  Named clusters: {df['cluster_name'].nunique()}")
    print(f"   üìà Largest cluster: {cluster_stats['paper_count'].max()} papers")
    print(f"   üìâ Average cluster size: {cluster_stats['paper_count'].mean():.1f} papers")

    # Save cluster statistics
    cluster_stats.to_csv('cluster_statistics.csv')
    output_files.append('cluster_statistics.csv')
else:
    print("   Cluster names are not available; cluster statistics were not generated")

print("\n‚úÖ All files saved successfully")
print("üìÅ Output files:")
for output_file in output_files:
    print(f"   - {output_file}")