# Traditional Community Detection Methods

This notebook demonstrates traditional community detection methods and evaluates their performance against ground truth communities. We'll cover:

1. Loading previously created graphs
2. Applying various traditional community detection algorithms
3. Evaluating detection results
4. Comparing different methods
5. Visualizing detected communities

In [ ]:
import sys
import os
import numpy as np
import torch
import polars as pl
import rustworkx as rx
import networkx as nx  # Still needed for some visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pickle
import warnings
warnings.filterwarnings('ignore')

# Import directly from community_detection package
from community_detection.traditional_methods import (
    run_louvain, run_leiden, run_infomap, run_label_propagation,
    run_spectral_clustering, run_girvan_newman, evaluate_against_ground_truth,
    plot_communities, add_communities_to_graph, compare_methods, save_communities
)

# Import functions from data_prep for graph generation if needed
from community_detection.data_prep import generate_synthetic_graph

## 1. Load or Generate Test Graphs

First, let's load the synthetic SBM graph created in the previous notebook (cached at `data/sbm_graph.gpickle`), or generate and cache a new one if that file does not exist.

In [None]:
# Make sure the cache directory exists before reading from / writing to it
os.makedirs('data', exist_ok=True)

# Reuse the cached SBM graph when present; otherwise generate one and cache it
if os.path.exists('data/sbm_graph.gpickle'):
    print("Loading SBM graph from file...")
    # NOTE: pickle.load can execute arbitrary code. Only load cache files that
    # this project wrote itself, never a file obtained from an untrusted source.
    with open('data/sbm_graph.gpickle', 'rb') as f:
        G = pickle.load(f)
else:
    print("Generating a new SBM graph...")
    n_communities = 5
    G, _ = generate_synthetic_graph(
        'sbm',
        n_nodes=100,
        n_communities=n_communities,
        p_in=0.3,
        p_out=0.05
    )

    # Cache the generated graph so later runs work on the same graph
    with open('data/sbm_graph.gpickle', 'wb') as f:
        pickle.dump(G, f)
    print("Generated and saved new SBM graph.")

# Extract ground truth communities as {node index: community label}.
# Walk the graph's real node indices instead of range(len(G)): rustworkx does
# not guarantee contiguous indices (e.g. after a node removal), in which case
# range(len(G)) would query indices that no longer exist and miss valid ones.
ground_truth = {}
for node_idx in G.node_indices():
    node_data = G.get_node_data(node_idx)
    # Nodes without a payload or without a 'community' key are left out
    if node_data and 'community' in node_data:
        ground_truth[node_idx] = node_data['community']

# Visualize the graph with ground truth communities
plot_communities(G, ground_truth, title="Ground Truth Communities")

## 2. Apply Individual Community Detection Methods

Let's apply various traditional community detection methods one by one and evaluate their performance.

### 2.1 Louvain Method

In [None]:
# Detect communities with Louvain; the helper hands back the assignment
# together with the elapsed time
print("Running Louvain method...")
louvain_communities, louvain_time = run_louvain(G)

# Record the assignment on the graph under its own attribute name
G = add_communities_to_graph(G, louvain_communities, attr_name='louvain_community')

# Score the partition against the 'community' node attribute
louvain_metrics = evaluate_against_ground_truth(G, louvain_communities, 'community')

# Distinct community labels, counted once and reused below
n_louvain = len(set(louvain_communities.values()))

print(f"Execution time: {louvain_time:.4f} seconds")
print(f"Number of communities detected: {n_louvain}")
for label, key in (("NMI", 'nmi'), ("ARI", 'ari'), ("Modularity", 'modularity')):
    print(f"{label}: {louvain_metrics[key]:.4f}")

# Persist everything needed to reload this run later
os.makedirs('results', exist_ok=True)
louvain_result = dict(
    communities=louvain_communities,
    execution_time=louvain_time,
    metrics=louvain_metrics,
    num_communities=n_louvain,
)
with open('results/louvain_result.pkl', 'wb') as f:
    pickle.dump(louvain_result, f)

# Draw the detected partition
plot_communities(G, louvain_communities, title=f"Louvain Method - {n_louvain} communities")

### 2.2 Label Propagation

In [None]:
# Run Label Propagation; the helper returns the assignment and the elapsed time
print("Running Label Propagation method...")
lp_communities, lp_time = run_label_propagation(G)

# Add detected communities to the graph under a method-specific attribute
G = add_communities_to_graph(G, lp_communities, attr_name='lp_community')

# Evaluate against the 'community' ground-truth node attribute
lp_metrics = evaluate_against_ground_truth(G, lp_communities, 'community')

# Count distinct community labels once instead of recomputing for every use
lp_num_communities = len(set(lp_communities.values()))

print(f"Execution time: {lp_time:.4f} seconds")
print(f"Number of communities detected: {lp_num_communities}")
print(f"NMI: {lp_metrics['nmi']:.4f}")
print(f"ARI: {lp_metrics['ari']:.4f}")
print(f"Modularity: {lp_metrics['modularity']:.4f}")

# Save results. Create the output directory here as well so this cell does
# not depend on the Louvain cell having been run first.
os.makedirs('results', exist_ok=True)
lp_result = {
    'communities': lp_communities,
    'execution_time': lp_time,
    'metrics': lp_metrics,
    'num_communities': lp_num_communities
}
with open('results/label_propagation_result.pkl', 'wb') as f:
    pickle.dump(lp_result, f)

# Visualize detected communities
plot_communities(G, lp_communities, title=f"Label Propagation - {lp_num_communities} communities")

### 2.3 Spectral Clustering

In [None]:
# Spectral clustering needs the cluster count up front; use the number of
# distinct ground-truth labels collected when the graph was loaded
n_communities = len(set(ground_truth.values()))

# Run Spectral Clustering; the helper returns the assignment and the elapsed time
print(f"Running Spectral Clustering with {n_communities} clusters...")
spectral_communities, spectral_time = run_spectral_clustering(G, n_communities)

# Add detected communities to the graph under a method-specific attribute
G = add_communities_to_graph(G, spectral_communities, attr_name='spectral_community')

# Evaluate against the 'community' ground-truth node attribute
spectral_metrics = evaluate_against_ground_truth(G, spectral_communities, 'community')

# Count distinct community labels once instead of recomputing for every use
spectral_num_communities = len(set(spectral_communities.values()))

print(f"Execution time: {spectral_time:.4f} seconds")
print(f"Number of communities detected: {spectral_num_communities}")
print(f"NMI: {spectral_metrics['nmi']:.4f}")
print(f"ARI: {spectral_metrics['ari']:.4f}")
print(f"Modularity: {spectral_metrics['modularity']:.4f}")

# Save results. Create the output directory here as well so this cell does
# not depend on the Louvain cell having been run first.
os.makedirs('results', exist_ok=True)
spectral_result = {
    'communities': spectral_communities,
    'execution_time': spectral_time,
    'metrics': spectral_metrics,
    'num_communities': spectral_num_communities
}
with open('results/spectral_clustering_result.pkl', 'wb') as f:
    pickle.dump(spectral_result, f)

# Visualize detected communities
plot_communities(G, spectral_communities,
               title=f"Spectral Clustering - {spectral_num_communities} communities")

### 2.4 Infomap (if available)

In [None]:
# Probe for cdlib: the Infomap step is skipped when it cannot be imported.
# CDLIB_AVAILABLE is also read by the Leiden cell further down.
try:
    from cdlib import algorithms
    CDLIB_AVAILABLE = True
except ImportError:
    CDLIB_AVAILABLE = False
    print("cdlib not available. Skipping Infomap.")

if CDLIB_AVAILABLE:
    # Run Infomap; the helper returns the assignment and the elapsed time
    print("Running Infomap method...")
    infomap_communities, infomap_time = run_infomap(G)

    # Add detected communities to the graph under a method-specific attribute
    G = add_communities_to_graph(G, infomap_communities, attr_name='infomap_community')

    # Evaluate against the 'community' ground-truth node attribute
    infomap_metrics = evaluate_against_ground_truth(G, infomap_communities, 'community')

    # Count distinct community labels once instead of recomputing for every use
    infomap_num_communities = len(set(infomap_communities.values()))

    print(f"Execution time: {infomap_time:.4f} seconds")
    print(f"Number of communities detected: {infomap_num_communities}")
    print(f"NMI: {infomap_metrics['nmi']:.4f}")
    print(f"ARI: {infomap_metrics['ari']:.4f}")
    print(f"Modularity: {infomap_metrics['modularity']:.4f}")

    # Save results. Create the output directory here as well so this cell
    # does not depend on the Louvain cell having been run first.
    os.makedirs('results', exist_ok=True)
    infomap_result = {
        'communities': infomap_communities,
        'execution_time': infomap_time,
        'metrics': infomap_metrics,
        'num_communities': infomap_num_communities
    }
    with open('results/infomap_result.pkl', 'wb') as f:
        pickle.dump(infomap_result, f)

    # Visualize detected communities
    plot_communities(G, infomap_communities,
                  title=f"Infomap - {infomap_num_communities} communities")

### 2.5 Leiden (if available)

In [None]:
# Leiden is gated on the same cdlib probe as Infomap; CDLIB_AVAILABLE is set
# by the Infomap cell, which therefore has to run before this one
if CDLIB_AVAILABLE:
    # Run Leiden; the helper returns the assignment and the elapsed time
    print("Running Leiden method...")
    leiden_communities, leiden_time = run_leiden(G)

    # Add detected communities to the graph under a method-specific attribute
    G = add_communities_to_graph(G, leiden_communities, attr_name='leiden_community')

    # Evaluate against the 'community' ground-truth node attribute
    leiden_metrics = evaluate_against_ground_truth(G, leiden_communities, 'community')

    # Count distinct community labels once instead of recomputing for every use
    leiden_num_communities = len(set(leiden_communities.values()))

    print(f"Execution time: {leiden_time:.4f} seconds")
    print(f"Number of communities detected: {leiden_num_communities}")
    print(f"NMI: {leiden_metrics['nmi']:.4f}")
    print(f"ARI: {leiden_metrics['ari']:.4f}")
    print(f"Modularity: {leiden_metrics['modularity']:.4f}")

    # Save results. Create the output directory here as well so this cell
    # does not depend on the Louvain cell having been run first.
    os.makedirs('results', exist_ok=True)
    leiden_result = {
        'communities': leiden_communities,
        'execution_time': leiden_time,
        'metrics': leiden_metrics,
        'num_communities': leiden_num_communities
    }
    with open('results/leiden_result.pkl', 'wb') as f:
        pickle.dump(leiden_result, f)

    # Visualize detected communities
    plot_communities(G, leiden_communities,
                  title=f"Leiden - {leiden_num_communities} communities")
else:
    # Say so explicitly, mirroring the Infomap cell, rather than producing no output
    print("cdlib not available. Skipping Leiden.")

## 3. Compare All Methods

Now let's run a comparison of all the traditional methods to see how they perform against each other.

In [None]:
# Cluster count handed to compare_methods: the number of distinct
# ground-truth labels collected when the graph was loaded
n_clusters = len(set(ground_truth.values()))

# Run all methods through the shared comparison helper
print("Comparing different community detection methods...")
results_df = compare_methods(G, ground_truth_attr='community', n_clusters=n_clusters)

# Display results
print("\nComparison Results:")
print(results_df)

# Save the comparison results. Create the output directory first so this cell
# does not depend on an earlier cell having created it.
os.makedirs('results', exist_ok=True)
results_df.write_parquet('results/traditional_methods_comparison.parquet', compression="zstd")

## 4. Visualize Comparison Results

Let's create some visualizations to better understand the comparison results.

In [None]:
# Convert to pandas for its plotting API; results_pd is reused by the scatter
# plot in the next cell
results_pd = results_df.to_pandas()

# Two panels side by side: quality scores on the left, runtime on the right
fig, (ax_quality, ax_time) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: NMI and ARI per method
results_pd.plot(x='Method', y=['NMI', 'ARI'], kind='bar', ax=ax_quality)
ax_quality.set_title('Quality Metrics by Method')
ax_quality.set_ylabel('Score')
ax_quality.tick_params(axis='x', labelrotation=45)
# ARI can be negative (worse than chance), so a fixed lower limit of 0 would
# hide those bars. Extend the axis downwards only when the data requires it.
lowest_score = float(results_pd[['NMI', 'ARI']].min().min())
ax_quality.set_ylim(min(0.0, lowest_score), 1)
ax_quality.grid(alpha=0.3)

# Right panel: execution time per method
results_pd.plot(x='Method', y='Execution Time (s)', kind='bar', ax=ax_time, color='green')
ax_time.set_title('Execution Time by Method')
ax_time.set_ylabel('Time (seconds)')
ax_time.tick_params(axis='x', labelrotation=45)
ax_time.grid(alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Quality/cost trade-off: one marker per method, with marker area scaled by
# the number of communities that method reported
fig, ax = plt.subplots(figsize=(10, 6))
exec_times = results_pd['Execution Time (s)']
nmi_scores = results_pd['NMI']
marker_sizes = results_pd['Num Communities'] * 20
ax.scatter(exec_times, nmi_scores, s=marker_sizes, alpha=0.7)

# Label every marker with its method name, centred 10 points above the point
for method, exec_time, nmi in zip(results_pd['Method'], exec_times, nmi_scores):
    ax.annotate(
        method,
        (exec_time, nmi),
        textcoords="offset points",
        xytext=(0, 10),
        ha='center',
    )

ax.set_title('NMI vs. Execution Time')
ax.set_xlabel('Execution Time (seconds)')
ax.set_ylabel('NMI')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

## 5. Summary and Conclusions

In this notebook, we have:

1. Applied various traditional community detection methods to our test graph
   - Louvain
   - Label Propagation
   - Spectral Clustering
   - Infomap (if available)
   - Leiden (if available)
2. Evaluated each method using NMI, ARI, and modularity
3. Compared the methods in terms of quality and performance
4. Visualized the detected communities and comparison results

Based on the results, we can see which methods perform best for our test graph in terms of both accuracy and computational efficiency. This information will be valuable for comparing with the GNN-based approaches in the next notebooks.