In [28]:
import os
import json
import torch
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
from cdlib import evaluation, algorithms
from torch_geometric.utils import to_networkx
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [29]:
# Additional metrics
def compute_internal_density(G, nodes):
    """Return the edge density of the subgraph of ``G`` induced by ``nodes``.

    The density is ``2 * m / (n * (n - 1))``, where ``n`` and ``m`` are the
    node and edge counts of ``G.subgraph(nodes)``.

    Args:
        G: Graph object exposing ``subgraph()`` (a NetworkX graph at the
            call site in this notebook).
        nodes: Iterable of node identifiers selecting the subgraph.

    Returns:
        float: The density, or ``0.0`` when the induced subgraph has fewer
        than two nodes (the denominator would otherwise be zero).
    """
    induced = G.subgraph(nodes)
    node_count = len(induced.nodes())

    # No node pair exists with zero or one node, so density is defined as 0.
    if node_count <= 1:
        return 0.0

    edge_count = len(induced.edges())
    ordered_pairs = node_count * (node_count - 1)
    return (2 * edge_count) / ordered_pairs

def compute_edges_cut(G, nodes):
    """Count the edges inside a node set and the edges leaving it.

    Args:
        G: Graph object exposing ``neighbors(u)`` (a NetworkX graph at the
            call site in this notebook).
        nodes: Iterable of node identifiers forming the community. Repeated
            identifiers are ignored so that no edge is counted twice.

    Returns:
        tuple[int, int]: ``(edges_inside, edges_cut)`` where ``edges_inside``
        is the number of edges with both endpoints in ``nodes`` (self-loops
        count as one edge each) and ``edges_cut`` is the number of edges with
        exactly one endpoint in ``nodes``.
    """
    members = set(nodes)
    internal_endpoints = 0  # every non-loop internal edge is seen from both ends
    self_loops = 0
    edges_cut = 0

    # Iterate the de-duplicated set: walking the raw sequence would count the
    # edges of a repeated node once per repetition.
    for u in members:
        for v in G.neighbors(u):
            if v == u:
                # A self-loop shows up only once in the adjacency of u, so it
                # must not go through the halving applied to regular edges.
                self_loops += 1
            elif v in members:
                internal_endpoints += 1
            else:
                edges_cut += 1

    edges_inside = internal_endpoints // 2 + self_loops
    return edges_inside, edges_cut

In [30]:
# Load the raw edge list (CSV) from the project's data directory.
edges_path = os.path.join('..', '..', 'src', 'data', 'edges.csv')
edges_df = pd.read_csv(edges_path)

# Directory where the Infomap results are written; created if missing.
save_dir = os.path.join('..', '..', 'src', 'graph_dir', 'infomap_dir')
os.makedirs(save_dir, exist_ok=True)

# NOTE: despite its name, graph_dir is the path of the serialized file
# graph_data.pt, not a directory.
graph_dir = os.path.join('..','..', 'src', 'data', 'graph_data.pt')
# NOTE(review): weights_only=False makes torch.load unpickle arbitrary Python
# objects, which can execute code; only load files from a trusted source.
data = torch.load(graph_dir, weights_only=False)
graph = data['data']
# presumably maps node indices to user identifiers -- verify against the
# code that wrote graph_data.pt
mapping = data['idx2user']

# Convert the loaded graph object into an undirected NetworkX graph.
G_nx = to_networkx(graph, to_undirected=True)

# Community detection with Infomap
communities = algorithms.infomap(G_nx)
print(f"Detected {len(communities.communities)} communities with Infomap")

# Global metrics: modularity of the whole partition, plus conductance
# requested with summary=False (presumably one value per community, in the
# same order as communities.communities -- confirm in the cdlib docs).
modularity_score = evaluation.newman_girvan_modularity(G_nx, communities).score
conductances = evaluation.conductance(G_nx, communities, summary=False)

Detected 37 communities with Infomap


In [31]:
# Strength labels assigned to the KMeans groups, ordered weakest -> strongest.
CATEGORY_NAMES = ["Weak", "Moderate", "Strong", "Very Strong"]

# Build one record per detected community with its structural metrics.
records = []
for k, (cluster, cond_score) in enumerate(zip(communities.communities, conductances)):
    num_users = len(cluster)
    density = compute_internal_density(G_nx, cluster)
    edges_in, edges_cut = compute_edges_cut(G_nx, cluster)

    # cdlib only gives a modularity score for the whole partition, so a local
    # proxy is used instead: the share of the community's edges that stay
    # inside it. Despite the name, this is an internal-edge ratio, not a true
    # modularity gain. The epsilon avoids 0/0 for a community without edges.
    modularity_gain = edges_in / (edges_in + edges_cut + 1e-9)

    records.append({
        "id": k,
        "users": [mapping[node_id] for node_id in cluster],
        "num_users": num_users,
        "density": density,
        "conductance": cond_score,
        "modularity_gain": modularity_gain,
        "edges_inside": edges_in,
        "edges_cut": edges_cut
    })

df = pd.DataFrame(records)

# Flip conductance so that, like the other features, higher means "stronger".
df["inv_conductance"] = 1 - df["conductance"]

# Standardize the features so that num_users (unbounded) does not dominate
# the ratio-valued features in the Euclidean distances used by KMeans.
features = df[["density", "num_users", "modularity_gain", "inv_conductance"]].copy()
scaler = StandardScaler()
X = scaler.fit_transform(features)

# Group the communities with KMeans. KMeans requires n_clusters <= n_samples,
# so cap the number of groups when fewer communities than labels were found.
n_clusters = min(len(CATEGORY_NAMES), len(df))
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)

# Distance of each community to its nearest (i.e. assigned) centroid.
df["cluster_score"] = kmeans.transform(X).min(axis=1)
df["category"] = labels

# KMeans label ids are arbitrary: rank the groups by their mean inverse
# conductance and hand out the names from weakest to strongest.
order = df.groupby("category")["inv_conductance"].mean().sort_values().index
cat_map = dict(zip(order, CATEGORY_NAMES))

df["category_name"] = df["category"].map(cat_map)

In [32]:
# Serialize the per-community metrics to JSON, keyed by the community id
# (as a string); values are cast to plain Python types so json can encode them.
cluster_tree_base = {
    str(rec["id"]): {
        "users": [str(user) for user in rec["users"]],
        "type": rec["category_name"],
        "conductance": float(rec["conductance"]),
        "internal_density": float(rec["density"]),
        "edges_inside": int(rec["edges_inside"]),
        "edges_cut": int(rec["edges_cut"]),
        "modularity_gain": float(rec["modularity_gain"]),
        "num_users": int(rec["num_users"]),
        "cluster_score": float(rec["cluster_score"])
    }
    for rec in df.to_dict("records")
}

out_path = os.path.join(save_dir, 'cluster_tree_base.json')
with open(out_path, 'w', encoding='utf-8') as fh:
    json.dump(cluster_tree_base, fh, indent=4, ensure_ascii=False)

# Show how many communities fall into each strength category.
print(df.groupby("category_name").size())

category_name
Moderate       25
Strong          4
Very Strong     1
Weak            7
dtype: int64
