In [None]:
import sknetwork as skn
from sknetwork.clustering import Louvain, get_modularity
from sknetwork.linalg import normalize
from sknetwork.utils import get_membership
from sknetwork.visualization import visualize_graph
import networkx as nx  
import torch
from torch_geometric.utils import to_scipy_sparse_matrix, to_networkx
from torch_geometric.data import Data
import os
from tqdm import tqdm
import json
import numpy as np
import pandas as pd
from IPython.display import SVG
from cdlib import evaluation, NodeClustering, algorithms

In [None]:
# Load the serialized bundle: the graph object and the node-index -> user lookup.
# NOTE(review): weights_only=False unpickles arbitrary objects; only load trusted files.
graph_dir = os.path.join('..', '..', 'src', 'data', 'graph_data.pt')
data = torch.load(graph_dir, weights_only=False)
graph, mapping = data['data'], data['idx2user']

# First-level partition: convert to an undirected NetworkX graph and run Louvain on it.
# NOTE(review): to_networkx is called without edge_attrs, so it is unclear whether a
# 'weight' edge attribute exists on G_nx for Louvain to pick up -- TODO confirm.
G_nx = to_networkx(graph, to_undirected=True)
communities = algorithms.louvain(G_nx, weight='weight', resolution=1)

In [None]:
# Map each first-level community (keyed by its position k in the Louvain result)
# to the list of users it contains, translating node indices through `mapping`.
exp_clusters = {
    k: [mapping[node_id] for node_id in cluster]
    for k, cluster in enumerate(communities.communities)
}

In [None]:
def build_subgraph(cluster_id, users, edge_df, save_path):
    """Build the subgraph induced by one cluster's users and save it to disk.

    Parameters
    ----------
    cluster_id : label of the cluster; only used in the progress message.
    users : sized collection of user identifiers belonging to the cluster.
    edge_df : DataFrame with 'source', 'target' and 'weight' columns.
    save_path : destination handed to ``torch.save``.

    Returns
    -------
    ``save_path`` once the file has been written, or ``None`` when no edge has
    both endpoints inside ``users`` (nothing is saved in that case).
    """
    print(f"Processing cluster : {cluster_id} with {len(users)} users.")

    # Keep only the edges whose two endpoints both belong to the cluster.
    members = set(users)
    inside = edge_df['source'].isin(members) & edge_df['target'].isin(members)
    edges = edge_df[inside]
    if not len(edges):
        return None

    # Sorting fixes the user -> node-index assignment for a given set of users.
    ordered = sorted(members)
    idx2user = dict(enumerate(ordered))
    user2idx = {uid: idx for idx, uid in idx2user.items()}

    # Edge endpoints re-expressed as local node indices, plus their weights.
    sources = [user2idx[uid] for uid in edges['source']]
    targets = [user2idx[uid] for uid in edges['target']]
    edges_index = torch.tensor([sources, targets], dtype=torch.long)
    edges_weight = torch.tensor(edges['weight'].values, dtype=torch.float)

    # Node features are an identity matrix (one row per user).
    payload = {
        'data': Data(
            x=torch.eye(len(ordered)),
            edge_index=edges_index,
            edge_weight=edges_weight,
        ),
        'idx2user': idx2user,
    }
    torch.save(payload, save_path)

    return save_path

In [None]:
def get_cluster_with_Louvain(edge_df, subgraph_path, save_dir, parent_dir=None, threshold=0.2, min_users=20):
    """Split a saved subgraph with Louvain and triage the resulting communities.

    Every community gets a conductance score (``summary=False`` yields one
    score per community) and is then handled as follows:

    * score < ``threshold`` and size >= ``min_users``: its subgraph is built
      with ``build_subgraph`` and saved as ``subgraph_<i>.pt``;
    * score >= ``threshold`` and size >= ``min_users``: returned to the caller
      for a further clustering round;
    * anything else: discarded.

    Parameters
    ----------
    edge_df : edge table forwarded to ``build_subgraph``.
    subgraph_path : file holding a dict with the keys 'data' and 'idx2user'.
    save_dir : directory that receives the saved subgraphs.
    parent_dir : when truthy, used instead of ``save_dir`` as the output
        directory (it is joined as a path, so it must be a directory path).
    threshold : conductance cut-off separating "save" from "cluster again".
    min_users : minimum community size; smaller communities are discarded.

    Returns
    -------
    dict mapping the community position ``i`` to the community's node indices
    (indices of the loaded subgraph) for communities to be clustered again.
    """
    subgraph = torch.load(subgraph_path, weights_only=False)
    data = subgraph['data']
    idx2user = subgraph['idx2user']

    # build_subgraph stores the weights under 'edge_weight'. to_networkx only
    # copies the edge attributes it is asked for, so request that attribute and
    # point Louvain at the same key; otherwise the weights are silently ignored.
    has_weights = getattr(data, 'edge_weight', None) is not None
    SUB_nx = to_networkx(
        data,
        edge_attrs=['edge_weight'] if has_weights else None,
        to_undirected=True,
    )
    weight_key = 'edge_weight' if has_weights else 'weight'

    # Calculating communities using Louvain, then one conductance per community
    communities = algorithms.louvain(SUB_nx, weight=weight_key, resolution=1)
    conductances = evaluation.conductance(SUB_nx, communities, summary=False)

    # The output directory does not depend on the community, so resolve it once.
    true_save_dir = parent_dir if parent_dir else save_dir

    cluster_nodes = {}

    for i, (cluster, score) in enumerate(zip(communities.communities, conductances)):
        big_enough = len(cluster) >= min_users

        if score < threshold and big_enough:
            # Translate node indices back to user ids, then build and save
            # the subgraph for this cluster.
            users = [idx2user[node] for node in cluster]
            save_path = os.path.join(true_save_dir, f"subgraph_{i}.pt")
            build_subgraph(cluster_id=i, users=users, edge_df=edge_df, save_path=save_path)
            print(f"Added cluster {i} to list. Conductance: {score:.4f}, Users: {len(cluster)}")
        elif score >= threshold and big_enough:
            cluster_nodes[i] = cluster
            print(f"Keeping cluster {i} for further clusterization. Conductance: {score:.4f}, Users: {len(cluster)}")
        else:
            print(f"Cluster {i} discarded due to conductance and/or size. Conductance: {score:.4f}, Users: {len(cluster)}")

    return cluster_nodes

In [None]:
def recursive_clustering(edge_df, subgraph_path, base_cluster_id, base_output_dir, cluster_tree, depth=0, max_depth=4):
    """Recursively re-cluster one saved subgraph and record the hierarchy.

    Parameters
    ----------
    edge_df : edge table forwarded to the subgraph builders.
    subgraph_path : file holding a dict with the keys 'data' and 'idx2user'.
    base_cluster_id : id of the cluster being processed; an int at the top
        level, a string such as "3_12" for nested clusters.
    base_output_dir : directory under which ``subgraph_<id>/`` is created.
    cluster_tree : dict updated in place with this cluster's entry
        (``{"users": set, "children": dict}``).
    depth : current recursion depth.
    max_depth : recursion stops (without recording anything) once
        ``depth >= max_depth``.

    Returns
    -------
    None; results are written to disk and into ``cluster_tree``.
    """
    # Stopping condition
    if depth >= max_depth:
        return

    subgraph = torch.load(subgraph_path, weights_only=False)
    idx2user = subgraph['idx2user']

    # Save current cluster info in tree.
    # NOTE: int() accepts '_' between digits, so a nested id such as "3_12"
    # is stored under the key 312. Kept as is to preserve the output format.
    node = {
        "users": {str(idx2user[i]) for i in range(len(idx2user))},
        "children": {},
    }
    cluster_tree[int(base_cluster_id)] = node
    child_tree = node["children"]

    # Use a dedicated directory for storing sub-subgraphs of this cluster
    cluster_save_dir = os.path.join(base_output_dir, f"subgraph_{base_cluster_id}")
    os.makedirs(cluster_save_dir, exist_ok=True)

    # Do not pass the cluster id as parent_dir: that argument is joined as a
    # directory path, and a cluster id is not a path (a non-zero int makes
    # os.path.join raise TypeError; a string id points at a directory that
    # this code never creates). Finished clusters belong in cluster_save_dir.
    new_clusters = get_cluster_with_Louvain(edge_df, subgraph_path, save_dir=cluster_save_dir)
    if not new_clusters:
        return

    for sub_id, node_ids in new_clusters.items():
        # Node indices are local to the loaded subgraph; map them back to users.
        users = [idx2user[idx] for idx in node_ids]

        cluster_id = f"{base_cluster_id}_{sub_id}"
        save_path = os.path.join(cluster_save_dir, f"subgraph_{cluster_id}.pt")

        # build_subgraph returns None when the cluster has no internal edge.
        new_path = build_subgraph(cluster_id, users, edge_df, save_path=save_path)
        if new_path:
            recursive_clustering(
                edge_df=edge_df,
                subgraph_path=new_path,
                base_cluster_id=cluster_id,
                base_output_dir=cluster_save_dir,
                cluster_tree=child_tree,
                depth=depth + 1,
                max_depth=max_depth,
            )

In [None]:
# Wrapper to start the recursion
def start_recursive_clustering(first_clusters, edges_path, base_output_dir, max_depth=4):
    """Entry point of the recursion: process every first-level cluster.

    Parameters
    ----------
    first_clusters : dict mapping a cluster id to the users of that cluster.
    edges_path : CSV file read with ``pd.read_csv`` to obtain the edge table.
    base_output_dir : directory receiving ``subgraph_<k>.pt`` for each cluster.
    max_depth : forwarded to ``recursive_clustering``.

    Returns
    -------
    dict filled in by ``recursive_clustering`` describing the cluster hierarchy.
    """
    edges = pd.read_csv(edges_path)
    cluster_tree = {}

    for k, users in first_clusters.items():
        root_path = build_subgraph(
            k, users, edges, os.path.join(base_output_dir, f'subgraph_{k}.pt')
        )
        # No file is produced for clusters without internal edges; skip them.
        if root_path is None:
            continue

        recursive_clustering(
            edge_df=edges,
            subgraph_path=root_path,
            base_cluster_id=k,
            base_output_dir=base_output_dir,
            cluster_tree=cluster_tree,
            depth=0,
            max_depth=max_depth,
        )

    return cluster_tree

In [None]:
def make_json_serializable(obj):
    """Return a copy of ``obj`` that ``json.dump`` can serialize.

    * dict: keys are converted with ``str``; values are converted recursively.
    * set / frozenset: turned into a list whose elements are converted
      recursively. Elements are sorted so the output is the same on every run;
      if they are not mutually comparable they are ordered by ``repr``.
    * list / tuple: turned into a list with elements converted recursively.
    * anything else: returned unchanged.
    """
    if isinstance(obj, dict):
        return {str(k): make_json_serializable(v) for k, v in obj.items()}

    if isinstance(obj, (set, frozenset)):
        # Set iteration order is arbitrary; sort for reproducible JSON.
        try:
            ordered = sorted(obj)
        except TypeError:
            ordered = sorted(obj, key=repr)
        return [make_json_serializable(v) for v in ordered]

    if isinstance(obj, (list, tuple)):
        return [make_json_serializable(v) for v in obj]

    return obj

In [None]:
# Input edge list and output directory for the generated subgraphs.
edges_path = os.path.join('..', '..', 'src', 'data', 'edges.csv')
save_dir = os.path.join('..', '..', 'src', 'graph_dir', 'subgraph_dir')
os.makedirs(save_dir, exist_ok=True)

# Run the recursive clustering starting from the first-level clusters, then
# convert the resulting tree (int keys, sets of users) into JSON-friendly types.
cluster_tree = start_recursive_clustering(exp_clusters, edges_path, save_dir)
json_safe_tree = make_json_serializable(cluster_tree)

# Persist the hierarchy next to the subgraph files.
tree_path = os.path.join(save_dir, 'cluster_tree.json')
with open(tree_path, 'w', encoding='utf-8') as f:
    f.write(json.dumps(json_safe_tree, indent=4, ensure_ascii=False))