In [1]:
import sknetwork as skn
from sknetwork.clustering import Louvain, get_modularity
from sknetwork.linalg import normalize
from sknetwork.utils import get_membership
from sknetwork.visualization import visualize_graph
import networkx as nx  
import torch
from torch_geometric.utils import to_scipy_sparse_matrix, to_networkx
from torch_geometric.data import Data
import os
from tqdm import tqdm
import json
import numpy as np
import pandas as pd
from IPython.display import SVG
from cdlib import evaluation, NodeClustering, algorithms

  from .autonotebook import tqdm as notebook_tqdm


Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool', 'bayanpy', 'infomap', 'wurlitzer'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'pyclustering', 'ASLPAw'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'infomap', 'wurlitzer'}


In [2]:
def build_subgraph(cluster_id, users, edge_df, save_path):
    print(f"Processing cluster : {cluster_id} with {len(users)} users.")
    users = set(users)
    edges = edge_df[edge_df['source'].isin(users) & edge_df['target'].isin(users)]

    if len(edges) == 0:
        return
    
    # Ensure consistent ordering of users
    users = sorted(users)

    # Mapping id - users 
    user2idx = {uid: idx for idx, uid in enumerate(users)}
    idx2user = {idx: uid for uid, idx in user2idx.items()}

    # Creating edges and associated weights
    edges_index = torch.tensor([
        [user2idx[src] for src in edges['source']],
        [user2idx[dst] for dst in edges['target']],
    ], dtype=torch.long)

    edges_weight = torch.tensor(edges['weight'].values, dtype=torch.float)

    # Creating the subgraph aka the k-th cluster
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    x = torch.eye(len(user2idx))
    data = Data(x=x, edge_index=edges_index, edge_weight=edges_weight)
    torch.save({
        'data': data,
        'idx2user': idx2user,
    }, save_path)

    return save_path

In [3]:
def get_cluster_with_Louvain(edge_df, subgraph_path, save_dir, threshold=0.2, min_users=20, base_cluster_id=None):
    subgraph = torch.load(subgraph_path, weights_only=False)
    data = subgraph['data']
    idx2user = subgraph['idx2user']
    
    # Conversion to nx object
    SUB_nx = to_networkx(data, to_undirected=True)

    # Calculating communities using Louvain
    communities = algorithms.louvain(SUB_nx, weight='weight', resolution=1)
    conductances = evaluation.conductance(SUB_nx,communities, summary=False)

    cluster_nodes = {}

    for i, (cluster, score) in enumerate(zip(communities.communities, conductances)):
        users = [idx2user[node] for node in cluster]

        full_id = f"{base_cluster_id}_{i}" if base_cluster_id is not None else str(i)
        save_path = os.path.join(save_dir, f"subgraph_{full_id}.pt")

        if score < threshold and len(cluster) >= min_users:
            build_subgraph(full_id, users, edge_df, save_path=save_path)
            print(f"Added cluster {i} to list. Conductance: {score:.4f}, Users: {len(cluster)}")
        elif score >= threshold and len(cluster) >= min_users:
            cluster_nodes[i] = cluster
            print(f"Keeping cluster {i} for further clusterization. Conductance: {score:.4f}, Users: {len(cluster)}")
        else:
            print(f"Cluster {i} discarded due to conductance and/or size. Conductance: {score:.4f}, Users: {len(cluster)}")

    return cluster_nodes

In [4]:
def recursive_clustering(edge_df, subgraph_path, base_cluster_id, base_output_dir, cluster_tree, depth=0, max_depth = 3):
    # Stopping condition
    if depth >= max_depth:
        return
    
    subgraph = torch.load(subgraph_path, weights_only=False)
    idx2user = subgraph['idx2user']
    
    # Save current cluster info in tree
    cluster_tree[str(base_cluster_id)] = {
        "users" : [str(idx2user[i]) for i in range(len(idx2user))],
        "children" : {}
    }
    child_tree = cluster_tree[str(base_cluster_id)]["children"]
    
    new_clusters= get_cluster_with_Louvain(
        edge_df,
        subgraph_path,
        save_dir=base_output_dir,
        base_cluster_id=base_cluster_id
    )

    if not new_clusters: #No more cluster to explore
        return

    for sub_id, node_ids in new_clusters.items():
        users = [idx2user[idx] for idx in node_ids]
        cluster_id = f"{base_cluster_id}_{sub_id}"

        # Saving all .pt in the same directory
        save_path = os.path.join(base_output_dir, f"subgraph_{cluster_id}.pt")
        new_path = build_subgraph(cluster_id, users, edge_df, save_path=save_path)

        if new_path:
            recursive_clustering(
            edge_df=edge_df,
            subgraph_path=new_path,
            base_cluster_id=cluster_id,
            base_output_dir=base_output_dir,
            cluster_tree = child_tree,
            depth=depth + 1,
            max_depth=max_depth
        )

In [5]:
# Wrapper to start the recursion
def start_recursive_clustering(first_clusters, edges_path, base_output_dir, max_depth=3):
    edges = pd.read_csv(edges_path)
    cluster_tree = {}

    for k, users in first_clusters.items():
        cluster_dir = os.path.join(base_output_dir, f'cluster_{k}')
        os.makedirs(cluster_dir, exist_ok=True)

        save_path = os.path.join(cluster_dir, f'subgraph_{k}.pt')
        new_cluster_path = build_subgraph(k, users, edges, save_path)
        
        if new_cluster_path is not None:
            recursive_clustering(
                subgraph_path=new_cluster_path,
                base_cluster_id=k,
                edge_df=edges,
                base_output_dir=cluster_dir,
                cluster_tree=cluster_tree,
                depth=0,
                max_depth=max_depth
            )

    return cluster_tree

In [6]:
# Creating dirt cluster from the global graph
edges_path = os.path.join('..', '..', 'src', 'data', 'edges.csv')
edges_df = pd.read_csv(edges_path)

save_dir = os.path.join('..', '..', 'src', 'graph_dir', 'subgraph_dir')
os.makedirs(save_dir, exist_ok=True)

graph_dir = os.path.join('..','..', 'src', 'data', 'graph_data.pt')

data = torch.load(graph_dir, weights_only=False)
graph = data['data']
mapping = data['idx2user']

# Extracting nodes position
G_nx = to_networkx(graph, to_undirected=True)
communities = algorithms.louvain(G_nx, weight='weight', resolution=1)
len(communities.communities)

16

In [7]:
conductances = evaluation.conductance(G_nx, communities, summary=False)
conductances

[0.3199488957839022,
 0.3451669272509887,
 0.0794683257918552,
 0.09359605911330049,
 0.3024054982817869,
 0.5356511490866235,
 0.5209302325581395,
 0.4246575342465753,
 0.2727272727272727,
 0.25,
 0.2,
 0.0,
 0.5,
 0.0,
 0.0,
 0.0]

In [8]:
# Expliciting users for each fouded cluster and evaluating theri conducance
exp_clusters = {}
cluster_tree_base = {}
conductance_threshold = 0.25
min_users = 10

for k, (cluster, score) in enumerate(zip(communities.communities, conductances)):
    # If the cluster meets the criteria -> saves it directly
    if score < conductance_threshold and len(cluster) >= min_users:
        print(f"Saving Cluster {k}: Conductance : {score} - Total Users : {len(cluster)}")

        cluster_tree_base[str(k)] = {
        "users" : [str(mapping[i]) for i in range(len(mapping))]
        }

        cluster_dir = os.path.join(save_dir, f'cluster_{k}')
        os.makedirs(cluster_dir, exist_ok=True)
        cluster_path = os.path.join(cluster_dir, f'cluster_{k}.pt')

        users = [mapping[node_id] for node_id in cluster]
        build_subgraph(cluster_id=k, users=users, edge_df=edges_df, save_path=cluster_path)

    # Else if the cluster overcome the maximum conductance threshold but has more then min_users users -> saves for futher clusterization
    elif score > conductance_threshold and len(cluster) >= min_users:
        print(f"Collecting Cluster {k}: Conductance : {score} - Total Users : {len(cluster)}")
        users = [mapping[node_id] for node_id in cluster]
        exp_clusters[k] = users
    # Else if cluster doesnt meets any criteria -> removes nodes from the analysis -> noise
    else:
        print(f"Deleting Cluster {k}: Conductance : {score} - Total Users : {len(cluster)}")

with open(os.path.join(save_dir, 'cluster_tree_base.json'), 'w', encoding='utf-8') as f:
    json.dump(cluster_tree_base, f, indent=4, ensure_ascii=False)

Collecting Cluster 0: Conductance : 0.3199488957839022 - Total Users : 1065
Collecting Cluster 1: Conductance : 0.3451669272509887 - Total Users : 1060
Saving Cluster 2: Conductance : 0.0794683257918552 - Total Users : 920
Processing cluster : 2 with 920 users.
Saving Cluster 3: Conductance : 0.09359605911330049 - Total Users : 503
Processing cluster : 3 with 503 users.
Collecting Cluster 4: Conductance : 0.3024054982817869 - Total Users : 345
Collecting Cluster 5: Conductance : 0.5356511490866235 - Total Users : 209
Collecting Cluster 6: Conductance : 0.5209302325581395 - Total Users : 78
Collecting Cluster 7: Conductance : 0.4246575342465753 - Total Users : 48
Deleting Cluster 8: Conductance : 0.2727272727272727 - Total Users : 5
Deleting Cluster 9: Conductance : 0.25 - Total Users : 4
Deleting Cluster 10: Conductance : 0.2 - Total Users : 3
Deleting Cluster 11: Conductance : 0.0 - Total Users : 2
Deleting Cluster 12: Conductance : 0.5 - Total Users : 2
Deleting Cluster 13: Conductan

In [9]:
# Recursively creating sub-scluster form master cluster
cluster_tree = start_recursive_clustering(exp_clusters, edges_path, save_dir)

with open(os.path.join(save_dir, 'cluster_tree.json'), 'w', encoding='utf-8') as f:
    json.dump(cluster_tree, f, indent=4, ensure_ascii=False)

Processing cluster : 0 with 1065 users.
Keeping cluster 0 for further clusterization. Conductance: 0.3812, Users: 146
Keeping cluster 1 for further clusterization. Conductance: 0.5034, Users: 114
Keeping cluster 2 for further clusterization. Conductance: 0.4937, Users: 95
Keeping cluster 3 for further clusterization. Conductance: 0.5167, Users: 90
Keeping cluster 4 for further clusterization. Conductance: 0.4892, Users: 75
Keeping cluster 5 for further clusterization. Conductance: 0.4912, Users: 74
Keeping cluster 6 for further clusterization. Conductance: 0.4874, Users: 73
Keeping cluster 7 for further clusterization. Conductance: 0.5058, Users: 73
Keeping cluster 8 for further clusterization. Conductance: 0.5624, Users: 68
Keeping cluster 9 for further clusterization. Conductance: 0.4341, Users: 65
Keeping cluster 10 for further clusterization. Conductance: 0.4209, Users: 60
Keeping cluster 11 for further clusterization. Conductance: 0.4929, Users: 58
Keeping cluster 12 for further c