In [42]:
import sknetwork as skn
from sknetwork.clustering import Louvain, get_modularity
from sknetwork.linalg import normalize
from sknetwork.utils import get_membership
from sknetwork.visualization import visualize_graph
import networkx as nx  
import torch
from torch_geometric.utils import to_scipy_sparse_matrix, to_networkx
from torch_geometric.data import Data
import os
from tqdm import tqdm
import json
import numpy as np
import pandas as pd
from IPython.display import SVG
from cdlib import evaluation, NodeClustering, algorithms

In [43]:
def build_subgraph(cluster_id, users, edge_df, save_path):
    """Build the subgraph induced by ``users`` and save it with ``torch.save``.

    Only edges whose source AND target both belong to ``users`` are kept.
    Users are re-indexed 0..n-1 in sorted order so the node numbering is
    reproducible for a given user set.

    Args:
        cluster_id: Identifier of the cluster; only used in the progress message.
        users: Sized collection of user ids (duplicates are dropped). The ids
            must be mutually sortable and match the values found in
            ``edge_df['source']`` / ``edge_df['target']``.
        edge_df: DataFrame with ``'source'``, ``'target'`` and ``'weight'`` columns.
        save_path: Destination file. Its parent directory is created if needed.

    Returns:
        ``save_path`` once the file is written, or ``None`` when no edge has
        both endpoints inside ``users`` (nothing is written in that case).
    """
    print(f"Processing cluster : {cluster_id} with {len(users)} users.")
    users = set(users)
    inside = edge_df['source'].isin(users) & edge_df['target'].isin(users)
    edges = edge_df[inside]

    # A cluster without any internal edge carries no graph structure: skip it.
    if len(edges) == 0:
        return None

    # Sort so that the id <-> index mapping is deterministic across runs.
    users = sorted(users)
    user2idx = {uid: idx for idx, uid in enumerate(users)}
    idx2user = dict(enumerate(users))

    # Row 0 holds the source indices, row 1 the target indices.
    edges_index = torch.tensor([
        [user2idx[src] for src in edges['source']],
        [user2idx[dst] for dst in edges['target']],
    ], dtype=torch.long)

    edges_weight = torch.tensor(edges['weight'].values, dtype=torch.float)

    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError, so only create the parent when there is one.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Node features are one-hot identifiers (identity matrix, one row per user).
    x = torch.eye(len(user2idx))
    data = Data(x=x, edge_index=edges_index, edge_weight=edges_weight)
    torch.save({
        'data': data,
        'idx2user': idx2user,
    }, save_path)

    return save_path

In [44]:
# Load the edge list and the saved global graph, then detect the base
# communities on the whole graph with Infomap.
edges_path = os.path.join('..', '..', 'src', 'data', 'edges.csv')
edges_df = pd.read_csv(edges_path)

# Output directory that will receive the per-cluster subgraph files.
save_dir = os.path.join('..', '..', 'src', 'graph_dir', 'subgraph_dir')
os.makedirs(save_dir, exist_ok=True)

# NOTE: despite its name, `graph_dir` is the path of a single .pt file.
graph_dir = os.path.join('..','..', 'src', 'data', 'graph_data.pt')

# weights_only=False lets torch.load unpickle arbitrary Python objects,
# so only point this at a file you produced yourself / trust.
data = torch.load(graph_dir, weights_only=False)
graph = data['data']
# presumably maps node index -> user id (saved under 'idx2user') — verify against the producer
mapping = data['idx2user']

# Convert the loaded graph to an undirected NetworkX graph for community detection.
G_nx = to_networkx(graph, to_undirected=True)

# NOTE(review): no seed is passed here; if Infomap is non-deterministic the
# number and content of communities may change between runs — confirm.
communities = algorithms.infomap(G_nx)
# Displayed as the cell output: number of detected communities.
len(communities.communities)

258

In [45]:
# Conductance of each detected community on the global graph. With
# summary=False the call yields one score per community (a plain list, as
# shown in the cell output) instead of an aggregated summary.
conductances = evaluation.conductance(G_nx, communities, summary=False)
conductances

[0.08271435782793683,
 0.395920877318848,
 0.33437990580847726,
 0.5794392523364486,
 0.7017543859649122,
 0.6063218390804598,
 0.5951417004048583,
 0.5529411764705883,
 0.46540880503144655,
 0.6391752577319587,
 0.6273972602739726,
 0.6710526315789473,
 0.656441717791411,
 0.358974358974359,
 0.6509090909090909,
 0.6190476190476191,
 0.40350877192982454,
 0.6401673640167364,
 0.6613756613756614,
 0.46534653465346537,
 0.5671641791044776,
 0.6024844720496895,
 0.6020942408376964,
 0.46788990825688076,
 0.6424581005586593,
 0.7551020408163265,
 0.6685714285714286,
 0.6178343949044586,
 0.40229885057471265,
 0.6546762589928058,
 0.4945054945054945,
 0.47619047619047616,
 0.6434108527131783,
 0.28,
 0.45,
 0.4339622641509434,
 0.5726495726495726,
 0.6923076923076923,
 0.6907216494845361,
 0.7222222222222222,
 0.6974789915966386,
 0.6122448979591837,
 0.7166666666666667,
 0.6216216216216216,
 0.5578947368421052,
 0.3333333333333333,
 0.41935483870967744,
 0.41379310344827586,
 0.7142857142

In [46]:
# Classify every detected community, export a subgraph for each one that is
# large enough, and write a JSON summary of all communities.
#   - fewer than `min_users` members          -> "Noisy community" (not exported)
#   - conductance <  `conductance_threshold`  -> "Strong community"
#   - otherwise                               -> "Weak community"
exp_clusters = {}  # NOTE(review): unused in this cell; kept in case a later cell reads it
cluster_tree_base = {}
conductance_threshold = 0.5
min_users = 10
strong_comm, noisy_comm, weak_comm = 0, 0, 0

# zip() silently truncates to the shorter input, which would hide a stale
# `conductances` list left over from a previous run of the detection cell.
assert len(conductances) == len(communities.communities), (
    "conductances and communities are out of sync; re-run the conductance cell"
)

# tqdm wraps the sized work (with an explicit total) so the bar shows real progress.
for k, (cluster, score) in enumerate(
    tqdm(
        zip(communities.communities, conductances),
        total=len(communities.communities),
        desc="Classifying communities",
    )
):
    is_large_enough = len(cluster) >= min_users

    if not is_large_enough:
        comm_type = "Noisy community"
        noisy_comm += 1
    elif score < conductance_threshold:
        comm_type = "Strong community"
        strong_comm += 1
    else:
        comm_type = "Weak community"
        weak_comm += 1

    # Translate graph node ids back to the original user ids.
    users = [mapping[node_id] for node_id in cluster]

    # One summary record per community, whatever its type.
    cluster_tree_base[str(k)] = {
        "users": [str(u) for u in users],
        "type": comm_type,
        "conductance": score,
        "num_users": len(cluster)
    }

    # Only strong/weak communities get their own subgraph file on disk.
    if is_large_enough:
        cluster_dir = os.path.join(save_dir, f'cluster_{k}')
        os.makedirs(cluster_dir, exist_ok=True)
        cluster_path = os.path.join(cluster_dir, f'cluster_{k}.pt')
        build_subgraph(cluster_id=k, users=users, edge_df=edges_df, save_path=cluster_path)

# Save cluster info to JSON
with open(os.path.join(save_dir, 'cluster_tree_base.json'), 'w', encoding='utf-8') as f:
    json.dump(cluster_tree_base, f, indent=4, ensure_ascii=False)

print(f"Total communities found by InfoMap algorithm: {len(communities.communities)}.\n"
      f"Total strong communities: {strong_comm}\n"
      f"Total noisy communities: {noisy_comm}\n"
      f"Total weak communities: {weak_comm}")

28it [00:00, 141.15it/s]

Processing cluster : 0 with 888 users.
Processing cluster : 1 with 606 users.
Processing cluster : 2 with 89 users.
Processing cluster : 3 with 76 users.
Processing cluster : 4 with 55 users.
Processing cluster : 5 with 54 users.
Processing cluster : 6 with 47 users.
Processing cluster : 7 with 45 users.
Processing cluster : 8 with 45 users.
Processing cluster : 9 with 38 users.
Processing cluster : 10 with 35 users.
Processing cluster : 11 with 32 users.
Processing cluster : 12 with 30 users.
Processing cluster : 13 with 26 users.
Processing cluster : 14 with 26 users.
Processing cluster : 15 with 25 users.
Processing cluster : 16 with 24 users.
Processing cluster : 17 with 23 users.
Processing cluster : 18 with 22 users.
Processing cluster : 19 with 22 users.
Processing cluster : 20 with 21 users.
Processing cluster : 21 with 21 users.
Processing cluster : 22 with 21 users.
Processing cluster : 23 with 19 users.
Processing cluster : 24 with 19 users.
Processing cluster : 25 with 19 u

85it [00:00, 234.00it/s]

Processing cluster : 29 with 17 users.
Processing cluster : 30 with 17 users.
Processing cluster : 31 with 17 users.
Processing cluster : 32 with 16 users.
Processing cluster : 33 with 16 users.
Processing cluster : 34 with 16 users.
Processing cluster : 35 with 15 users.
Processing cluster : 36 with 14 users.
Processing cluster : 37 with 14 users.
Processing cluster : 38 with 14 users.
Processing cluster : 39 with 14 users.
Processing cluster : 40 with 14 users.
Processing cluster : 41 with 14 users.
Processing cluster : 42 with 14 users.
Processing cluster : 43 with 14 users.
Processing cluster : 44 with 14 users.
Processing cluster : 45 with 14 users.
Processing cluster : 46 with 14 users.
Processing cluster : 47 with 14 users.
Processing cluster : 48 with 13 users.
Processing cluster : 49 with 13 users.
Processing cluster : 50 with 13 users.
Processing cluster : 51 with 13 users.
Processing cluster : 52 with 13 users.
Processing cluster : 53 with 12 users.
Processing cluster : 54 w

258it [00:00, 572.00it/s]

Processing cluster : 87 with 10 users.
Processing cluster : 88 with 10 users.
Processing cluster : 89 with 10 users.
Processing cluster : 90 with 10 users.
Processing cluster : 91 with 10 users.
Processing cluster : 92 with 10 users.
Processing cluster : 93 with 10 users.
Processing cluster : 94 with 10 users.
Processing cluster : 95 with 10 users.
Total communities found by InfoMap algorithm: 258.
Total strong communities: 34
Total noisy communities: 162
Total weak communities: 62



