In [11]:
import sknetwork as skn
from sknetwork.clustering import Louvain, get_modularity
from sknetwork.linalg import normalize
from sknetwork.utils import get_membership
from sknetwork.visualization import visualize_graph
import networkx as nx  
import torch
from torch_geometric.utils import to_scipy_sparse_matrix, to_networkx
from torch_geometric.data import Data
import os
from tqdm import tqdm
import json
import numpy as np
import pandas as pd
from IPython.display import SVG
from cdlib import evaluation, NodeClustering, algorithms

In [12]:
def build_subgraph(cluster_id, users, edge_df, save_path):
    """Extract the subgraph induced by ``users`` and save it to ``save_path``.

    Only rows of ``edge_df`` whose ``source`` AND ``target`` both belong to
    ``users`` are kept. Users are re-indexed ``0..n-1`` following their sorted
    order, and the result is stored with ``torch.save`` as a dict holding:

    * ``'data'``     -- a ``Data`` object with identity node features,
                        ``edge_index`` and ``edge_weight``;
    * ``'idx2user'`` -- the mapping from local node index back to user id.

    Args:
        cluster_id: Identifier of the cluster; only used in the progress message.
        users: Iterable of user ids belonging to the cluster. They must be
            mutually sortable and comparable with the values stored in
            ``edge_df['source']`` / ``edge_df['target']``.
        edge_df: DataFrame with ``source``, ``target`` and ``weight`` columns.
        save_path: Destination file. Missing parent directories are created;
            a bare file name (no directory component) is written to the
            current working directory.

    Returns:
        ``save_path`` once the file is written, or ``None`` when the cluster
        has no internal edge (in that case nothing is written).
    """
    print(f"Processing cluster : {cluster_id} with {len(users)} users.")
    users = set(users)
    edges = edge_df[edge_df['source'].isin(users) & edge_df['target'].isin(users)]

    if len(edges) == 0:
        # No edge has both endpoints inside the cluster: nothing to save.
        return None

    # Ensure consistent ordering of users so local indices are reproducible
    users = sorted(users)

    # Mapping user id <-> local node index
    user2idx = {uid: idx for idx, uid in enumerate(users)}
    idx2user = {idx: uid for uid, idx in user2idx.items()}

    # Edge list expressed in local indices: row 0 = sources, row 1 = targets
    sources, targets = [], []
    for src, dst in zip(edges['source'], edges['target']):
        sources.append(user2idx[src])
        targets.append(user2idx[dst])
    edges_index = torch.tensor([sources, targets], dtype=torch.long)

    edges_weight = torch.tensor(edges['weight'].values, dtype=torch.float)

    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError, so only create the parent when there is one.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # One-hot (identity) node features: a dense n x n matrix, n = number of users
    x = torch.eye(len(user2idx))
    data = Data(x=x, edge_index=edges_index, edge_weight=edges_weight)
    torch.save({
        'data': data,
        'idx2user': idx2user,
    }, save_path)

    return save_path

In [13]:
# Detect the initial communities on the global graph with Infomap.
# All inputs and outputs live under ../../src relative to this notebook.
src_root = os.path.join('..', '..', 'src')

# Edge list (CSV) reused later to cut out one subgraph per cluster
edges_path = os.path.join(src_root, 'data', 'edges.csv')
edges_df = pd.read_csv(edges_path)

# Output directory for everything produced from the Infomap partition
save_dir = os.path.join(src_root, 'graph_dir', 'infomap_dir')
os.makedirs(save_dir, exist_ok=True)

# Despite its name, graph_dir is the path of the serialized graph FILE
graph_dir = os.path.join(src_root, 'data', 'graph_data.pt')

# weights_only=False goes through pickle and can execute arbitrary code:
# only load files produced by this project.
data = torch.load(graph_dir, weights_only=False)
graph, mapping = data['data'], data['idx2user']

# Convert the loaded graph to an undirected NetworkX graph for cdlib
G_nx = to_networkx(graph, to_undirected=True)

communities = algorithms.infomap(G_nx)
len(communities.communities)

172

In [14]:
# Score every detected community with cdlib's conductance measure.
# summary=False is passed so that the result is the per-community list displayed
# below (rather than a single aggregate -- presumably; confirm against the cdlib docs).
# The classification cell pairs this list element-wise with
# communities.communities and treats lower scores as stronger communities.
conductances = evaluation.conductance(G_nx, communities, summary=False)
conductances

[0.08076725214962886,
 0.4122350090854028,
 0.35877862595419846,
 0.5621890547263682,
 0.6108843537414966,
 0.71671388101983,
 0.5690866510538641,
 0.5721925133689839,
 0.6374269005847953,
 0.690677966101695,
 0.6545961002785515,
 0.3202614379084967,
 0.7089201877934272,
 0.6501305483028721,
 0.6379821958456974,
 0.5335968379446641,
 0.3333333333333333,
 0.7266666666666667,
 0.6355140186915887,
 0.312,
 0.6967213114754098,
 0.7053571428571429,
 0.6257309941520468,
 0.6022099447513812,
 0.6987951807228916,
 0.6347826086956522,
 0.5333333333333333,
 0.4157303370786517,
 0.7272727272727273,
 0.5806451612903226,
 0.7461928934010152,
 0.5689655172413793,
 0.4146341463414634,
 0.4177215189873418,
 0.6323529411764706,
 0.6949152542372882,
 0.712,
 0.6181818181818182,
 0.6730769230769231,
 0.6869565217391305,
 0.7227722772277227,
 0.5581395348837209,
 0.7099236641221374,
 0.5058823529411764,
 0.7142857142857143,
 0.7185185185185186,
 0.719626168224299,
 0.751937984496124,
 0.6981132075471698,


In [15]:
# Resolve the users of each detected cluster, label the cluster from its size and
# conductance, export the large clusters as standalone subgraphs, and write a JSON summary.
exp_clusters = {}
cluster_tree_base = {}
conductance_threshold = 0.5  # large clusters scoring below this are "Strong community"
min_users = 10               # clusters with fewer users are "Noisy community"
strong_comm, noisy_comm, weak_comm = 0, 0, 0

# Each cluster is paired with the score at the same position; this assumes
# `conductances` follows the order of `communities.communities` (zip would silently
# truncate to the shorter one otherwise).
scored_clusters = zip(communities.communities, conductances)

# tqdm wraps the iterable directly and is given `total`, since zip() has no length:
# this lets the bar show percentage/ETA instead of a bare iteration counter.
for k, (cluster, score) in enumerate(tqdm(scored_clusters, total=len(communities.communities))):
    users = [mapping[node_id] for node_id in cluster]
    is_large = len(cluster) >= min_users

    # Three labels: too small -> noisy; otherwise strong/weak depending on conductance
    if not is_large:
        comm_type = "Noisy community"
        noisy_comm += 1
    elif score < conductance_threshold:
        comm_type = "Strong community"
        strong_comm += 1
    else:
        comm_type = "Weak community"
        weak_comm += 1

    # Single place where the JSON record is built, so every label shares one schema
    cluster_tree_base[str(k)] = {
        "users": [str(u) for u in users],
        "type": comm_type,
        "conductance": score,
        "num_users": len(cluster)
    }

    # Only clusters that are large enough are exported as their own subgraph file
    if is_large:
        cluster_dir = os.path.join(save_dir, f'cluster_{k}')
        os.makedirs(cluster_dir, exist_ok=True)
        cluster_path = os.path.join(cluster_dir, f'cluster_{k}.pt')
        build_subgraph(cluster_id=k, users=users, edge_df=edges_df, save_path=cluster_path)

# Save cluster info to JSON
with open(os.path.join(save_dir, 'cluster_tree_base.json'), 'w', encoding='utf-8') as f:
    json.dump(cluster_tree_base, f, indent=4, ensure_ascii=False)

print(f"Total communities found by InfoMap algorithm: {len(communities.communities)}.\n"
      f"Total strong communities: {strong_comm}\n"
      f"Total weak communities: {weak_comm}\n"
      f"Total noisy communities: {noisy_comm}")

42it [00:00, 209.14it/s]

Processing cluster : 0 with 836 users.
Processing cluster : 1 with 498 users.
Processing cluster : 2 with 67 users.
Processing cluster : 3 with 56 users.
Processing cluster : 4 with 55 users.
Processing cluster : 5 with 46 users.
Processing cluster : 6 with 43 users.
Processing cluster : 7 with 36 users.
Processing cluster : 8 with 33 users.
Processing cluster : 9 with 33 users.
Processing cluster : 10 with 33 users.
Processing cluster : 11 with 32 users.
Processing cluster : 12 with 31 users.
Processing cluster : 13 with 30 users.
Processing cluster : 14 with 28 users.
Processing cluster : 15 with 27 users.
Processing cluster : 16 with 23 users.
Processing cluster : 17 with 22 users.
Processing cluster : 18 with 22 users.
Processing cluster : 19 with 22 users.
Processing cluster : 20 with 21 users.
Processing cluster : 21 with 21 users.
Processing cluster : 22 with 20 users.
Processing cluster : 23 with 18 users.
Processing cluster : 24 with 18 users.
Processing cluster : 25 with 18 u

172it [00:00, 466.37it/s]

Processing cluster : 44 with 13 users.
Processing cluster : 45 with 13 users.
Processing cluster : 46 with 13 users.
Processing cluster : 47 with 12 users.
Processing cluster : 48 with 12 users.
Processing cluster : 49 with 12 users.
Processing cluster : 50 with 12 users.
Processing cluster : 51 with 12 users.
Processing cluster : 52 with 12 users.
Processing cluster : 53 with 12 users.
Processing cluster : 54 with 11 users.
Processing cluster : 55 with 11 users.
Processing cluster : 56 with 11 users.
Processing cluster : 57 with 11 users.
Processing cluster : 58 with 11 users.
Processing cluster : 59 with 11 users.
Processing cluster : 60 with 11 users.
Processing cluster : 61 with 11 users.
Processing cluster : 62 with 11 users.
Processing cluster : 63 with 11 users.
Processing cluster : 64 with 11 users.
Processing cluster : 65 with 10 users.
Processing cluster : 66 with 10 users.
Processing cluster : 67 with 10 users.
Processing cluster : 68 with 10 users.
Processing cluster : 69 w


