In [1]:
import sknetwork as skn
from sknetwork.clustering import Louvain, get_modularity
from sknetwork.linalg import normalize
from sknetwork.utils import get_membership
from sknetwork.visualization import visualize_graph
import networkx as nx  
import torch
from torch_geometric.utils import to_scipy_sparse_matrix, to_networkx
from torch_geometric.data import Data
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
from IPython.display import SVG

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Seed for the force-directed layout so that node positions (and everything
# derived from them) are identical after a kernel restart.
LAYOUT_SEED = 42

graph_dir = os.path.join('..','..', 'src', 'data', 'graph_data.pt')
# NOTE: weights_only=False unpickles arbitrary Python objects; only load trusted files.
graph = torch.load(graph_dir, weights_only=False)
data = graph['data']

# Extracting adjacency matrix from graph torch file.
# num_nodes is passed explicitly: otherwise the matrix size is inferred from the
# largest index present in edge_index, so isolated nodes with the highest ids
# would be dropped and `labels` would end up shorter than `positions`.
adjacency_matrix = to_scipy_sparse_matrix(data.edge_index, num_nodes=data.num_nodes)

# Extracting nodes position (one 2-D coordinate per node, ordered by node id)
G_nx = to_networkx(data, to_undirected=False)
pos = nx.spring_layout(G_nx, seed=LAYOUT_SEED)
positions = np.array([pos[i] for i in range(len(pos))])

# Clustering the graph with Louvain and reporting the size of every community
louvain = Louvain()
labels = louvain.fit_predict(adjacency_matrix)
labels_unique, counts = np.unique(labels, return_counts=True)
print(labels_unique, counts)

In [6]:
# Modularity of the partition stored in `labels`, evaluated on `adjacency_matrix`.
get_modularity(adjacency_matrix, labels)

np.float64(0.5210337850402389)

In [19]:
# Community sizes, recomputed so this cell does not rely on an earlier one for them.
labels_unique, counts = np.unique(labels, return_counts=True)

# Aggregated graph exposed by the fitted Louvain model.
adjacency_aggregate = louvain.aggregate_

# Normalised, transposed membership matrix; multiplying it by the node positions
# yields one position per community (presumably the mean of its members'
# coordinates -- depends on the default norm used by `normalize`).
membership = get_membership(labels)
avg = normalize(membership.T)
position_aggregate = avg.dot(positions)

In [9]:
unique_clusters = np.unique(labels)

# Node indices of every community that has more than 5 members;
# smaller communities are discarded.
cluster_nodes = {}
for c in unique_clusters:
    cluster = np.flatnonzero(labels == c)
    if len(cluster) <= 5:
        continue
    cluster_nodes[c] = cluster

In [10]:
# Translate the node indices of every retained community into usernames.
mapping = graph['idx2user']
exp_clusters = {}

for k, cluster in cluster_nodes.items():
    # .get() leaves None in the list for any index missing from the mapping.
    users = [mapping.get(node_id) for node_id in cluster]
    exp_clusters[k] = users

In [11]:
def build_subgraph(cluster_id, users, edge_df, save_path):
    """Build the subgraph induced by ``users`` and save it to ``save_path``.

    Only the edges of ``edge_df`` whose ``source`` and ``target`` both belong
    to ``users`` are kept. Users are re-indexed from 0 in first-seen order, so
    the node numbering is the same on every run for the same input (iterating
    a ``set`` of strings, as done previously, is not stable across kernels).

    Parameters
    ----------
    cluster_id : hashable
        Identifier of the cluster; only used in the progress message.
    users : sequence
        User identifiers of the cluster; duplicates are collapsed.
    edge_df : pandas.DataFrame
        Edge list with ``source``, ``target`` and ``weight`` columns.
    save_path : str or path-like
        Destination of the ``torch.save`` call.

    Returns
    -------
    None
        The result is written to disk as a dict with keys ``'data'`` (a
        ``Data`` object with identity features, ``edge_index`` and
        ``edge_weight``) and ``'idx2user'``. Nothing is written when the
        cluster has no internal edge.
    """
    print(f"Processing cluster : {cluster_id} with {len(users)} users.")

    # Mapping user -> node index; dict.fromkeys de-duplicates while preserving order.
    user2idx = {uid: idx for idx, uid in enumerate(dict.fromkeys(users))}
    members = set(user2idx)

    internal = edge_df['source'].isin(members) & edge_df['target'].isin(members)
    edges = edge_df[internal]

    if len(edges) == 0:
        return

    # Reverse mapping, stored with the graph so node ids can be decoded later.
    idx2user = {idx: uid for uid, idx in user2idx.items()}

    # Creating edges (2 x num_edges index tensor) and associated weights
    edges_index = torch.tensor([
        [user2idx[src] for src in edges['source']],
        [user2idx[dst] for dst in edges['target']],
    ], dtype=torch.long)

    edges_weight = torch.tensor(edges['weight'].values, dtype=torch.float)

    # Creating the subgraph, i.e. the k-th cluster, with one-hot node features
    x = torch.eye(len(user2idx))
    data = Data(x=x, edge_index=edges_index, edge_weight=edges_weight)
    torch.save({
        'data': data,
        'idx2user': idx2user,
    }, save_path)

In [None]:
# Path to the saved full graph file, relative to the notebook location.
# NOTE(review): get_cluster_with_Louvain does not read this variable; it takes its own path argument.
graph_dir = os.path.join('..','..', 'src', 'data', 'graph_data.pt')

def get_cluster_with_Louvain(subgraph_dir, threshold=0.5, min_users=5):
    """Split a saved subgraph into Louvain communities, if it is worth splitting.

    Parameters
    ----------
    subgraph_dir : str or path-like
        Path passed to ``torch.load`` (a file, despite the name). The loaded
        object must be a dict whose ``'data'`` entry exposes ``edge_index``
        and ``num_nodes``.
    threshold : float, default 0.5
        Minimum modularity of the Louvain partition; below it the subgraph is
        left as a single community.
    min_users : int, default 5
        A community is kept only when it has strictly more than ``min_users``
        nodes.
        NOTE(review): the name suggests ``>=``; the strict comparison is kept
        to match the ``> 5`` filter applied to the full graph -- confirm intent.

    Returns
    -------
    dict or None
        ``{label: array of node indices}`` for the retained communities, or
        ``None`` when the graph has no edges or the partition's modularity is
        below ``threshold``.
    """
    # NOTE: weights_only=False unpickles arbitrary Python objects; only load trusted files.
    subgraph = torch.load(subgraph_dir, weights_only=False)
    data = subgraph['data']

    # Extracting adjacency matrix of the subgraph. num_nodes is given explicitly
    # so that every node gets a label, including isolated nodes whose index is
    # larger than any index appearing in edge_index.
    adjacency_matrix = to_scipy_sparse_matrix(data.edge_index, num_nodes=data.num_nodes)

    # Without edges there is no community structure to detect.
    if adjacency_matrix.nnz == 0:
        return None

    louvain = Louvain()
    labels = louvain.fit_predict(adjacency_matrix)

    modularity = get_modularity(adjacency_matrix, labels)

    # Low modularity: the nodes are cohesive, so there is no need to split the subgraph further
    if modularity < threshold:
        return None

    cluster_nodes = {}
    for c in np.unique(labels):
        cluster = np.where(labels == c)[0]
        if len(cluster) > min_users:  # keep communities with more than min_users nodes
            cluster_nodes[c] = cluster

    return cluster_nodes

In [18]:
# Edge list used to build the per-community subgraphs.
edges_path = os.path.join('..', '..', 'src', 'data', 'edges.csv')
edges = pd.read_csv(edges_path)

# Output folder, created on first use.
save_dir = os.path.join('..', '..', 'src', 'graph_dir', 'subgraph_dir')
os.makedirs(save_dir, exist_ok=True)

# One subgraph_<k>.pt file is requested for each retained community.
for k, users in exp_clusters.items():
    save_path = os.path.join(save_dir, f'subgraph_{k}.pt')
    build_subgraph(cluster_id=k, users=users, edge_df=edges, save_path=save_path)


Processing cluster : 0 with 1111 users.
Processing cluster : 1 with 922 users.
Processing cluster : 2 with 760 users.
Processing cluster : 3 with 480 users.
Processing cluster : 4 with 320 users.
Processing cluster : 5 with 217 users.
Processing cluster : 6 with 160 users.
Processing cluster : 7 with 140 users.
Processing cluster : 8 with 103 users.
Processing cluster : 9 with 6 users.
