In [1]:
from itertools import combinations

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from datasets import load_dataset
from pandas import DataFrame
from pyvis.network import Network


In [2]:
# Accessing this dataset requires being logged in to Hugging Face
# (e.g. via `huggingface-cli login`).
ds = load_dataset("perspectiva-solution/embeddings-gdn-question-163")["train"]
df = ds.to_pandas()
# Drop the contributions that were not clustered (cluster_id == -1), then keep
# a fixed-seed random sample of 10,000 rows so re-runs see the same data.
df = df.loc[df["cluster_id"] != -1].sample(n=10000, random_state=1234)

In [3]:
# TODO Valuation on edges
# Contribution graph: one node per contribution, and an unweighted edge between
# every pair of contributions that share the same cluster title.
G = nx.Graph()
G.add_nodes_from(df.contribution_id)  # A node is a contribution

for _title, members in df.groupby("cluster_title")["contribution_id"]:
    # Feed the pair iterator straight to networkx: a cluster of k members has
    # k*(k-1)/2 pairs, so there is no point building them all in a tuple first.
    G.add_edges_from(combinations(members.to_list(), 2))

In [19]:
# Split the contribution graph into Louvain communities; the seed is fixed so
# that re-running the cell yields the same partition.
partition = nx.community.louvain_communities(
    G,
    resolution=1,
    seed=2025,
)

In [20]:
# Communities that are large enough are treated as candidate personas:
# keep every community holding at least 1% of the sampled contributions.
threshold = len(df) // 100
personas = [node_set for node_set in partition if len(node_set) >= threshold]


In [22]:
# Graph of Louvain communities: one node per persona, identified by its
# position in `personas`. No edges are added here.
H = nx.Graph()
H.add_nodes_from(position for position, _members in enumerate(personas))

In [68]:
def characterize_node(idx):
    """Return the demand profile of one persona (community).

    Selects the rows of the module-level ``df`` whose ``contribution_id``
    belongs to ``personas[idx]`` and computes, for each ``cluster_title``
    present, its share of those rows.

    Args:
        idx (int): Position of the community in the module-level ``personas``
            list.

    Returns:
        pandas.Series: Indexed by ``cluster_title``; each value is the fraction
        of the community's contributions carrying that title (the values sum
        to 1), sorted from the most to the least frequent title.
    """
    msk = df.contribution_id.isin(personas[idx])
    # Number of contributions per cluster title inside this community.
    community_demands = df[msk].groupby("cluster_title")["cluster_title"].count()
    # Normalise the counts to shares and put the dominant demands first.
    return (community_demands / community_demands.sum()).sort_values(ascending=False)

    
def make_title(demands, topk=5):
    """Format the leading demands of a community as a multi-line label.

    Args:
        demands (pandas.Series): Shares indexed by demand name, in the order
            they should be listed (as returned by ``characterize_node``).
        topk (int): Maximum number of demands to include. Fewer lines are
            produced when ``demands`` has fewer than ``topk`` entries.

    Returns:
        str: One ``"<demand> (<share as a percentage with 2 decimals>%)"`` line
        per listed demand, each terminated by a newline; an empty string when
        there is nothing to list.
    """
    # Slicing (rather than indexing positions 0..topk-1) keeps this safe for
    # communities that have fewer than `topk` distinct demands.
    leading = demands.iloc[:max(topk, 0)]
    return "".join(
        f"{demand} ({share * 100:.2f}%)\n" for demand, share in leading.items()
    )


In [69]:
# Attach display attributes to every community node: a title listing its top
# demands, and a size equal to its member count integer-divided by 10.
for community in H.nodes:
    demand_shares = characterize_node(community)
    H.nodes[community].update(
        title=make_title(demand_shares),
        size=len(personas[community]) // 10,
    )

In [70]:
# Render the community graph as a 500px x 500px interactive page with pyvis
# and write it to communities.html (`Network` is imported in the first cell).
nt = Network(height="500px", width="500px")
nt.from_nx(H)  # copy nodes and their attributes from the networkx graph
nt.toggle_physics(True)
nt.show("communities.html", notebook=False)

communities.html
