# 1. Network Metrics

- How connected is the network?

- Which clusters are most central?

- Are there isolated topics?

- Are there groups of clusters that stick together (communities)?

- Is there a difference between the UK and US networks?

density, 

size, etc (basics), 

avg degree (weighted), 

clustering co-efficient, 

top-nodes for each centrality measure (falseness, degree centrality) for each nw

pairs: e.g "Depression" cluster for female connected to "Reproduction Health" cluster vs.  (what is for male?)

In [14]:
import json
import os
import pandas as pd
import networkx as nx
import numpy as np

import community.community_louvain as community_louvain # import Louvain algo for community detection (calculate modularity)
import matplotlib.pyplot as plt
from collections import Counter

base_dir = os.path.abspath("..")
file_path = os.path.join(base_dir, "data", "bipartite_network", "general_patterns", "demographics", "country", "UK - US")
# output_path = os.path.join(base_dir, "data", "bipartite_network", "general_patterns", "demographics", "ethnicity")
uk_file = os.path.join(file_path, "cluster_co-occurrence_United Kingdom.json")
us_file = os.path.join(file_path, "cluster_co-occurrence_United States.json")

def load_graph(json_path):
    # Load edge list ( {"source": "Hematology & Blood Disorders", "target": "Procedures, Surgeries & Medical Devices", "weight": 2}
    with open(json_path, "r", encoding = "utf-8") as f:
        edges = json.load(f)
        
    G = nx.Graph()

    # Load the edge list into a weighted (undirected) graph
    for entry in edges:
        u = entry["source"]  # source node (cluster)
        v = entry["target"]  # target node (cluster)
        w = entry["weight"]
        G.add_edge(u, v, weight = w)
        
    return G

def analyze_graph(G, label):
    analysis = {}  # store graph's stats

    # Basic stats
    analysis["label"] = label
    analysis["nodes"] = G.number_of_nodes()  # number of nodes (clusters) in a graph
    analysis["edges"] = G.number_of_edges()
    analysis["density"] = nx.density(G)  # [0-1] number of edges compared to fully connected graph (ratio of actual to all possible connections)
    # 0 -> sparse; 1 -> fully connected

    # Degree
    degrees = dict(G.degree())  # number of neighbours of a node (cluster)
    weighted_degrees = dict(G.degree(weight="weight"))  # sum of egde weights
    analysis["average_degree"] = np.mean(list(degrees.values())) # how many (avg) other clusters is auch cluster connected to
    analysis["average_weighted_degree"] = np.mean(list(weighted_degrees.values()))  # how frequent (strong) are these connections btw clusters

    # Connectedness of a graph (local (Clustering Coefficient) & globally (Connected Components)
    
    # Clustering Coefficient (how often neighbours of a node (cluster) are connected)
    # If cluster A is connected to B and C, how likely that B and C are connected too?
    clustering = nx.clustering(G, weight="weight")
    analysis["average_clustering"] = np.mean(list(clustering.values()))  # avg across all nodes (cluster)

    # Connected Components (cc) (see how many disconnected groups of clusters a graph has: a graph is unified or fragmented)
    # reachability: in connected components, all the nodes are always reachable from each other
    components = list(nx.connected_components(G))
    largest_cc = max(components, key=len)  # largest cc
    analysis["num_components"] = len(components) # total number of cc ( 1 if everything is connected)
    analysis["largest_component_size"] = len(largest_cc) # how many nodes (clusters) are in the largest group
    
    # Diameter (the longest shortest path btw any 2 nodes (clusters) in a cc) -> computer diameter of the largest_cc
    G_largest_cc = G.subgraph(largest_cc)
    if nx.is_connected(G_largest_cc):
        analysis["diameter"] = nx.diameter(G_largest_cc)  # nx.diameter works on cc
    else:
        analysis["diameter"] = "Not connected"
    

    # Modularity ( [0-1] how well a graph can be divided into communities (groups of closely connected nodes))
    # 1 if the detected communities are well separated
    # 0 if mixed up
    partition = community_louvain.best_partition(G)  # Louvain algo to detect communities
    analysis["modularity"] = community_louvain.modularity(partition, G)
    
    # Centrlity Measures (nodes (clusters) of importance)
    dc = nx.degree_centrality(G)  # nodes with most connections
    bc = nx.betweenness_centrality(G, weight="weight")  # what clusters serve as bridges between others (nodes most important for information flow)
    
    # Top (n=5) nodes for each centrality measure
    analysis["top5_degree_centrality"] = sorted(dc.items(), key=lambda x: x[1], reverse=True)[:5]
    analysis["top5_betweenness_centrality"] = sorted(bc.items(), key=lambda x: x[1], reverse=True)[:5]

    # insted of lng which clustookier has the highest degree, we look into what pairs are most connected
    # get the edge with the biggest weight and get the corresponding clusters of that "heavy" edge
    # essentially: sort the original (json) list of edges by weights and get top (n=5) cluster-PAIRS (main point are pairs)

    return analysis

# Load JSON files (edges) as graphs
G_uk = load_graph(uk_file)
G_us = load_graph(us_file)

# Analyze graphs
uk_stats = analyze_graph(G_uk, "United Kingdom")
us_stats = analyze_graph(G_us, "United States")

summary_df = pd.DataFrame([uk_stats, us_stats])
summary_df
# summary_df = pd.DataFrame([uk_stats, us_stats])
# summary_df["top5_degree_centrality"].to_list()


Unnamed: 0,label,nodes,edges,density,average_degree,average_weighted_degree,average_clustering,num_components,largest_component_size,diameter,modularity,top5_degree_centrality,top5_betweenness_centrality
0,United Kingdom,23,235,0.928854,20.434783,436.956522,0.040114,1,23,2,0.028654,"[(Mental Health & Emotional Wellbeing, 1.0), (...","[(Allergy & Immunology, 0.4310966810966811), (..."
1,United States,23,245,0.968379,21.304348,406.956522,0.03173,1,23,2,0.083552,"[(Mental Health & Emotional Wellbeing, 1.0), (...","[(Allergy & Immunology, 0.33604989286807463), ..."
