# PCS‑HELIO v4.3 — 02 · KEC Metrics
Compute Knowledge Entropy Curvature (KEC) metrics from SWOW; write standardized outputs.

In [1]:
from pathlib import Path
import os, sys, json
# Make the repository root importable so that 'notebooks._fragments' resolves
# whether the kernel was started in the repo root or one level below it.
ROOT = Path.cwd()
for _root_candidate in (ROOT, ROOT.parent):
    if (_root_candidate/'notebooks'/'_fragments.py').exists():
        sys.path.insert(0, str(_root_candidate))
        break
try:
    from notebooks._fragments import apply_style, preflight_checks, print_contract
except Exception as e:
    # The shared helpers are optional: report the failure and fall back to
    # no-op stand-ins so the rest of the notebook still runs.
    print('[preflight] Failed importing notebooks._fragments:', e)
    def apply_style():
        pass
    def preflight_checks():
        pass
    def print_contract():
        pass
apply_style()
preflight_checks()
print_contract()
# Run mode is read from the environment; 'sample' is the default.
RUN_MODE = os.environ.get('RUN_MODE','sample')
# Project directories, all relative to the current working directory.
BASE = Path('.')
DATA = BASE/'data'
PROC = DATA/'processed'
RPTS = BASE/'reports'
# Output folder for every KEC artefact written by this notebook.
(PROC/'kec').mkdir(parents=True, exist_ok=True)


[STYLE] _style.css not found; proceeding.
[Preflight] Python: 3.12.11 | Platform: Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.35
[Preflight] pandas: 2.3.2 | numpy: 1.26.4
[Preflight] Folders ready.


# Notebook 02: KEC Metrics Computation
Using the SWOW graph from Notebook 01, this notebook computes the *Knowledge Entropy Curvature* (KEC) metrics for each node/word:
- **Transition Entropy:** The entropy of outgoing edge weight distribution (uncertainty of associations).
- **Local Curvature:** Graph curvature at the node (using Ollivier-Ricci or Forman's method).
- **Meso-scale Coherence:** Community-based coherence: the fraction of a node's neighbors that belong to the same community as the node (communities from Leiden, or NetworkX greedy modularity as a fallback).

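An ablation experiment is included: coherence is recomputed on a degree-preserving null graph (double-edge swaps) and compared with the real graph. Uncertainty estimation (bootstrap confidence intervals) is part of the design but is not computed in the cells of this notebook.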
- **Input:** Graph from Notebook 01 (or edge list).
- **Output:** Table of KEC metrics per word (saved to `data/processed/kec/metrics_{LANG}.csv`).

In [2]:
# Load the SWOW graph pickled by Notebook 01.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load graph files produced by this pipeline.
import pickle
LANG = 'en'
graph_file = str(PROC / f'swow_graph_{LANG}.pkl')
with open(graph_file, 'rb') as fh:
    G = pickle.load(fh)
print(f"Graph loaded: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges.")
# Undirected copy, used later for community detection
G_ud = G.to_undirected()
# Sample mode: when the graph has more than 1000 nodes, keep only the
# 2000 highest-degree nodes (in both graphs) for speed
sample_mode = 'RUN_MODE' in globals() and RUN_MODE == 'sample'
if sample_mode and G.number_of_nodes() > 1000:
    by_degree = sorted(G.degree, key=lambda pair: pair[1], reverse=True)
    keep = [name for name, _ in by_degree[:2000]]
    G = G.subgraph(keep).copy()
    G_ud = G_ud.subgraph(keep).copy()
print(f"Undirected graph created: {G_ud.number_of_nodes()} nodes, {G_ud.number_of_edges()} edges.")

Graph loaded: 7 nodes, 5 edges.
Undirected graph created: 7 nodes, 5 edges.


In [3]:
# Assume G (directed graph loaded from Notebook 01's pickle) is available
import math


def transition_entropy(weights):
    """Return the Shannon entropy, in bits, of a set of outgoing edge weights.

    The weights are normalised to probabilities p = w / sum(weights) and the
    entropy is -sum(p * log2(p)) over the strictly positive p.

    Args:
        weights: iterable of numeric edge weights (may be empty).

    Returns:
        The entropy as a float. 0.0 when there are no weights or when their
        sum is not positive (there is no distribution to normalise, and
        dividing by the total would fail).
    """
    weights = list(weights)
    total = sum(weights)
    if total <= 0:
        return 0.0
    H = 0.0
    for w in weights:
        p = w / total
        if p > 0:
            H -= p * math.log2(p)
    return H


# Compute transition entropy for each node
entropy = {}
for node in G.nodes():
    # default=1.0: an edge without a 'weight' attribute counts as weight 1
    # instead of yielding None (which would break the sum).
    out_weights = [w for _, _, w in G.out_edges(node, data='weight', default=1.0)]
    entropy[node] = transition_entropy(out_weights)
print(f"Calculated entropy for {len(entropy)} nodes.")

# Save entropy as parquet
import pandas as pd
from pathlib import Path
OUTDIR = PROC/"kec"
OUTDIR.mkdir(parents=True, exist_ok=True)
ent_df = pd.DataFrame(list(entropy.items()), columns=['node', 'entropy'])
ent_df.to_parquet(OUTDIR/"entropy.parquet")
print(f"Saved entropy to {OUTDIR/'entropy.parquet'}")

Calculated entropy for 7 nodes.
Saved entropy to data/processed/kec/entropy.parquet


In [4]:
# Compute local curvature per edge (try Ollivier-Ricci; fall back to a simple Forman-like proxy)

def degree_proxy_curvature(graph):
    """Return a degree-based curvature proxy for every edge of `graph`.

    Args:
        graph: a NetworkX graph.

    Returns:
        dict mapping each edge (u, v) to float(deg(u) + deg(v) - 2).
    """
    return {(u, v): float(graph.degree(u) + graph.degree(v) - 2) for u, v in graph.edges()}


# GraphRicciCurvature is optional; try both import layouts before giving up.
try:
    from GraphRicciCurvature.OllivierRicci import OllivierRicci as _OR
except Exception:
    try:
        from GraphRicciCurvature import OllivierRicci as _OR
    except Exception:
        _OR = None

curvature = {}
if _OR is not None:
    try:
        # verbose takes a level name; passing False makes the library print
        # 'Incorrect verbose level ...' and fall back to "ERROR" anyway.
        orc = _OR(G, alpha=0.5, verbose="ERROR")
        orc.compute_ricci_curvature()
        curvature = {edge: data.get('ricciCurvature', 0.0) for edge, data in orc.G.edges.items()}
    except Exception as e:
        print('[warn] OllivierRicci failed, using proxy:', e)
        curvature = {}
if not curvature:
    # Library missing, failed, or returned nothing: use the degree-based proxy
    curvature = degree_proxy_curvature(G)
print(f"Computed curvature for {len(curvature)} edges (sample edge curvatures shown below):")
print(list(curvature.items())[:5])


Incorrect verbose level, option:["INFO","DEBUG","ERROR"], use "ERROR instead."


  return f(b)


  return f(b)


  return f(b)


  return f(b)


  return f(b)


Computed curvature for 5 edges (sample edge curvatures shown below):
[(('cat', 'meow'), 0.0), (('cat', 'pet'), 0.0), (('dog', 'bark'), 0.0), (('dog', 'pet'), 0.0), (('music', 'sound'), 0.0)]


In [5]:
# Compute coherence: detect communities (Leiden if available) and measure, for each
# node, the fraction of its neighbors that fall in the same community.
# Note: greedy_modularity_communities can be slow on large graphs, hence the preference for leidenalg
try:
    import leidenalg
    import igraph as ig
    # Convert NetworkX graph to igraph for faster community detection
    nx_g = G_ud  # Use undirected graph
    ig_g = ig.Graph.from_networkx(nx_g)
    # Fixed seed: without it the partition (and therefore coherence) can change
    # from run to run.
    partition = leidenalg.find_partition(ig_g, leidenalg.ModularityVertexPartition, seed=42)
    # Each element of the partition is a list of igraph vertex indices; map them
    # back to the original NetworkX node names.
    communities = [set(ig_g.vs[idx]['_nx_name'] for idx in members) for members in partition]
    print("Used Leiden algorithm for community detection")
except ImportError:
    print("leidenalg not available, using NetworkX greedy_modularity_communities (may be slow)")
    import networkx.algorithms.community as nx_comm
    communities = nx_comm.greedy_modularity_communities(G_ud)  # Use undirected

# node -> index of the community that contains it
node_to_comm = {}
for i, comm in enumerate(communities):
    for node in comm:
        node_to_comm[node] = i

coherence = {}
for node in G_ud.nodes():  # Use undirected for consistency
    # coherence = fraction of the node's neighbors in the same community
    neighbors = list(G_ud.neighbors(node))
    if neighbors:
        same_comm = sum(1 for n in neighbors if node_to_comm.get(n) == node_to_comm.get(node))
        coherence[node] = same_comm / len(neighbors)
    else:
        # Isolated node: coherence is undefined
        coherence[node] = None
print(f"Computed coherence for {len(coherence)} nodes.")

# Save communities and coherence as parquet.
# The communities file keeps the 'communities_leiden' name even when the
# greedy-modularity fallback produced the labels.
import pandas as pd
from pathlib import Path
OUTDIR = PROC/"kec"
OUTDIR.mkdir(parents=True, exist_ok=True)
comm_df = pd.DataFrame(list(node_to_comm.items()), columns=['node', 'community'])
comm_df.to_parquet(OUTDIR/"communities_leiden.parquet")
coh_df = pd.DataFrame(list(coherence.items()), columns=['node', 'coherence'])
coh_df.to_parquet(OUTDIR/"coherence.parquet")
print(f"Saved communities to {OUTDIR/'communities_leiden.parquet'}")
print(f"Saved coherence to {OUTDIR/'coherence.parquet'}")

leidenalg not available, using NetworkX greedy_modularity_communities (may be slow)
Computed coherence for 7 nodes.
Saved communities to data/processed/kec/communities_leiden.parquet
Saved coherence to data/processed/kec/coherence.parquet


In [6]:
# Assemble the per-node KEC table and write it as CSV and parquet
import pandas as pd
import os

# Output directory (created if missing)
output_dir = str(PROC / 'kec')
os.makedirs(output_dir, exist_ok=True)

# 'coherence' only exists if the coherence cell has been run
has_coherence = 'coherence' in locals()

# One record per node
metrics_data = []
for node in G.nodes():
    # Curvature of the edges (node, neighbor) for each neighbor returned by
    # G.neighbors; an edge with no curvature entry counts as 0
    out_curvatures = [curvature.get((node, nbr), 0) for nbr in G.neighbors(node)]
    if out_curvatures:
        mean_curvature = sum(out_curvatures) / len(out_curvatures)
    else:
        mean_curvature = 0
    metrics_data.append({
        'node': node,
        'entropy': entropy.get(node, None),
        'degree': G.degree(node),
        'coherence': coherence.get(node, None) if has_coherence else None,
        'avg_curvature': mean_curvature,
    })

# Save to CSV
metrics_df = pd.DataFrame(metrics_data)
output_file = os.path.join(output_dir, f'metrics_{LANG}.csv')
metrics_df.to_csv(output_file, index=False)
print(f"Saved KEC metrics for {len(metrics_data)} nodes to {output_file}")
print(f"Sample metrics:\n{metrics_df.head()}")

# Same table as parquet, for QA
from pathlib import Path
OUTDIR = Path(output_dir)
metrics_df.to_parquet(OUTDIR/f"metrics_{LANG}.parquet")
print(f"Saved KEC metrics as parquet to {OUTDIR/f'metrics_{LANG}.parquet'}")

Saved KEC metrics for 7 nodes to data/processed/kec/metrics_en.csv
Sample metrics:
   node  entropy  degree  coherence  avg_curvature
0   cat      1.0       2        0.5            0.0
1  meow      0.0       1        1.0            0.0
2   pet      0.0       2        0.5            0.0
3   dog      1.0       2        1.0            0.0
4  bark      0.0       1        1.0            0.0
Saved KEC metrics as parquet to data/processed/kec/metrics_en.parquet


In [7]:
# Alignment check across the tables written above (safe: failures are reported, never raised)
import pandas as pd, numpy as np
from pathlib import Path
PROC=Path('data/processed')
kec_path = PROC/'kec'/'metrics_en.csv'
ent_path = PROC/'kec'/'entropy.parquet'
coh_path = PROC/'kec'/'coherence.parquet'
comm_path= PROC/'kec'/'communities_leiden.parquet'


def _load_if_present(path, reader):
    """Read `path` with `reader` when the file exists; otherwise return an empty DataFrame."""
    if path.exists():
        return reader(path)
    return pd.DataFrame()


try:
    df_kec = _load_if_present(kec_path, pd.read_csv)
    df_ent = _load_if_present(ent_path, pd.read_parquet)
    df_coh = _load_if_present(coh_path, pd.read_parquet)
    df_com = _load_if_present(comm_path, pd.read_parquet)
    # Union of the node ids seen in the entropy, coherence and community tables
    node_ids = set()
    for table in (df_ent, df_coh, df_com):
        node_ids |= set(table.get('node', pd.Series()))
    n_nodes = int(len(node_ids))
    # Rows in the metrics CSV relative to the number of distinct nodes
    if n_nodes:
        coverage_kec = len(df_kec) / n_nodes
    else:
        coverage_kec = np.nan
    # NaN is the only value that differs from itself; report it as None
    coverage_out = None if coverage_kec != coverage_kec else float(coverage_kec)
    print({'n_nodes': n_nodes, 'kec_rows': int(len(df_kec)), 'coverage_kec': coverage_out})
except Exception as e:
    print('[warn] alignment check failed:', e)


{'n_nodes': 7, 'kec_rows': 7, 'coverage_kec': 1.0}


In [8]:
import networkx as nx, random, pandas as pd
from pathlib import Path
random.seed(42)
# Null model: degree-preserving rewiring of a copy of the undirected graph
assert 'G_ud' in globals(), 'G_ud indisponível — rode as células anteriores.'
G_null = G_ud.copy()
# Number of swaps, bounded for safety on large graphs
nswap = min(5 * G_null.number_of_edges(), 200000)
try:
    G_null = nx.double_edge_swap(G_null, nswap=nswap, max_tries=nswap*5, seed=42)
except Exception as e:
    # double_edge_swap modifies the graph in place, so swaps performed before
    # the failure are kept (hence "partial swap").
    print('Aviso: swap parcial:', e)
# Community detection: prefer leidenalg if available, else NetworkX.
# NOTE(review): this uses a weighted RBConfigurationVertexPartition on the giant
# component, while the real-graph coherence cell uses an unweighted
# ModularityVertexPartition on the whole graph — confirm the two are meant to differ.
node_to_comm_null = {}
try:
    import igraph as ig, leidenalg as la
    edges = [(u, v, G_null[u][v].get('weight',1.0)) for u, v in G_null.edges()]
    g = ig.Graph.TupleList(edges, directed=False, edge_attrs=['weight'])
    g.simplify(combine_edges={'weight':'sum'})
    giant = g.clusters().giant()
    part = la.find_partition(giant, la.RBConfigurationVertexPartition, weights='weight', resolution_parameter=1.0, seed=42)
    node_ids = giant.vs['name']
    membership = part.membership
    node_to_comm_null = dict(zip(node_ids, membership))
except Exception as e:
    print('[warn] leidenalg unavailable; using NetworkX greedy modularity:', e)
    import networkx.algorithms.community as nx_comm
    # Dedicated name: 'communities' holds the real graph's communities from the
    # coherence cell and must not be overwritten by the null model.
    communities_null = nx_comm.greedy_modularity_communities(G_null)
    for i, comm in enumerate(communities_null):
        for n in comm:
            node_to_comm_null[n] = i
# The Leiden branch labels only the giant component. Give every connected
# component that received no label its own community id, so the comparison
# below never depends on None == None. (No-op when all nodes are labeled.)
next_comm_id = max(node_to_comm_null.values(), default=-1) + 1
for component in nx.connected_components(G_null):
    if node_to_comm_null.keys().isdisjoint(component):
        for n in component:
            node_to_comm_null[n] = next_comm_id
        next_comm_id += 1
# Coherence on null graph: fraction of neighbors in the same community (None for isolated nodes)
rows = []
for node in G_null.nodes():
    neigh = list(G_null.neighbors(node))
    if not neigh:
        coh = None
    else:
        same = sum(1 for n in neigh if node_to_comm_null.get(n) == node_to_comm_null.get(node))
        coh = same/len(neigh)
    rows.append((node, coh))
coh_null = pd.DataFrame(rows, columns=['node','coherence_null'])
coh_null.to_parquet(str(PROC/ 'kec' / 'coherence_null.parquet'), index=False)
print('✓ coherence_null.parquet salvo')
# Summary compare: real coherence minus null coherence, over nodes present in both tables
coh_df = pd.read_parquet(str(PROC/ 'kec' / 'coherence.parquet'))
cmp_df = coh_df.merge(coh_null, on='node', how='inner').dropna()
delta = (cmp_df['coherence'] - cmp_df['coherence_null']).describe().to_dict()
print('Δcoherence (real - nulo):', {k: round(v,4) if isinstance(v,(int,float)) else v for k,v in delta.items()})


[warn] leidenalg unavailable; using NetworkX greedy modularity: No module named 'leidenalg'
✓ coherence_null.parquet salvo
Δcoherence (real - nulo): {'count': 7.0, 'mean': 0.0, 'std': 0.0, 'min': 0.0, '25%': 0.0, '50%': 0.0, '75%': 0.0, 'max': 0.0}


In [9]:
# Finalize KEC metrics: ensure curvature alias and token_norm
import pandas as pd
from pathlib import Path
import re


def norm_token(s):
    """Lower-case `s` and remove every run matching [\\W_]+ (non-word characters and underscores).

    Non-string values are returned unchanged.
    """
    if not isinstance(s,str): return s
    s = s.lower()
    s = re.sub(r'[\W_]+','',s)
    return s


# Follow the language code used when the metrics CSV was written
# (metrics_{LANG}.csv); fall back to 'en' when LANG is not defined.
kec_lang = globals().get('LANG', 'en')
kec_path = Path(str(PROC/ 'kec' / f'metrics_{kec_lang}.csv'))
if kec_path.exists():
    # Treat only empty cells as missing. With pandas' default NA list, words
    # such as "null", "nan" or "NA" would be read as NaN and lost.
    df = pd.read_csv(kec_path, keep_default_na=False, na_values=[''])
    df.columns = [c.strip() for c in df.columns]
    # curvature alias
    if 'avg_curvature' in df.columns and 'curvature' not in df.columns:
        df = df.rename(columns={'avg_curvature':'curvature'})
    # token_norm, derived from the first available word column
    src = 'node' if 'node' in df.columns else ('word' if 'word' in df.columns else None)
    if src and 'token_norm' not in df.columns:
        # Missing source values stay missing instead of becoming the token 'nan'
        df['token_norm'] = df[src].where(df[src].isna(), df[src].astype(str).map(norm_token))
    # required columns that are present (the file itself keeps all columns)
    need = [c for c in ['token_norm','entropy','curvature','coherence'] if c in df.columns]
    if need:
        df.to_csv(kec_path, index=False)
        print(f"[OK] Rewrote {kec_path} with columns: {need}")
    else:
        print("[WARN] Required KEC columns missing; file left unchanged.")
else:
    print(f"[WARN] Missing {kec_path}; run Notebook 02 cells above to generate it.")

[OK] Rewrote data/processed/kec/metrics_en.csv with columns: ['token_norm', 'entropy', 'curvature', 'coherence']


In [10]:
# v4.3 finalization: enforce KEC output schema
from pathlib import Path
import pandas as pd, re

# Columns of the final KEC table, in output order
need=['token_norm','entropy','curvature','coherence']


def norm_token(s):
    """Lower-case `s` and remove every run matching [\\W_]+ (non-word characters and underscores).

    Non-string values are returned unchanged.
    """
    if not isinstance(s,str): return s
    s=s.lower(); s=re.sub(r'[\W_]+','',s); return s


def enforce_kec_schema(path, columns=None):
    """Rewrite the CSV at `path` so that it contains exactly the KEC schema columns.

    Steps: rename 'avg_curvature' to 'curvature' when needed; derive
    'token_norm' from the first of 'node' / 'word' / 'Word' that exists when
    it is absent; add any still-missing schema column filled with pd.NA; keep
    only the schema columns, in order; write the file back without the index.

    Args:
        path: path of the CSV file to rewrite in place.
        columns: schema column names; defaults to the module-level `need`.

    Returns:
        The DataFrame that was written.
    """
    columns = list(need if columns is None else columns)
    # Treat only empty cells as missing. With pandas' default NA list, words
    # such as "null", "nan" or "NA" would be read as NaN and lost.
    df=pd.read_csv(path, keep_default_na=False, na_values=[''])
    if 'curvature' not in df.columns and 'avg_curvature' in df.columns:
        df=df.rename(columns={'avg_curvature':'curvature'})
    if 'token_norm' not in df.columns:
        src=next((c for c in ('node','word','Word') if c in df.columns), None)
        if src:
            # Missing source values stay missing instead of becoming the token 'nan'
            df['token_norm']=df[src].where(df[src].isna(), df[src].astype(str).map(norm_token))
    for c in columns:
        if c not in df.columns:
            df[c]=pd.NA
    df=df[columns]
    df.to_csv(path, index=False)
    return df


PROC=Path('data/processed'); KEC=PROC/'kec'; KEC.mkdir(parents=True, exist_ok=True)
path=KEC/'metrics_en.csv'
if path.exists():
    df=enforce_kec_schema(path)
    print('[v4.3] Rewrote metrics_en.csv with columns', need)
else:
    print('[v4.3] KEC file not found; skip schema finalization')

[v4.3] Rewrote metrics_en.csv with columns ['token_norm', 'entropy', 'curvature', 'coherence']
