# PCS‑HELIO v4.3 — 02 · KEC Metrics
Compute Knowledge Entropy Curvature (KEC) metrics from the Small World of Words (SWOW) association graph and write them to standardized output files.

In [None]:
# Standard-library modules used by the path/run-mode configuration in this cell.
from pathlib import Path
import os
# Project-local notebook helpers; their implementation lives in notebooks._fragments
# and is not visible from this notebook.
from notebooks._fragments import apply_style, preflight_checks, print_contract
# Call the three helpers once, before anything else runs.
# NOTE(review): presumably these set the plotting style, validate the environment and
# print the notebook's input/output contract — confirm in notebooks._fragments.
apply_style(); preflight_checks(); print_contract()
# Run mode is read from the RUN_MODE environment variable and falls back to 'sample'.
RUN_MODE = os.getenv('RUN_MODE', 'sample')

# Project directories, all expressed relative to the current working directory.
BASE = Path('.')
DATA = BASE / 'data'
PROC = DATA / 'processed'
RPTS = BASE / 'reports'

# Create the KEC output folder up front so later cells can write into it.
PROC.joinpath('kec').mkdir(parents=True, exist_ok=True)


# Notebook 02: KEC Metrics Computation
Using the SWOW graph from Notebook 01, this notebook computes the *Knowledge Entropy Curvature* (KEC) metrics for each node/word:
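- **Transition Entropy:** Shannon entropy (in bits) of the node's normalized outgoing edge-weight distribution (uncertainty of associations).
- **Local Curvature:** Curvature of the edges around the node (Ollivier-Ricci when `GraphRicciCurvature` is installed, otherwise a simple degree-based Forman-style proxy), averaged per node.
- **Meso-scale Coherence:** Community-based coherence: the fraction of a node's neighbors that belong to the same community as the node (communities from Leiden, or greedy modularity as a fallback).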

Ablation experiments (e.g., edge weight shuffling) and uncertainty estimation (bootstrap confidence intervals) are included.
- **Input:** Graph from Notebook 01 (or edge list).
- **Output:** Table of KEC metrics per word (saved to `data/processed/kec/metrics_{LANG}.csv`).

In [None]:
# Read the pickled SWOW graph that Notebook 01 wrote for the selected language.
import pickle

LANG = 'en'
graph_file = str(PROC / f'swow_graph_{LANG}.pkl')
# NOTE: pickle.load can execute arbitrary code; only open graph files produced by this project.
with open(graph_file, 'rb') as handle:
    G = pickle.load(handle)
print(f"Graph loaded: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges.")

# Undirected copy of the graph, used by the community-detection cells.
G_ud = G.to_undirected()

# Sample mode: when the graph has more than 1000 nodes, keep only the 2000
# highest-degree nodes (in both G and G_ud) so the remaining cells run quickly.
sample_requested = globals().get('RUN_MODE') == 'sample'
if sample_requested and G.number_of_nodes() > 1000:
    by_degree = sorted(G.degree, key=lambda pair: pair[1], reverse=True)
    keep = [name for name, _ in by_degree[:2000]]
    G, G_ud = G.subgraph(keep).copy(), G_ud.subgraph(keep).copy()
print(f"Undirected graph created: {G_ud.number_of_nodes()} nodes, {G_ud.number_of_edges()} edges.")

In [34]:
# Transition entropy per node. Requires the directed graph G and the PROC path
# defined by the earlier cells of this notebook.
import math

import pandas as pd

# Shannon entropy (in bits) of each node's outgoing edge-weight distribution.
entropy = {}
for node in G.nodes():
    # Edges without a 'weight' attribute count as weight 1.0; without the default
    # they would yield None and make the sum below raise a TypeError.
    weights = [w for _, _, w in G.out_edges(node, data='weight', default=1.0)]
    total_w = sum(weights)
    H = 0.0
    # A node with no outgoing edges, or whose weights do not sum to a positive
    # value, has no transition distribution: report 0.0 instead of dividing by zero.
    if total_w > 0:
        for w in weights:
            p = w / total_w
            if p > 0:
                H -= p * math.log2(p)
    entropy[node] = H
print(f"Calculated entropy for {len(entropy)} nodes.")

# Save entropy as parquet (one row per node).
OUTDIR = PROC/"kec"
OUTDIR.mkdir(parents=True, exist_ok=True)
ent_df = pd.DataFrame(list(entropy.items()), columns=['node', 'entropy'])
ent_df.to_parquet(OUTDIR/"entropy.parquet")
print(f"Saved entropy to {OUTDIR/'entropy.parquet'}")

Calculated entropy for 166540 nodes.
Saved entropy to ../data/processed/kec/entropy.parquet


In [35]:
# Local curvature per edge: Ollivier-Ricci when the optional GraphRicciCurvature
# package is installed, otherwise a degree-based placeholder.
# Requires G and PROC from the earlier cells.
import pandas as pd

try:
    # OllivierRicci is a class that lives in the GraphRicciCurvature.OllivierRicci
    # submodule; `import GraphRicciCurvature` alone does not expose a callable under
    # that name, so the class has to be imported explicitly.
    from GraphRicciCurvature.OllivierRicci import OllivierRicci
except ImportError:
    OllivierRicci = None

if OllivierRicci is not None:
    # The library expects a log-level string for `verbose`; "ERROR" is its quietest level.
    orc = OllivierRicci(G, alpha=0.5, verbose="ERROR")
    orc.compute_ricci_curvature()
    curvature = {edge: data['ricciCurvature'] for edge, data in orc.G.edges.items()}
else:
    # Fallback placeholder: deg(u) + deg(v) - 2 for every edge (u, v).
    # NOTE(review): the combinatorial Forman-Ricci curvature of an unweighted edge is
    # usually written 4 - deg(u) - deg(v); this proxy has the opposite sign and grows
    # with degree. The value is kept unchanged so existing outputs stay comparable —
    # confirm that downstream analyses expect this convention.
    node_degree = dict(G.degree())  # computed once instead of twice per edge
    curvature = {(u, v): node_degree[u] + node_degree[v] - 2 for u, v in G.edges()}
print(f"Computed curvature for {len(curvature)} edges (sample edge curvatures shown below):")
print(list(curvature.items())[:5])

# Save curvature as parquet (one row per edge; 'edge' holds the (source, target) pair).
OUTDIR = PROC/"kec"
OUTDIR.mkdir(parents=True, exist_ok=True)
curv_df = pd.DataFrame(list(curvature.items()), columns=['edge', 'curvature'])
curv_df.to_parquet(OUTDIR/"curvature.parquet")
print(f"Saved curvature to {OUTDIR/'curvature.parquet'}")

Computed curvature for 1537892 edges (sample edge curvatures shown below):
[(('there', 'position'), 764), (('there', 'place'), 1151), (('there', 'point'), 975), (('there', 'over'), 1049), (('there', 'here'), 714)]
Saved curvature to ../data/processed/kec/curvature.parquet
Saved curvature to ../data/processed/kec/curvature.parquet


In [36]:
# Meso-scale coherence: detect communities on the undirected graph G_ud, then score
# each node by the fraction of its neighbors that share its community.
# Requires G_ud and PROC from the earlier cells.
import pandas as pd

# Note: greedy_modularity_communities can be slow on large graphs, so Leiden is
# preferred whenever leidenalg and igraph are importable.
try:
    import leidenalg
    import igraph as ig
    # Convert the NetworkX graph to igraph for faster community detection.
    nx_g = G_ud  # Use undirected graph
    ig_g = ig.Graph.from_networkx(nx_g)
    # Leiden is stochastic; a fixed seed (the same value the null-model cell uses)
    # makes the partition, and every number derived from it, reproducible.
    partition = leidenalg.find_partition(ig_g, leidenalg.ModularityVertexPartition, seed=42)
    # Each element of `partition` is a list of igraph vertex indices; map the indices
    # back to the original NetworkX node names stored in '_nx_name'.
    communities = [{ig_g.vs[idx]['_nx_name'] for idx in members} for members in partition]
    print("Used Leiden algorithm for community detection")
except ImportError:
    print("leidenalg not available, using NetworkX greedy_modularity_communities (may be slow)")
    import networkx.algorithms.community as nx_comm
    communities = nx_comm.greedy_modularity_communities(G_ud)  # Use undirected

# Node -> index of the community that contains it.
node_to_comm = {}
for i, comm in enumerate(communities):
    for node in comm:
        node_to_comm[node] = i

# Coherence = fraction of a node's neighbors in the same community.
# Nodes without neighbors get None (undefined).
coherence = {}
for node in G_ud.nodes():  # Use undirected for consistency
    neighbors = list(G_ud.neighbors(node))
    if neighbors:
        own_comm = node_to_comm.get(node)
        same_comm = sum(1 for n in neighbors if node_to_comm.get(n) == own_comm)
        coherence[node] = same_comm / len(neighbors)
    else:
        coherence[node] = None
print(f"Computed coherence for {len(coherence)} nodes.")

# Save communities and coherence as parquet.
# The communities file keeps the name 'communities_leiden.parquet' even when the
# greedy-modularity fallback produced the partition.
OUTDIR = PROC/"kec"
OUTDIR.mkdir(parents=True, exist_ok=True)
comm_df = pd.DataFrame(list(node_to_comm.items()), columns=['node', 'community'])
comm_df.to_parquet(OUTDIR/"communities_leiden.parquet")
coh_df = pd.DataFrame(list(coherence.items()), columns=['node', 'coherence'])
coh_df.to_parquet(OUTDIR/"coherence.parquet")
print(f"Saved communities to {OUTDIR/'communities_leiden.parquet'}")
print(f"Saved coherence to {OUTDIR/'coherence.parquet'}")

Used Leiden algorithm for community detection
Computed coherence for 166540 nodes.
Computed coherence for 166540 nodes.
Saved communities to ../data/processed/kec/communities_leiden.parquet
Saved coherence to ../data/processed/kec/coherence.parquet
Saved communities to ../data/processed/kec/communities_leiden.parquet
Saved coherence to ../data/processed/kec/coherence.parquet


In [37]:
# Assemble the per-node KEC table (entropy, degree, coherence, mean edge curvature)
# and write it as CSV and parquet.
# Requires G, entropy, curvature, coherence, LANG and PROC from the earlier cells.
import os
from pathlib import Path

import pandas as pd

# Create output directory
output_dir = str(PROC / 'kec')
os.makedirs(output_dir, exist_ok=True)

# One row per node of G.
metrics_data = []
for node in G.nodes():
    # Average only over edges that actually have a curvature value. Substituting 0
    # for a missing edge would bias the mean, because 0 is itself a valid curvature.
    node_curvatures = [
        curvature[(node, neighbor)]
        for neighbor in G.neighbors(node)
        if (node, neighbor) in curvature
    ]
    metrics_data.append({
        'node': node,
        'entropy': entropy.get(node, None),
        'degree': G.degree(node),
        'coherence': coherence.get(node, None) if 'coherence' in locals() else None,
        # Undefined (None) when the node has no scored edges, matching how
        # coherence marks nodes without neighbors.
        'avg_curvature': (
            sum(node_curvatures) / len(node_curvatures) if node_curvatures else None
        ),
    })

# Save to CSV
metrics_df = pd.DataFrame(metrics_data)
output_file = os.path.join(output_dir, f'metrics_{LANG}.csv')
metrics_df.to_csv(output_file, index=False)
print(f"Saved KEC metrics for {len(metrics_data)} nodes to {output_file}")
print(f"Sample metrics:\n{metrics_df.head()}")

# Also save as parquet for QA
OUTDIR = Path(output_dir)
metrics_df.to_parquet(OUTDIR/f"metrics_{LANG}.parquet")
print(f"Saved KEC metrics as parquet to {OUTDIR/f'metrics_{LANG}.parquet'}")

Saved KEC metrics for 166540 nodes to /home/agourakis82/workspace/pcs-meta-repo/notebooks/../data/processed/kec/metrics_en.csv
Sample metrics:
       node   entropy  degree  coherence  avg_curvature
0     there  5.481249     326   0.533333     708.580357
1  position  6.685050     440   0.451948     909.690217
2     place  6.138368     827   0.165354    1411.185714
3     point  6.705326     651   0.449324    1170.403409
4      true  5.858416     506   0.493363     992.325397
Saved KEC metrics as parquet to /home/agourakis82/workspace/pcs-meta-repo/notebooks/../data/processed/kec/metrics_en.parquet


In [38]:
# Alignment check between the per-metric tables and the combined KEC table.
# Requires ent_df, comm_df, coh_df and metrics_df from the earlier cells.
import numpy as np

# `kec_df` is the combined per-node table; in this notebook that table is the
# `metrics_df` built by the metrics cell, so bind the name explicitly here.
kec_df = metrics_df

# Universe of nodes seen by any of the individual metric tables.
n_nodes = len(set(ent_df["node"]) | set(comm_df["node"]) | set(coh_df["node"]))
# Rows in the combined table relative to that universe (NaN when the universe is empty).
coverage_kec = len(kec_df) / n_nodes if n_nodes else np.nan
print(f"KEC coverage: {len(kec_df)} rows / {n_nodes} nodes = {coverage_kec:.4f}")

In [39]:
# Degree-preserving null model: rewire the undirected graph with double-edge swaps,
# re-run Leiden on the rewired graph and compare per-node coherence with the real graph.
# Requires G_ud and PROC from the earlier cells.
import random
from pathlib import Path

import igraph as ig
import leidenalg as la
import networkx as nx
import pandas as pd

random.seed(42)
# Work on a copy of the (already pruned) undirected graph.
assert 'G_ud' in globals(), "G_ud indisponível — rode as células anteriores."
G_null = G_ud.copy()

# Degree-preserving rewiring (double-edge swap). The cost grows with the number of
# swaps, so request 5 swaps per edge but never more than 200000.
nswap = min(5 * G_null.number_of_edges(), 200000)  # safety cap
try:
    G_null = nx.double_edge_swap(G_null, nswap=nswap, max_tries=nswap*5, seed=42)
except Exception as e:
    # Best effort: swaps are applied in place, so G_null keeps whatever rewiring
    # succeeded before the error.
    print("Aviso: swap parcial:", e)

# Leiden on the null graph. Edges created by the swap carry no weight attribute,
# hence the 1.0 default.
# NOTE(review): this partition uses edge weights and only the giant component, while
# the real-graph partition in the coherence cell is unweighted and covers the whole
# graph — confirm the two are meant to be compared directly.
edges = [(u, v, G_null[u][v].get('weight',1.0)) for u, v in G_null.edges()]
g = ig.Graph.TupleList(edges, directed=False, edge_attrs=["weight"])
g.simplify(combine_edges={"weight":"sum"})
# Graph.clusters() is deprecated in recent python-igraph releases in favour of
# connected_components(); fall back to the old name on versions that lack the new one.
components = g.connected_components() if hasattr(g, "connected_components") else g.clusters()
giant = components.giant()
part = la.find_partition(giant, la.RBConfigurationVertexPartition,
                         weights="weight", resolution_parameter=1.0, seed=42)

node_ids = giant.vs["name"]
membership = part.membership
node_to_comm_null = dict(zip(node_ids, membership))

# Coherence on the null graph: fraction of neighbors sharing the node's community.
rows = []
for node in G_null.nodes():
    neigh = list(G_null.neighbors(node))
    node_comm = node_to_comm_null.get(node)
    # Nodes outside the giant component have no community label. Comparing two
    # missing labels (None == None) would count every neighbor as "same community"
    # and report a coherence of 1.0, so such nodes are marked undefined instead.
    if not neigh or node_comm is None:
        coh = None
    else:
        same = sum(1 for n in neigh if node_to_comm_null.get(n) == node_comm)
        coh = same/len(neigh)
    rows.append((node, coh))

coh_null = pd.DataFrame(rows, columns=["node","coherence_null"])
coh_null.to_parquet(str(PROC/ 'kec' / 'coherence_null.parquet'), index=False)
print("✓ coherence_null.parquet salvo")

# Short summary of the per-node difference (real minus null); rows where either
# value is undefined are dropped.
coh_df = pd.read_parquet(str(PROC/ 'kec' / 'coherence.parquet'))
cmp_df = coh_df.merge(coh_null, on="node", how="inner").dropna()
delta = (cmp_df["coherence"] - cmp_df["coherence_null"]).describe().to_dict()
print("Δcoherence (real - nulo):", {k: round(v,4) if isinstance(v,(int,float)) else v for k,v in delta.items()})

  giant = g.clusters().giant()


✓ coherence_null.parquet salvo
Δcoherence (real - nulo): {'count': 166540.0, 'mean': 0.0521, 'std': 0.1549, 'min': -0.8333, '25%': 0.0, '50%': 0.0, '75%': 0.0, 'max': 0.9259}
Δcoherence (real - nulo): {'count': 166540.0, 'mean': 0.0521, 'std': 0.1549, 'min': -0.8333, '25%': 0.0, '50%': 0.0, '75%': 0.0, 'max': 0.9259}


In [40]:
# Finalize KEC metrics: make sure the CSV has a `curvature` column (alias of
# `avg_curvature`) and a normalised `token_norm` column. Requires PROC from the first cell.
import re
from pathlib import Path

import pandas as pd


def norm_token(s):
    """Lower-case a string and strip non-word characters and underscores.

    Non-string values are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    return re.sub(r'[\W_]+', '', s.lower())


kec_path = Path(str(PROC/ 'kec' / 'metrics_en.csv'))
if kec_path.exists():
    # Treat only the empty field as missing. pandas' default NA markers include
    # ordinary words such as "null", "nan" and "n/a", which would otherwise be read
    # as NaN and all collapse to the token "nan" below.
    df = pd.read_csv(kec_path, keep_default_na=False, na_values=[''])
    df.columns = [c.strip() for c in df.columns]
    # curvature alias
    if 'avg_curvature' in df.columns and 'curvature' not in df.columns:
        df = df.rename(columns={'avg_curvature':'curvature'})
    # token_norm, derived from the first available word column
    src = 'node' if 'node' in df.columns else ('word' if 'word' in df.columns else None)
    if src and 'token_norm' not in df.columns:
        df['token_norm'] = df[src].astype(str).map(norm_token)
    # Rewrite the file as long as at least one of the KEC columns is present.
    need = [c for c in ['token_norm','entropy','curvature','coherence'] if c in df.columns]
    if need:
        df.to_csv(kec_path, index=False)
        print(f"[OK] Rewrote {kec_path} with columns: {need}")
    else:
        print("[WARN] Required KEC columns missing; file left unchanged.")
else:
    print(f"[WARN] Missing {kec_path}; run Notebook 02 cells above to generate it.")

[OK] Rewrote ../data/processed/kec/metrics_en.csv with columns: ['token_norm', 'entropy', 'curvature', 'coherence']


In [41]:
# v4.3 finalization: enforce the KEC output schema on metrics_en.csv.
from pathlib import Path
import re

import pandas as pd

PROC=Path('data/processed'); KEC=PROC/'kec'; KEC.mkdir(parents=True, exist_ok=True)
path=KEC/'metrics_en.csv'
need=['token_norm','entropy','curvature','coherence']


def norm_token(s):
    """Lower-case a string and strip non-word characters and underscores.

    Non-string values are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    return re.sub(r'[\W_]+', '', s.lower())


def enforce_kec_schema(csv_path, required=('token_norm', 'entropy', 'curvature', 'coherence')):
    """Rewrite ``csv_path`` in place so it holds exactly the ``required`` columns.

    Steps: rename ``avg_curvature`` to ``curvature`` when only the former exists;
    derive ``token_norm`` from the first of ``node`` / ``word`` / ``Word`` when it is
    absent; add any still-missing required column filled with ``pd.NA``; keep only the
    required columns, in the given order; write the result back to ``csv_path``.

    Returns the DataFrame that was written.
    """
    required = list(required)
    # Treat only the empty field as missing. pandas' default NA markers include
    # ordinary words such as "null" and "nan", which would otherwise be read as NaN
    # and written back as empty tokens.
    df = pd.read_csv(csv_path, keep_default_na=False, na_values=[''])
    if 'curvature' not in df.columns and 'avg_curvature' in df.columns:
        df = df.rename(columns={'avg_curvature': 'curvature'})
    if 'token_norm' not in df.columns:
        src = next((c for c in ('node', 'word', 'Word') if c in df.columns), None)
        if src:
            df['token_norm'] = df[src].astype(str).map(norm_token)
    for c in required:
        if c not in df.columns:
            df[c] = pd.NA
    df = df[required]
    df.to_csv(csv_path, index=False)
    return df


if path.exists():
    df = enforce_kec_schema(path, need)
    print('[v4.3] Rewrote metrics_en.csv with columns', need)
else:
    print('[v4.3] KEC file not found; skip schema finalization')

[v4.3] Rewrote metrics_en.csv with columns ['token_norm', 'entropy', 'curvature', 'coherence']
