# PCS‑HELIO v4.3 — 01 · Build SWOW Graph
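Ingest the SWOW (Small World of Words) cue–response CSV from `data/raw_public/swow`, build a directed graph weighted by cue→response counts, and save the tidy edge list, the pickled graph, and a QA manifest. If no CSV is found, a small synthetic edge list is used instead.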

In [1]:
from pathlib import Path; import json, re, os, sys
import pandas as pd, numpy as np
# Locate the repo root once and use it for BOTH the 'notebooks._fragments' import and the
# data/report paths, so the notebook behaves the same whether the kernel starts in the
# repo root or inside notebooks/. (Previously only sys.path was adjusted; the data paths
# stayed CWD-relative and pointed at notebooks/data when launched from notebooks/.)
ROOT = Path.cwd()
_fragments_rel = Path('notebooks')/'_fragments.py'
if (ROOT/_fragments_rel).exists():
    _repo_root, BASE = ROOT, Path('.')
elif (ROOT.parent/_fragments_rel).exists():
    _repo_root, BASE = ROOT.parent, Path('..')
else:
    # Repo layout not recognised: leave sys.path alone and keep CWD-relative paths.
    _repo_root, BASE = None, Path('.')
# Guarded insert keeps the cell idempotent: re-running it does not stack duplicate entries.
if _repo_root is not None and str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))

try:
    from notebooks._fragments import apply_style, preflight_checks, print_contract, save_manifest
except Exception as e:
    # Deliberate best-effort: the helpers are optional, so report the failure and fall
    # back to local stand-ins so the rest of the notebook can still run.
    print('[preflight] Failed importing notebooks._fragments:', e)

    def apply_style():
        """No-op stand-in used when notebooks._fragments cannot be imported."""

    def preflight_checks():
        """No-op stand-in used when notebooks._fragments cannot be imported."""

    def print_contract():
        """No-op stand-in used when notebooks._fragments cannot be imported."""

    def save_manifest(path, payload):
        """Write `payload` as indented JSON to `path`, creating parent directories."""
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(payload, indent=2))

apply_style()
preflight_checks()
print_contract()

# Run configuration: RUN_MODE comes from the environment and defaults to 'sample'.
RUN_MODE = os.environ.get('RUN_MODE', 'sample')

# Project directories, all derived from BASE.
DATA = BASE/'data'
RAW = DATA/'raw_public'/'swow'
PROC = DATA/'processed'
RPTS = BASE/'reports'
FIG = BASE/'figures'/'metrics'

# Create the output folders this notebook writes to.
(PROC/'swow').mkdir(parents=True, exist_ok=True)
RPTS.mkdir(parents=True, exist_ok=True)
print('[01] Env ready — RUN_MODE=', RUN_MODE)


[STYLE] _style.css not found; proceeding.
[Preflight] Python: 3.12.11 | Platform: Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.35
[Preflight] pandas: 2.3.2 | numpy: 1.26.4
[Preflight] Folders ready.


[01] Env ready — RUN_MODE= full


In [2]:
# Locate a SWOW CSV (en) under data/raw_public/swow, load it, and normalise it to a
# two-column cue/response edge list that is saved as the tidy CSV.
SAMPLE_NROWS = 20000      # rows read from the CSV in sample mode (keeps the run fast)
SAMPLE_MAX_EDGES = 50000  # cap on the number of edges kept in sample mode
SAMPLE_SEED = 1234        # fixed seed so the sample-mode downsample is reproducible

# 'en' / 'english' must appear as its own token in the file name; a bare substring
# search would also prefer names such as 'french.csv' or 'strength.csv'.
_EN_TOKEN = re.compile(r'(?i)(?<![a-z])(?:en|english)(?![a-z])')

cands = list(RAW.rglob('*.csv'))
if not cands:
    # Report the actual run mode: this fallback is taken in every mode, not only 'sample'.
    print(f'[01] No SWOW CSV found under {RAW}; creating synthetic sample (RUN_MODE={RUN_MODE}).')
    if RUN_MODE != 'sample':
        print('[01] WARNING: downstream outputs will be built from synthetic data, not SWOW.')
    edges = pd.DataFrame({'cue':['cat','cat','dog','dog','music'], 'response':['pet','meow','pet','bark','sound']})
else:
    # Prefer English-tagged files, then shorter names; the full path is the final
    # tie-break so the choice does not depend on filesystem listing order.
    cands_sorted = sorted(cands, key=lambda p: (not _EN_TOKEN.search(p.name), len(p.name), str(p)))
    swow_path = cands_sorted[0]
    # In sample mode, read a limited chunk to keep fast (nrows=None reads the whole file).
    edges = pd.read_csv(swow_path, nrows=SAMPLE_NROWS if RUN_MODE == 'sample' else None)
    print(f'[01] Loaded SWOW: {swow_path} rows={len(edges)}')

# Map columns to cue/response (case-insensitive, with a few accepted aliases).
cols = {str(c).lower(): c for c in edges.columns}
cue = cols.get('cue') or cols.get('word') or cols.get('source')
resp = cols.get('response') or cols.get('target') or cols.get('associate')
if not (cue and resp):
    # Fail here with a clear message rather than writing a non-tidy file and hitting a
    # KeyError on 'cue'/'response' in the graph-building cell.
    raise ValueError(f'[01] Could not find cue/response columns; got columns: {list(edges.columns)}')
edges = edges.rename(columns={cue:'cue', resp:'response'})[['cue','response']].dropna()

# In sample mode, optionally downsample edges further
if RUN_MODE == 'sample' and len(edges) > SAMPLE_MAX_EDGES:
    edges = edges.sample(n=SAMPLE_MAX_EDGES, random_state=SAMPLE_SEED)
print('[01] Edges normalized:', edges.shape)

# Save tidy edge list
tidy_path = PROC/'swow'/'swow_tidy.csv'
edges.to_csv(tidy_path, index=False)
print('[01] Wrote', tidy_path)


[01] No SWOW CSV found; creating synthetic sample (sample mode).
[01] Edges normalized: (5, 2)
[01] Wrote data/processed/swow/swow_tidy.csv


In [3]:
# Build and persist a simple directed weighted graph (counts as weights)
import pickle
import networkx as nx

G = nx.DiGraph()
if not edges.empty:
    # One edge per distinct (cue, response) pair; the weight is how often the pair occurs.
    pair_counts = edges.groupby(['cue','response']).size()
    G.add_weighted_edges_from(
        (cue_node, resp_node, float(n)) for (cue_node, resp_node), n in pair_counts.items()
    )

graph_path = PROC/'swow_graph_en.pkl'
# Context manager guarantees the file handle is flushed and closed after the dump.
with open(graph_path, 'wb') as fh:
    pickle.dump(G, fh)

# QA manifest: row count, number of distinct nodes, and the artefact locations.
qa = {
    'rows': int(len(edges)),
    'unique_nodes': int(pd.unique(pd.concat([edges['cue'], edges['response']])).size),
    'edge_csv': str(tidy_path),
    'graph_pkl': str(graph_path),
}
# The same payload is written twice: directly as qa_swow.json and via save_manifest.
(RPTS/'qa_swow.json').write_text(json.dumps(qa, indent=2))
save_manifest(RPTS/'swow_manifest.json', qa)
print('[01] Saved QA and graph')


[MANIFEST] Wrote reports/swow_manifest.json
[01] Saved QA and graph
