# 01 — Build SWOW Graph
Ingest SWOW CSV → build igraph; save quick QA.

In [6]:
# Repo bootstrap: ensure `src` is on sys.path and standard dirs exist
import sys
from pathlib import Path
ROOT = Path.cwd()
if not (ROOT / 'src').exists() and (ROOT.parent / 'src').exists():
    ROOT = ROOT.parent
if str(ROOT / 'src') not in sys.path:
    sys.path.insert(0, str(ROOT / 'src'))
for p in (ROOT/'data/processed', ROOT/'reports', ROOT/'figures/metrics'):
    p.mkdir(parents=True, exist_ok=True)
print('[bootstrap] ROOT=', ROOT)

[bootstrap] ROOT= /home/agourakis82/workspace/pcs-meta-repo


In [9]:
import os, json, re, time
from pathlib import Path
import pandas as pd, numpy as np

RAW_DIR = (ROOT / 'data/raw_public/swow')
PROC_DIR = (ROOT / 'data/processed') ; PROC_DIR.mkdir(parents=True, exist_ok=True)
RPT_DIR = (ROOT / 'reports'); RPT_DIR.mkdir(parents=True, exist_ok=True)
FIG_DIR = (ROOT / 'figures/metrics'); FIG_DIR.mkdir(parents=True, exist_ok=True)

def heartbeat(m): print(f'[01] {m}')
heartbeat('Env ready')

[01] Env ready


In [10]:
# Locate a SWOW CSV (en) under data/raw_public/swow
cands = list(RAW_DIR.rglob('*.csv'))
if not cands:
    heartbeat('No SWOW CSV found; skipping build')
    edges = pd.DataFrame()
else:
    # prefer files with 'EN' or 'english' in name
    cands_sorted = sorted(cands, key=lambda p: (not re.search(r'(?i)en|english', p.name), len(p.name)))
    swow_path = cands_sorted[0]
    edges = pd.read_csv(swow_path)
    heartbeat(f'Loaded SWOW: {swow_path} rows={len(edges)}')
# Map columns to cue/response
if len(edges)>0:
    cols = {c.lower(): c for c in edges.columns}
    cue = cols.get('cue') or cols.get('word') or cols.get('source')
    resp = cols.get('response') or cols.get('target') or cols.get('associate')
    if not cue or not resp:
        heartbeat('Cannot detect cue/response columns; clearing edges')
        edges = pd.DataFrame()
    else:
        edges = edges.rename(columns={cue:'cue', resp:'response'})[['cue','response']].dropna()
heartbeat(f'Edges normalized: {edges.shape if len(edges)>0 else None}')

[01] Loaded SWOW: /home/agourakis82/workspace/pcs-meta-repo/data/raw_public/swow/en/SWOW-EN.R100.20180827.csv rows=1228200
[01] Cannot detect cue/response columns; clearing edges
[01] Edges normalized: None


In [4]:
# Quick QA and save edge list
qa = {}
if len(edges)>0:
    out_edges = PROC_DIR / 'swow_edges.csv'
    edges.to_csv(out_edges, index=False)
    qa = {
        'rows': int(len(edges)),
        'unique_nodes': int(pd.unique(pd.concat([edges['cue'], edges['response']])).size),
        'edge_csv': str(out_edges)
    }
else:
    qa = {'rows': 0, 'unique_nodes': 0, 'edge_csv': None}
(RPT_DIR/'qa_swow.json').write_text(json.dumps(qa, indent=2))
heartbeat('Saved QA for SWOW')

[01] Saved QA for SWOW
