# PCS-HELIO v4.3 — Standard environment and helpers
This notebook follows the v4.3 conventions (paths, token normalization, and reproducible outputs).

In [None]:
import time, re
from pathlib import Path
import pandas as pd

def heartbeat(m):
    """Print `m` on one line, prefixed with the current local time as ``[HH:MM:SS]``."""
    stamp = time.strftime('%H:%M:%S')
    print("[{}] {}".format(stamp, m))

def norm_token(s):
    """Normalize a token for matching.

    Strings are lower-cased and every run of non-word characters and
    underscores is removed. Any non-string value (e.g. ``None`` or a float
    NaN) is returned unchanged.
    """
    if isinstance(s, str):
        return re.sub(r"[\W_]+", "", s.lower())
    return s

# Project directories, resolved relative to the notebook's working directory.
RAW_DIR = Path('../data/raw_public')
PROC_DIR = Path('../data/processed')
RPT_DIR = Path('reports')

# Only the output locations are created here; RAW_DIR is defined but not created.
PROC_DIR.mkdir(parents=True, exist_ok=True)
RPT_DIR.mkdir(parents=True, exist_ok=True)

heartbeat('Env ready (v4.3)')

[21:08:00] Env ready (v4.3)


In [None]:
# Environment & seeds (reproducibility)
import sys, platform, time, random
import numpy as np
import pandas as pd

# Fix the seeds of both global RNGs, then report the runtime environment.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

print(
    f"[env] Python: {sys.version.split()[0]} | platform: {platform.platform()}",
    f"[env] numpy: {np.__version__} | pandas: {pd.__version__}",
    f"[env] seed: {SEED} at {time.strftime('%Y-%m-%d %H:%M:%S')}",
    sep="\n",
)

[env] Python: 3.11.13 | platform: Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.35
[env] numpy: 2.3.2 | pandas: 2.3.2
[env] seed: 42 at 2025-09-01 21:08:00


# Notebook 01: SWOW Loader
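Carrega os dados do *Small World of Words* (SWOW-EN) a partir de um CSV (lido em blocos) ou de um ZIP, grava uma cópia em `data/processed/swow/en` e registra a proveniência com o hash SHA-256.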

In [None]:
"""Run-All cell"""
# Seed both global RNGs with the same fixed value before anything else runs.
import random
import numpy as np

np.random.seed(42)
random.seed(42)

In [None]:
from pathlib import Path
import pandas as pd
import hashlib
import yaml

In [None]:
from pathlib import Path
import pandas as pd
import hashlib
import yaml
import zipfile

# Candidate locations for the SWOW-EN source, checked in order; a plain CSV
# takes precedence over a zip archive.
raw_candidates = [
    Path("../data/raw_public/swow/en/swow_en.csv"),
    Path("../data/raw_public/swow/swow_en.csv"),
    Path("../data/raw_public/swow_en.csv"),
]
zip_candidates = [
    Path("../data/raw_public/swow/en/swow_en.zip"),
    Path("../data/raw_public/swow/swow_en.zip"),
    Path("../data/raw_public/swow_en.zip"),
]
processed_dir = Path("../data/processed/swow/en")
processed_dir.mkdir(parents=True, exist_ok=True)


def _first_existing(candidates):
    """Return the first path in `candidates` that exists on disk, else None."""
    return next((path for path in candidates if path.exists()), None)


def _pick_inner_csv(member_names):
    """Return the first usable ``.csv`` member name from a zip listing, else None.

    macOS archive metadata entries (anything under ``__MACOSX/`` or whose base
    name starts with ``._``) are skipped because they carry a ``.csv`` suffix
    without being CSV data. The suffix check is case-insensitive.
    """
    for name in member_names:
        base_name = name.rsplit("/", 1)[-1]
        if name.startswith("__MACOSX/") or base_name.startswith("._"):
            continue
        if name.lower().endswith(".csv"):
            return name
    return None


def _sha256_of_file(path, block_size=1 << 20):
    """Return the hex SHA-256 of `path`, reading it in fixed-size blocks."""
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        for block in iter(lambda: fh.read(block_size), b""):
            digest.update(block)
    return digest.hexdigest()


def _write_tidy_with_provenance(frame, origin, out_dir):
    """Write `frame` to ``swow_en_tidy.csv`` and record where it came from.

    `origin` holds the source-describing keys; ``generated`` and ``sha256``
    are appended and the result is dumped to ``provenance.yaml`` in `out_dir`.
    Returns ``(sha256_hex, provenance_dict)``.
    """
    tidy_path = out_dir / "swow_en_tidy.csv"
    frame.to_csv(tidy_path, index=False)
    digest = _sha256_of_file(tidy_path)
    record = {**origin, "generated": str(tidy_path), "sha256": digest}
    with open(out_dir / "provenance.yaml", "w", encoding="utf-8") as fh:
        yaml.safe_dump(record, fh)
    return digest, record


# NOTE(review): with pandas' default NA handling, cells whose text is e.g.
# "NA", "null" or "nan" are read as missing and written back as empty fields.
# Confirm that this is intended for word-association responses.
src_path = _first_existing(raw_candidates)
if src_path is not None:
    # The CSV is parsed in 10k-row chunks, but the chunks are concatenated, so
    # the complete table still ends up in memory as `df`.
    with pd.read_csv(src_path, chunksize=10000, dtype=str) as chunks:
        df = pd.concat(chunks, ignore_index=True)
    sha, prov = _write_tidy_with_provenance(df, {"source": str(src_path)}, processed_dir)
    print(f"Wrote {processed_dir / 'swow_en_tidy.csv'}")
else:
    zip_path = _first_existing(zip_candidates)
    if zip_path is None:
        # Best effort: report what was looked for instead of raising.
        print("SWOW source not found. Expected one of: ")
        for p in raw_candidates + zip_candidates:
            print(f" - {p}")
    else:
        with zipfile.ZipFile(zip_path, 'r') as zf:
            inner_csv = _pick_inner_csv(zf.namelist())
            if inner_csv is None:
                print(f"No CSV found inside {zip_path}")
            else:
                # Close the member handle as soon as parsing is done.
                with zf.open(inner_csv) as member:
                    df = pd.read_csv(member, dtype=str)
                sha, prov = _write_tidy_with_provenance(
                    df, {"source": str(zip_path), "inner": inner_csv}, processed_dir
                )
                print(f"Wrote {processed_dir / 'swow_en_tidy.csv'} from zip")

Wrote ../data/processed/swow/en/swow_en_tidy.csv from zip
