In [None]:
# --- College Earnings Predictor: notebook bootstrap (Cell 1) ---
# Purpose: make the notebook portable, set constants, and prep artifact paths.

from __future__ import annotations
import os, sys, json, datetime as dt
from pathlib import Path

# ↳ 1) Find the repo root (df-jsx) no matter where Jupyter was launched
def find_repo_root(start: Path | None = None) -> Path:
    """Locate the project root by walking upward from *start*.

    A directory counts as the root when it contains both ``server/routers``
    and ``client``.

    Args:
        start: Directory to begin the search from. When omitted, the current
            working directory is used, looked up at call time.

    Returns:
        The closest directory (``start`` itself or one of its ancestors, up
        to and including the filesystem root) that matches the heuristic.
        If none matches, the resolved ``start`` is returned unchanged.
    """
    # Resolve the default here rather than in the signature: a default of
    # ``Path.cwd()`` is evaluated once, when the ``def`` statement runs, and
    # would silently go stale after any later ``os.chdir``.
    origin = (Path.cwd() if start is None else start).resolve()

    # ``origin.parents`` ends at the filesystem/drive root, so a project that
    # sits directly at the root is also considered.
    for candidate in (origin, *origin.parents):
        # heuristics: both server/routers and client/ exist in the project root
        if (candidate / "server" / "routers").exists() and (candidate / "client").exists():
            return candidate
    return origin

REPO_ROOT = find_repo_root()
print(f"[paths] REPO_ROOT = {REPO_ROOT}")

# Put the repo root at the front of the import path (only once) so that
# project-local modules can be imported from the notebook.
if sys.path.count(str(REPO_ROOT)) == 0:
    sys.path.insert(0, str(REPO_ROOT))

# ↳ 2) Notebook-local data directories (raw downloads & scratch)
NB_DIR = REPO_ROOT.joinpath("notebooks", "college_earnings")
RAW_DATA_DIR = NB_DIR.joinpath("data")
OUTPUTS_DIR = NB_DIR.joinpath("outputs")
# Both directories are created up front; re-running the cell is harmless.
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

# ↳ 3) Model identifiers & constants (edit as needed)
MODEL_NAME = "college_earnings"
# keep lowercase "k" to match artifact folder name
VERSION = "v1_75k_5y"
# ~6 years (proxy for 5–6y)
HORIZON = "p6"
# threshold for ≥ $75k
TARGET_USD = 75_000
RANDOM_SEED = 42

# ↳ 4) Artifact directory (where the FastAPI route already looks)
ARTIFACT_DIR = REPO_ROOT.joinpath("server", "routers", "models", MODEL_NAME, VERSION)
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# Optional: training report filename to export later via nbconvert
REPORT_PDF_PATH = ARTIFACT_DIR.joinpath("training_report.pdf")

# One call, one line per path — same text as four separate prints.
print(
    f"[paths] RAW_DATA_DIR     = {RAW_DATA_DIR}",
    f"[paths] OUTPUTS_DIR      = {OUTPUTS_DIR}",
    f"[paths] ARTIFACT_DIR     = {ARTIFACT_DIR}",
    f"[paths] REPORT_PDF_PATH  = {REPORT_PDF_PATH}",
    sep="\n",
)

# ↳ 5) Helpers for consistent saving/logging
def save_json(data: dict, path: Path) -> None:
    """Write *data* to *path* as indented JSON and print a one-line receipt.

    Missing parent directories are created first. The receipt shows the path
    relative to ``REPO_ROOT`` when the file lives inside the repo, and the
    path exactly as given otherwise.

    Args:
        data: JSON-serializable mapping to write.
        path: Destination file; overwritten if it already exists.

    Raises:
        TypeError: If *data* contains a value ``json.dumps`` cannot serialize.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding so the bytes on disk never depend on the platform locale.
    path.write_text(json.dumps(data, indent=2), encoding="utf-8")
    try:
        shown = path.relative_to(REPO_ROOT)
    except ValueError:
        # Target is outside the repo (e.g. a temp dir). The file is already
        # written at this point, so the log line must not fail the call.
        shown = path
    print(f"[save] {shown} ({path.stat().st_size} bytes)")

def utcnow() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.

    Sub-second precision is dropped.
    """
    # ``datetime.utcnow()`` is deprecated since Python 3.12. Take a
    # timezone-aware UTC timestamp instead and render the same "Z"-suffixed
    # text the previous implementation produced.
    return dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

def log(msg: str) -> None:
    """Print *msg* prefixed with the local wall-clock time as ``[HH:MM:SS]``."""
    stamp = dt.datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}")

# ↳ 6) Planned artifact filenames (for later cells to use)
ENCODERS_JSON      = ARTIFACT_DIR / "encoders.json"
FIXED_EFFECTS_JSON = ARTIFACT_DIR / "fixed_effects.json"
RAND_STATE_JSON    = ARTIFACT_DIR / "random_state.json"
RAND_CIP_JSON      = ARTIFACT_DIR / "random_cip.json"
CALIB_JSON         = ARTIFACT_DIR / "calibration.json"
THRESHOLDS_JSON    = ARTIFACT_DIR / "thresholds.json"
METADATA_JSON      = ARTIFACT_DIR / "metadata.json"

# ↳ 7) (Optional) S3 settings if you later want to upload from the notebook
# EARNINGS_USE_S3_UPLOAD: integers keep their original meaning (0 = off,
# non-zero = on; set 1 to enable). Non-numeric text used to abort the whole
# bootstrap with ValueError; now "true"/"yes"/"on" (any case) enable, and any
# other text — including an empty value — disables.
_s3_flag = os.getenv("EARNINGS_USE_S3_UPLOAD", "0").strip().lower()
try:
    USE_S3_UPLOAD = bool(int(_s3_flag))
except ValueError:
    USE_S3_UPLOAD = _s3_flag in {"true", "yes", "on"}
# "your-bucket-name" is only a placeholder default; set EARNINGS_S3_BUCKET
# before enabling uploads.
S3_BUCKET     = os.getenv("EARNINGS_S3_BUCKET", "your-bucket-name")
S3_PREFIX     = f"models/{MODEL_NAME}/{VERSION}/"

print(f"[s3] USE_S3_UPLOAD={USE_S3_UPLOAD}  bucket={S3_BUCKET}  prefix={S3_PREFIX}")

# Sanity ping
log("Notebook bootstrap complete. Proceed to data ingest…")
