In [1]:
# --- Imports ---
import os, json, math, io, zipfile, time, re, shutil, glob, pathlib, sys, subprocess, shlex, tempfile, warnings
from pathlib import Path
from datetime import datetime as dt, timedelta, timezone, date, datetime
from dateutil import parser as dateparser
from dateutil.relativedelta import relativedelta
from urllib.parse import urljoin, quote
from functools import reduce

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pytz
import xarray as xr
import rasterio
from rasterio.mask import mask

import shapely
from shapely import ops
from shapely.geometry import Point, Polygon, box, mapping
from shapely.ops import unary_union, transform as shp_transform

from pyproj import Transformer
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import folium
import ee  # Earth Engine

from sklearn.neighbors import BallTree
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from geopy.distance import geodesic

import yaml

# Silence every UserWarning for the rest of the session.
# NOTE(review): this is a blanket filter, so UserWarnings raised by any library are
# hidden as well — confirm that is intended rather than filtering specific messages.
warnings.filterwarnings("ignore", category=UserWarning)


def skip_if_exists(path: str) -> bool:
    """Tell the caller whether ``path`` is already present on disk.

    Used as a resume check: a ``True`` result means the artifact exists and
    the step that produces it can be skipped.
    """
    already_present = os.path.exists(path)
    return already_present


In [2]:
# --- Configuration loader (shared HeatShield/HydroPulse) ---
from pathlib import Path
import os
import re
import yaml

def load_env_file(path: Path) -> dict:
    """Parse a dotenv-style file into a ``{key: value}`` dict.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped.
    A leading ``export `` on the key is accepted, and a value wrapped in one
    matching pair of single or double quotes has that pair removed.

    Args:
        path: Location of the env file.

    Returns:
        Mapping of variable names to string values; empty when the file
        does not exist.
    """
    env = {}
    if not path.exists():
        return env
    # Decode explicitly so parsing does not depend on the platform locale;
    # "utf-8-sig" also tolerates a leading byte-order mark.
    for line in path.read_text(encoding="utf-8-sig").splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, val = line.split("=", 1)
        key = key.strip().removeprefix("export ").strip()
        if not key:
            # "=value" has no usable name; an empty key cannot be exported.
            continue
        val = val.strip()
        # Remove only a balanced pair of quotes, so a value that merely ends
        # in an apostrophe or quote character is left intact.
        if len(val) >= 2 and val[0] == val[-1] and val[0] in ("'", '"'):
            val = val[1:-1]
        env[key] = val
    return env

def apply_env_overrides() -> None:
    """Export the non-empty entries of ``./.env`` into ``os.environ``.

    Values from the file replace any variable of the same name that is
    already set; entries whose value is empty are ignored.
    """
    file_values = load_env_file(Path(".env"))
    non_empty = {name: value for name, value in file_values.items() if value}
    os.environ.update(non_empty)

def _fmt_yyyymmdd(s: str) -> str:
    """Return ``s`` with surrounding whitespace and all hyphens removed.

    Intended for ``YYYY-MM-DD`` input, which becomes ``YYYYMMDD``.
    """
    trimmed = s.strip()
    return trimmed.replace("-", "")

def _render_template(tpl: str, *, start: str, end: str, res_m: int, epsg: int, region: str) -> str:
    """Fill a filename template with the run's dates, resolution, CRS and region.

    The template may reference ``{start}``, ``{end}``, ``{res_m}``, ``{epsg}``
    and ``{region}``. Dates are passed through ``_fmt_yyyymmdd``; ``res_m`` and
    ``epsg`` are coerced to ``int`` and ``region`` to ``str``.
    """
    fields = {
        "start": _fmt_yyyymmdd(start),
        "end": _fmt_yyyymmdd(end),
        "res_m": int(res_m),
        "epsg": int(epsg),
        "region": str(region),
    }
    return tpl.format(**fields)

def _resolve_out_dir(project_dir: Path, out_dir_value: str | None) -> Path:
    out_dir_value = out_dir_value or "results"
    p = Path(out_dir_value)
    return (p if p.is_absolute() else (project_dir / p)).resolve()

def _resolve_pathlike_keys(cfg: dict, out_dir: Path) -> None:
    """
    Rewrite path-like config entries in place as absolute path strings.

    An entry qualifies when its value is a string and its key ends with one of:
    _FILENAME, _PATH, _ZIP, _TIF_NAME, _CSV_NAME, _PARQUET_NAME, _TXT_NAME,
    _NETCDF_NAME, _ZARR_NAME.
    Relative values are resolved under out_dir; absolute values are kept
    (normalized through Path). All other entries are left untouched.
    """
    pathlike_suffixes = (
        "_FILENAME", "_PATH", "_ZIP",
        "_TIF_NAME", "_CSV_NAME", "_PARQUET_NAME", "_TXT_NAME",
        "_NETCDF_NAME", "_ZARR_NAME"
    )
    # Snapshot the qualifying keys first so the dict is not iterated while it is updated.
    qualifying = [
        name for name, value in cfg.items()
        if isinstance(value, str) and name.endswith(pathlike_suffixes)
    ]
    for name in qualifying:
        target = Path(cfg[name])
        if not target.is_absolute():
            target = (out_dir / target).resolve()
        cfg[name] = str(target)

# Fail fast unless PROJECT_DIR is explicitly provided.
# ./.env (relative to the current working directory) is merged into os.environ
# first, so PROJECT_DIR may come from either the file or the real environment.
apply_env_overrides()

PROJECT_DIR = os.environ.get("PROJECT_DIR")
if not PROJECT_DIR:
    raise FileNotFoundError(
        "PROJECT_DIR is not set. Add PROJECT_DIR to .env or environment variables."
    )

# From here on PROJECT_DIR is an absolute Path, no longer the raw string.
PROJECT_DIR = Path(PROJECT_DIR).expanduser().resolve()
CONFIG_PATH = PROJECT_DIR / "config" / "config.yaml"
if not CONFIG_PATH.exists():
    raise FileNotFoundError(f"Missing config file: {CONFIG_PATH}")

# An empty YAML document loads as None; fall back to an empty dict.
CONFIG = yaml.safe_load(CONFIG_PATH.read_text()) or {}

# Set out_dir to an absolute path early.
# OUT_DIR is a Path; CONFIG["out_dir"] holds the same location as a string.
OUT_DIR = _resolve_out_dir(PROJECT_DIR, CONFIG.get("out_dir", "results"))
CONFIG["out_dir"] = str(OUT_DIR)

# Optional: Apply env overrides ONLY for keys that exist in config.yaml.
# This prevents HeatShield-specific lists in shared code.
# Environment values are strings, so an overridden key becomes a str whatever
# type it had in the YAML; empty environment values are ignored.
# NOTE(review): "out_dir" is itself a config key, so an environment variable named
# out_dir would replace CONFIG["out_dir"] here without updating OUT_DIR — confirm
# whether that is intended.
for key in list(CONFIG.keys()):
    env_val = os.environ.get(key)
    if env_val:
        CONFIG[key] = env_val

# Also allow a small, explicit allowlist for common optional overrides across projects.
# Unlike the loop above, these keys are added to CONFIG even when config.yaml lacks them.
for key in ["PURPLEAIR_SENSOR_INDEX"]:  # harmless if absent in HydroPulse config
    env_val = os.environ.get(key)
    if env_val:
        CONFIG[key] = env_val

# Render FINAL_DAILY_FILENAME from template if provided.
# Keeps downstream code stable: always refer to CONFIG["FINAL_DAILY_FILENAME"].
# start_date and end_date are required once a template is configured (KeyError otherwise);
# region, grid_resolution_m and the EPSG code have defaults.
if "FINAL_DAILY_FILENAME_TEMPLATE" in CONFIG:
    region = CONFIG.get("region", "CA")
    start_date = CONFIG["start_date"]
    end_date = CONFIG["end_date"]
    res_m = CONFIG.get("grid_resolution_m", 3000)
    epsg = CONFIG.get("OPS_EPSG", CONFIG.get("CA_ALBERS_EPSG", 3310))
    CONFIG["FINAL_DAILY_FILENAME"] = _render_template(
        CONFIG["FINAL_DAILY_FILENAME_TEMPLATE"],
        start=start_date,
        end=end_date,
        res_m=res_m,
        epsg=epsg,
        region=region,
    )

# Resolve pathlike keys under out_dir (only for keys that exist).
# This runs after the template rendering so the rendered FINAL_DAILY_FILENAME
# is made absolute as well.
_resolve_pathlike_keys(CONFIG, OUT_DIR)

# Create common directories only if they are referenced in config.
# (Avoid hardcoding "manual" for HydroPulse.)
# Every string-valued key ending in _DIRNAME is created under CONFIG["out_dir"].
for k, v in CONFIG.items():
    if k.endswith("_DIRNAME") and isinstance(v, str):
        os.makedirs(Path(CONFIG["out_dir"]) / v, exist_ok=True)

# EPSG constants (configurable)
# OPS_EPSG defaults to CA_ALBERS_EPSG when not configured separately.
WGS84_EPSG = int(CONFIG.get("WGS84_EPSG", 4326))
CA_ALBERS_EPSG = int(CONFIG.get("CA_ALBERS_EPSG", 3310))
OPS_EPSG = int(CONFIG.get("OPS_EPSG", CA_ALBERS_EPSG))

# Set working directory
# Relative paths used after this point resolve against PROJECT_DIR.
os.chdir(PROJECT_DIR)

print(f"Config loaded from {CONFIG_PATH}")
print(f"Output dir: {CONFIG['out_dir']}")
print(f"Final daily filename: {CONFIG.get('FINAL_DAILY_FILENAME')}")

Config loaded from /Users/Shared/blueleaflabs/hydropulse/config/config.yaml
Output dir: /Users/Shared/blueleaflabs/hydropulse/results
Final daily filename: /Users/Shared/blueleaflabs/hydropulse/results/final_daily_grid_CA_3000m_epsg3310_20240601_20241031.parquet


In [9]:
from pathlib import Path

def resolve_out_path(path_str: str) -> str:
    """
    Turn ``path_str`` into an absolute path string.

    Absolute inputs are returned as they are; relative inputs are anchored at
    CONFIG['out_dir'] and resolved.
    """
    candidate = Path(path_str)
    if not candidate.is_absolute():
        candidate = (Path(CONFIG["out_dir"]) / candidate).resolve()
    return str(candidate)

In [3]:
# --- Ensure California boundary and build the land-clipped grid (3 km by default) ---
#
# Reads CONFIG and CA_ALBERS_EPSG from the configuration cell. Leaves `grid_gdf`
# (in the CONFIG["crs_epsg"] CRS) and CONFIG["ca_boundary_path"] for later cells.


def _dissolve_all(frame):
    """Merge every geometry of ``frame`` into a single geometry.

    Uses ``union_all()`` when the installed GeoPandas provides it and falls back
    to the ``unary_union`` property (which newer releases deprecate) otherwise.
    """
    union_all = getattr(frame.geometry, "union_all", None)
    return union_all() if callable(union_all) else frame.geometry.unary_union


def snap_down(v, s):
    """Round ``v`` down to the nearest multiple of ``s``."""
    return np.floor(v / s) * s


def snap_up(v, s):
    """Round ``v`` up to the nearest multiple of ``s``."""
    return np.ceil(v / s) * s


# Config
res_m = int(CONFIG.get("grid_resolution_m", 3000))
out_epsg = int(CONFIG.get("crs_epsg", 4326))
out_dir = CONFIG["out_dir"]
os.makedirs(out_dir, exist_ok=True)
inset_buffer_m = int(CONFIG.get("coast_inset_m", 0))  # e.g. 5000
boundary_path = CONFIG.get("ca_boundary_path", None)

# 1) Ensure boundary: download Census cartographic boundary if missing
if not boundary_path or not os.path.exists(boundary_path):
    states_zip = os.path.join(out_dir, "cb_2023_us_state_20m.zip")
    if not os.path.exists(states_zip):
        url = CONFIG["CENSUS_STATES_ZIP_URL"]
        r = requests.get(url, timeout=int(CONFIG.get("CENSUS_STATES_TIMEOUT", 120)))
        r.raise_for_status()
        with open(states_zip, "wb") as f:
            f.write(r.content)
    # Read from zip directly and select California (state FIPS code "06")
    states = gpd.read_file(f"zip://{states_zip}")
    if states.empty:
        raise ValueError("Census states file loaded empty.")
    ca = states[states["STATEFP"].astype(str).str.zfill(2).eq("06")][["geometry"]]
    if ca.empty:
        raise ValueError("California polygon not found in Census states file.")
    boundary_path = os.path.join(out_dir, "california_boundary.gpkg")
    ca.to_file(boundary_path, driver="GPKG")
    CONFIG["ca_boundary_path"] = boundary_path  # persist for later cells

# 2) Load boundary, dissolve, project, optional inward buffer
b = gpd.read_file(boundary_path)
if b.crs is None:
    raise ValueError("Boundary file has no CRS.")
b = b[["geometry"]].copy()
b = b.to_crs(CA_ALBERS_EPSG)
b = gpd.GeoDataFrame(geometry=[_dissolve_all(b)], crs=f"EPSG:{CA_ALBERS_EPSG}")
if inset_buffer_m > 0:
    # A negative buffer shrinks the polygon inward by inset_buffer_m metres.
    b.geometry = b.buffer(-inset_buffer_m)
    b = gpd.GeoDataFrame(geometry=[_dissolve_all(b)], crs=f"EPSG:{CA_ALBERS_EPSG}")
if b.geometry.is_empty.all():
    # Without this check an over-large inset surfaces later as NaN grid bounds.
    raise ValueError("Boundary is empty after applying coast_inset_m; reduce the inset.")

# 3) Build snapped rectilinear grid over boundary bounds in the CA_ALBERS_EPSG CRS
minx, miny, maxx, maxy = b.total_bounds
minx, miny = snap_down(minx, res_m), snap_down(miny, res_m)
maxx, maxy = snap_up(maxx, res_m), snap_up(maxy, res_m)

xs = np.arange(minx, maxx, res_m)
ys = np.arange(miny, maxy, res_m)
n_rect = len(xs) * len(ys)
if n_rect > 3_500_000:
    raise MemoryError(f"Grid too large ({n_rect:,}). Increase res_m or tile the state.")

# Row-major order: every x for the first y, then every x for the next y, ...
cells = [box(x, y, x + res_m, y + res_m) for y in ys for x in xs]
col_i = np.tile(np.arange(len(xs), dtype=np.int32), len(ys))
row_j = np.repeat(np.arange(len(ys), dtype=np.int32), len(xs))

gdf_proj = gpd.GeoDataFrame({"col_i": col_i, "row_j": row_j},
                            geometry=cells, crs=f"EPSG:{CA_ALBERS_EPSG}")
gdf_proj["cell_area_m2"] = float(res_m) * float(res_m)
gdf_proj["grid_id"] = f"CA3310_{res_m}_" + gdf_proj["col_i"].astype(str) + "_" + gdf_proj["row_j"].astype(str)

# 4) Strict land clip and land fraction
# The spatial join first discards cells that never touch the boundary; the overlay
# then measures how much of each remaining cell lies inside it.
gdf_proj = gpd.sjoin(gdf_proj, b, how="inner", predicate="intersects").drop(columns=["index_right"])
inter = gpd.overlay(gdf_proj[["grid_id", "geometry"]], b, how="intersection", keep_geom_type=True)
inter["land_area_m2"] = inter.geometry.area
land = inter[["grid_id", "land_area_m2"]].groupby("grid_id", as_index=False).sum()
gdf_proj = gdf_proj.merge(land, on="grid_id", how="left")
gdf_proj["land_area_m2"] = gdf_proj["land_area_m2"].fillna(0.0)
gdf_proj["land_frac"] = (gdf_proj["land_area_m2"] / gdf_proj["cell_area_m2"]).clip(0, 1)
gdf_proj = gdf_proj[gdf_proj["land_frac"] > 0].reset_index(drop=True)
if gdf_proj.empty:
    # Fail with a clear message instead of a division by zero in the diagnostics.
    raise ValueError("No grid cells overlap the boundary; check the boundary file and res_m.")

# 5) Reproject to requested output CRS and save
grid_gdf = gdf_proj.to_crs(out_epsg)

parquet_path = os.path.join(out_dir, f"grid_{res_m}m_CA.parquet")
grid_gdf.to_parquet(parquet_path, index=False)

geojson_path = os.path.join(out_dir, f"grid_{res_m}m_CA_head10.geojson")
grid_gdf.head(10).to_file(geojson_path, driver="GeoJSON")

# Diagnostics
cell_area_km2 = (res_m / 1000.0) ** 2
eff_land_km2 = float((grid_gdf.get("land_frac", 1.0) * cell_area_km2).sum())
print(f"Saved: {parquet_path}")
print(f"Cells: {len(grid_gdf):,}")
print(f"Effective land area ≈ {round(eff_land_km2):,} km²")
print(f"Implied cell size ≈ {round((eff_land_km2/len(grid_gdf))**0.5,2)} km")

grid_gdf.head()


  b = gpd.GeoDataFrame(geometry=[b.unary_union], crs=f"EPSG:{CA_ALBERS_EPSG}")


Saved: /Users/Shared/blueleaflabs/hydropulse/results/grid_3000m_CA.parquet
Cells: 46,495
Effective land area ≈ 410,516 km²
Implied cell size ≈ 2.97 km


Unnamed: 0,col_i,row_j,geometry,cell_area_m2,grid_id,land_area_m2,land_frac
0,215,0,"POLYGON ((-117.09883 32.52004, -117.09786 32.5...",9000000.0,CA3310_3000_215_0,3527256.0,0.391917
1,216,0,"POLYGON ((-117.06697 32.51921, -117.06599 32.5...",9000000.0,CA3310_3000_216_0,2924116.0,0.324902
2,217,0,"POLYGON ((-117.03511 32.51837, -117.03411 32.5...",9000000.0,CA3310_3000_217_0,1712565.0,0.190285
3,218,0,"POLYGON ((-117.00325 32.51751, -117.00224 32.5...",9000000.0,CA3310_3000_218_0,505542.7,0.056171
4,214,1,"POLYGON ((-117.12973 32.54795, -117.12876 32.5...",9000000.0,CA3310_3000_214_1,96905.13,0.010767


In [4]:
# --- Persist config + save grid (3310 ops copy, 4326 preview) + write metadata ---

# Inputs assumed from prior cells:
# - grid_gdf : current grid GeoDataFrame (any CRS, but a CRS must be set)
# - CONFIG   : dict with out_dir, grid_resolution_m, crs_epsg, ca_boundary_path

out_dir = CONFIG["out_dir"]
os.makedirs(out_dir, exist_ok=True)
res_m = int(CONFIG.get("grid_resolution_m", 3000))
out_epsg = int(CONFIG.get("crs_epsg", 4326))
boundary_path = CONFIG.get("ca_boundary_path")

# 1) Validate the boundary path and snapshot the runtime CONFIG
if not boundary_path or not os.path.exists(boundary_path):
    raise FileNotFoundError("CONFIG['ca_boundary_path'] missing or invalid. Rebuild boundary.")
CONFIG["ca_boundary_path"] = boundary_path

# NOTE(review): the whole CONFIG dict is written to disk, including any token or
# key that was loaded from config.yaml or the environment — confirm that is acceptable.
config_runtime_path = os.path.join(out_dir, "config_runtime.json")
with open(config_runtime_path, "w") as f:
    json.dump(CONFIG, f, indent=2)
print("Saved:", config_runtime_path)

# 2) Ensure we have an EPSG:3310 version for spatial ops
if grid_gdf.crs is None:
    raise ValueError("grid_gdf has no CRS. Rebuild grid.")
if len(grid_gdf) == 0:
    # The metadata below divides by the cell count.
    raise ValueError("grid_gdf is empty. Rebuild grid.")
grid_3310 = grid_gdf.to_crs(3310) if grid_gdf.crs.to_epsg() != 3310 else grid_gdf

# 3) Save operational GeoParquet in 3310 + lightweight WGS84 preview
parquet_3310 = os.path.join(out_dir, f"grid_{res_m}m_CA_epsg3310.parquet")
grid_3310.to_parquet(parquet_3310, index=False)
print("Saved:", parquet_3310, "| cells:", len(grid_3310))

# Optional small preview in 4326 for quick map checks
preview_4326 = grid_3310.to_crs(4326).head(500)  # cap to avoid huge files
geojson_preview = os.path.join(out_dir, f"grid_{res_m}m_CA_head500_epsg4326.geojson")
preview_4326.to_file(geojson_preview, driver="GeoJSON")
print("Saved:", geojson_preview)

# 4) Compute and save metadata
cell_area_km2 = (res_m / 1000.0) ** 2
effective_land_km2 = float((grid_3310.get("land_frac", 1.0) * cell_area_km2).sum())
implied_cell_km = float((effective_land_km2 / len(grid_3310)) ** 0.5)
minx, miny, maxx, maxy = grid_3310.total_bounds
# Cast to plain floats so the metadata holds built-in types rather than NumPy scalars.
bbox_km = (float(maxx - minx) / 1000.0, float(maxy - miny) / 1000.0)

meta = {
    # Timezone-aware replacement for the deprecated datetime.utcnow(); same "...Z" format.
    "timestamp_utc": dt.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    "grid_resolution_m": res_m,
    "crs_ops_epsg": 3310,
    "crs_export_default_epsg": out_epsg,
    "cells": int(len(grid_3310)),
    "effective_land_area_km2": round(effective_land_km2, 2),
    "implied_cell_km": round(implied_cell_km, 4),
    "bbox_km_width_height": [round(bbox_km[0], 2), round(bbox_km[1], 2)],
    "has_land_frac": bool("land_frac" in grid_3310.columns),
    "boundary_path": boundary_path,
    "parquet_3310_path": parquet_3310,
    "geojson_preview_4326_path": geojson_preview,
}

meta_path = os.path.join(out_dir, f"grid_{res_m}m_CA_meta.json")
with open(meta_path, "w") as f:
    json.dump(meta, f, indent=2)
print("Saved:", meta_path)
meta


Saved: /Users/Shared/blueleaflabs/hydropulse/results/config_runtime.json
Saved: /Users/Shared/blueleaflabs/hydropulse/results/grid_3000m_CA_epsg3310.parquet | cells: 46495
Saved: /Users/Shared/blueleaflabs/hydropulse/results/grid_3000m_CA_head500_epsg4326.geojson
Saved: /Users/Shared/blueleaflabs/hydropulse/results/grid_3000m_CA_meta.json


{'timestamp_utc': '2026-01-12T18:50:37Z',
 'grid_resolution_m': 3000,
 'crs_ops_epsg': 3310,
 'crs_export_default_epsg': 4326,
 'cells': 46495,
 'effective_land_area_km2': 410516.3,
 'implied_cell_km': 2.9714,
 'bbox_km_width_height': [np.float64(915.0), np.float64(1059.0)],
 'has_land_frac': True,
 'boundary_path': '/Users/Shared/blueleaflabs/hydropulse/results/california_boundary.gpkg',
 'parquet_3310_path': '/Users/Shared/blueleaflabs/hydropulse/results/grid_3000m_CA_epsg3310.parquet',
 'geojson_preview_4326_path': '/Users/Shared/blueleaflabs/hydropulse/results/grid_3000m_CA_head500_epsg4326.geojson'}

In [7]:
# CDO data fetch and processing functions
# Re-derive the output locations here so this cell reads on its own.
# OUT_DIR is the output directory as a string; RAW_DIR holds the monthly raw CSVs
# and CLEAN_DIR is reserved for cleaned outputs.
OUT_DIR = CONFIG["out_dir"]
RAW_DIR = os.path.join(OUT_DIR, CONFIG["CDO_RAW_DIRNAME"])
CLEAN_DIR = os.path.join(OUT_DIR, CONFIG["CDO_CLEAN_DIRNAME"])
for _cdo_dir in (RAW_DIR, CLEAN_DIR):
    os.makedirs(_cdo_dir, exist_ok=True)

def month_windows(start_date, end_date):
    """Yield ``(window_start, window_end)`` ISO date strings, one pair per calendar month.

    Windows start on the first day of each month, beginning with the month that
    contains ``start_date`` (so the first window may begin before ``start_date``).
    The last window is cut off at ``end_date``. Nothing is yielded when
    ``end_date`` precedes the first window start.

    Args:
        start_date: ISO ``YYYY-MM-DD`` string, or a ``date``/``datetime`` object.
        end_date: Same accepted forms as ``start_date``; inclusive.
    """
    def _as_date(value):
        # datetime is a subclass of date, so it has to be tested first.
        if isinstance(value, dt):
            return value.date()
        if isinstance(value, date):
            return value
        return dt.fromisoformat(value).date()

    cur = _as_date(start_date).replace(day=1)
    e = _as_date(end_date)
    while cur <= e:
        # cur is always the 1st, so the next month start is plain year/month arithmetic.
        next_first = date(cur.year + cur.month // 12, cur.month % 12 + 1, 1)
        month_end = next_first - timedelta(days=1)
        yield cur.isoformat(), min(month_end, e).isoformat()
        cur = next_first

def parse_attributes(attr):
    """Split a comma-separated attributes string into its first four fields.

    Returns ``(mflag, qflag, sflag, obs_hhmm)``. Missing or empty fields come
    back as ``None``; fields beyond the fourth are ignored. A falsy ``attr``
    yields four ``None`` values.
    """
    fields = (attr or "").split(",")[:4]
    fields.extend([""] * (4 - len(fields)))
    mflag, qflag, sflag, obs_hhmm = (field if field else None for field in fields)
    return mflag, qflag, sflag, obs_hhmm

def fetch_cdo_page(session, url, headers, params, max_retries=None, base_delay=None, timeout=None):
    """GET one page of results and return the decoded JSON body.

    Transient failures are retried with exponential backoff
    (``base_delay * 2**attempt`` seconds): HTTP 429/500/502/503/504, other
    ``requests`` transport errors, and bodies that fail to decode as JSON.
    Any other HTTP error status (for example a rejected token) is raised at
    once, since repeating the request cannot succeed.

    Args:
        session: ``requests.Session`` (or compatible) used for the request.
        url: Endpoint URL.
        headers: Request headers.
        params: Query parameters.
        max_retries: Total number of attempts; defaults to
            ``CONFIG["CDO_MAX_RETRIES"]`` (6). Values below 1 mean one attempt.
        base_delay: Backoff base in seconds; defaults to
            ``CONFIG["CDO_BACKOFF_BASE"]`` (0.8).
        timeout: Per-request timeout in seconds; defaults to
            ``CONFIG["CDO_TIMEOUT"]`` (180).

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: Non-retryable status, or a retryable status on the
            final attempt.
        requests.RequestException, ValueError: Transport or decoding failure on
            the final attempt.
    """
    if max_retries is None:
        max_retries = int(CONFIG.get("CDO_MAX_RETRIES", 6))
    if base_delay is None:
        base_delay = float(CONFIG.get("CDO_BACKOFF_BASE", 0.8))
    if timeout is None:
        timeout = int(CONFIG.get("CDO_TIMEOUT", 180))

    retryable_status = (429, 500, 502, 503, 504)
    # Always try at least once so the function never falls through returning None.
    attempts = max(1, int(max_retries))
    for attempt in range(attempts):
        is_last = attempt == attempts - 1
        try:
            r = session.get(url, headers=headers, params=params, timeout=timeout)
            if r.status_code in retryable_status:
                raise requests.HTTPError(f"{r.status_code} retry", response=r)
            r.raise_for_status()
            return r.json()
        except requests.HTTPError as exc:
            status = getattr(exc.response, "status_code", None)
            if is_last or status not in retryable_status:
                raise
        except (requests.RequestException, ValueError):
            # Connection resets, timeouts and truncated or invalid JSON bodies.
            if is_last:
                raise
        time.sleep(base_delay * (2 ** attempt))


def cdo_stream_monthly(datasetid, locationid, startdate, enddate, datatypes, token,
                       units="standard", page_limit=1000, force=False):
    """Download CDO observations month by month into CSV files under RAW_DIR.

    One file ``ghcnd_<datatype>_<YYYY-MM>.csv`` is produced per datatype and
    calendar-month window. A file that already exists is reused unless
    ``force`` is true, which makes an interrupted run resumable. A month with
    no results gets a header-only CSV so it is not requested again.

    Args:
        datasetid: CDO dataset id sent with each request.
        locationid: CDO location id sent with each request.
        startdate: First day of the range (see ``month_windows``).
        enddate: Last day of the range, inclusive.
        datatypes: Iterable of datatype ids; each is fetched separately.
        token: API token sent in the ``token`` header.
        units: Value of the ``units`` request parameter.
        page_limit: Page size; paging stops on the first short or empty page.
        force: Re-download even when the monthly file exists.

    Returns:
        List of monthly CSV paths, written or reused, in processing order.
    """
    url = CONFIG["CDO_BASE_URL"]
    headers = {"token": token}
    out_columns = ["date", "datatype", "station", "attributes", "mflag", "qflag",
                   "sflag", "obs_hhmm", "value", "value_scaled"]
    # NOTE(review): PRCP/TMAX/TMIN are multiplied by 0.1 regardless of `units`;
    # confirm against the CDO API documentation that values requested with the
    # given `units` are still expressed in tenths.
    scale = {"PRCP": 0.1, "TMAX": 0.1, "TMIN": 0.1}
    written = []

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        for dtid in datatypes:
            for ms, me in month_windows(startdate, enddate):
                out_csv = os.path.join(RAW_DIR, f"ghcnd_{dtid}_{ms[:7]}.csv")
                if skip_if_exists(out_csv) and not force:
                    # resume: skip existing month-datatype file
                    written.append(out_csv)
                    continue

                frames = []
                offset = 1
                while True:
                    params = {
                        "datasetid": datasetid, "locationid": locationid,
                        "startdate": ms, "enddate": me,
                        "datatypeid": dtid, "units": units,
                        "limit": page_limit, "offset": offset
                    }
                    js = fetch_cdo_page(session, url, headers, params)
                    rows = js.get("results", [])
                    if not rows:
                        break
                    frames.append(pd.json_normalize(rows))
                    if len(rows) < page_limit:
                        break
                    offset += page_limit
                    time.sleep(0.15)  # gentle pacing

                if frames:
                    df = pd.concat(frames, ignore_index=True)
                    # normalize
                    df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.date
                    parsed = df["attributes"].apply(parse_attributes)
                    df[["mflag", "qflag", "sflag", "obs_hhmm"]] = pd.DataFrame(parsed.tolist(), index=df.index)
                    # scale tenths (datatypes without an entry keep factor 1.0)
                    df["datatype"] = df["datatype"].astype(str)
                    df["value"] = pd.to_numeric(df["value"], errors="coerce")
                    df["value_scaled"] = df["value"] * df["datatype"].map(scale).fillna(1.0)
                    # write monthly raw
                    df[out_columns].to_csv(out_csv, index=False)
                else:
                    # Header-only file marks the month as done. Written through
                    # pandas because the csv module is not imported in this file.
                    pd.DataFrame(columns=out_columns).to_csv(out_csv, index=False)
                written.append(out_csv)
    return written

def build_clean_wide():
    """Combine the monthly raw CSVs in RAW_DIR into one long and one wide CSV.

    Rows whose ``qflag`` is neither missing nor empty are dropped. The wide table
    has one row per (station, obs_date) with the mean ``value_scaled`` of each
    datatype, plus the ``obs_hhmm`` values taken from the PRCP rows.

    Returns:
        ``None`` when RAW_DIR contains no CSV files; otherwise the tuple
        ``(raw_path, wide_path, n_raw_rows, n_wide_rows, n_stations, n_dates)``.
    """
    monthly_csvs = sorted(
        os.path.join(RAW_DIR, name) for name in os.listdir(RAW_DIR) if name.endswith(".csv")
    )
    if not monthly_csvs:
        return None

    long_df = pd.concat(
        (pd.read_csv(csv_path, dtype={"datatype": str, "station": str}) for csv_path in monthly_csvs),
        ignore_index=True,
    )
    # Dates were stored as text; turn them back into date objects.
    long_df["date"] = pd.to_datetime(long_df["date"]).dt.date

    # Keep only observations without a quality flag.
    unflagged = long_df["qflag"].isna() | (long_df["qflag"] == "")
    long_df = long_df[unflagged]

    column_names = {
        "date": "obs_date",
        "PRCP": "precipitation_mm",
        "TMAX": "temperature_max_c",
        "TMIN": "temperature_min_c",
    }
    pivoted = long_df.pivot_table(
        index=["station", "date"], columns="datatype", values="value_scaled", aggfunc="mean"
    )
    wide = pivoted.reset_index().rename(columns=column_names).sort_values(["obs_date", "station"])

    # Observation time comes from the PRCP rows only.
    is_prcp = long_df["datatype"] == "PRCP"
    prcp_times = (
        long_df.loc[is_prcp, ["station", "date", "obs_hhmm"]]
        .drop_duplicates()
        .rename(columns={"date": "obs_date"})
    )
    wide = wide.merge(prcp_times, on=["station", "obs_date"], how="left")

    raw_all = os.path.join(OUT_DIR, "ghcnd_daily_raw_all.csv")
    wide_all = os.path.join(OUT_DIR, "ghcnd_daily_wide.csv")
    long_df.to_csv(raw_all, index=False)
    wide.to_csv(wide_all, index=False)

    station_count = wide["station"].nunique()
    date_count = wide["obs_date"].nunique()
    return raw_all, wide_all, len(long_df), len(wide), station_count, date_count

# ---- Run statewide with resume capability ----
# The token comes from the environment first, then from CONFIG; the placeholder
# value counts as "no token".
token = os.environ.get("CDO_TOKEN") or CONFIG.get("CDO_TOKEN", "")
if not token or token == "YOUR_NCEI_CDO_TOKEN":
    print("Skipping CDO (missing CDO token).")
else:
    written = cdo_stream_monthly(
        datasetid="GHCND",
        locationid="FIPS:06",  # California statewide
        startdate=CONFIG["start_date"],
        enddate=CONFIG["end_date"],
        datatypes=["TMAX", "TMIN", "PRCP"],
        token=token,
        units="standard",
        page_limit=1000,
        force=False,  # set True to re-download
    )
    print(f"Monthly files written: {len(written)} → {RAW_DIR}")

    # Assemble the combined outputs; nothing is reported when no raw files exist.
    res = build_clean_wide()
    if res:
        raw_all, wide_all, n_raw, n_wide, n_stn, n_dates = res
        print(f"Saved raw:  {raw_all}")
        print(f"Saved wide: {wide_all}")
        print(f"Counts → raw: {n_raw} | wide: {n_wide} | stations: {n_stn} | dates: {n_dates}")


Monthly files written: 15 → /Users/Shared/blueleaflabs/hydropulse/results/cdo_raw_monthly
Saved raw:  /Users/Shared/blueleaflabs/hydropulse/results/ghcnd_daily_raw_all.csv
Saved wide: /Users/Shared/blueleaflabs/hydropulse/results/ghcnd_daily_wide.csv
Counts → raw: 291781 | wide: 163475 | stations: 1305 | dates: 153


In [10]:
# === GHCND DAILY: raw (long) -> cleaned (wide with lat/lon in bbox) ===
# Input  (from your earlier step):  results/ghcnd_daily_raw_all.csv  (long form)
# Output (used by superset):        results/ghcnd_daily_cleaned.parquet  (wide per station-day with lat/lon)

# Need to do this because we aren't getting proper "joins" in our superset setup.


BASE = CONFIG["out_dir"]
RAW = resolve_out_path(CONFIG["GHCND_RAW_CSV_NAME"])
OUT_PARQ = resolve_out_path(CONFIG["GHCND_CLEAN_PARQUET_NAME"])
OUT_CSV = resolve_out_path(CONFIG["GHCND_CLEAN_CSV_NAME"])

# Raise explicitly instead of asserting: assert statements are removed under `python -O`.
if not os.path.exists(RAW):
    raise FileNotFoundError(f"Missing raw GHCND file: {RAW}")

# 1) Ensure we have a station catalog with lat/lon
#    Prefer a local copy if one was already saved; otherwise download NOAA's reference once.
# NOTE(review): keys ending in _TXT_NAME are made absolute under out_dir by the config
# loader, and os.path.join discards CAT_DIR when its second argument is absolute —
# confirm where the catalog file is expected to live.
CAT_DIR = os.path.join(BASE, CONFIG["MANUAL_DIRNAME"])
os.makedirs(CAT_DIR, exist_ok=True)
CAT_TXT = os.path.join(CAT_DIR, CONFIG["GHCND_STATIONS_TXT_NAME"])

if not os.path.exists(CAT_TXT):
    url = CONFIG["GHCND_STATIONS_URL"]
    # A dedicated timeout key takes precedence; the Census key remains the fallback
    # so existing configs behave as before.
    timeout_s = int(CONFIG.get("GHCND_STATIONS_TIMEOUT", CONFIG.get("CENSUS_STATES_TIMEOUT", 120)))
    r = requests.get(url, timeout=timeout_s)
    r.raise_for_status()
    with open(CAT_TXT, "wb") as f:
        f.write(r.content)

# Parse ghcnd-stations.txt (fixed-width)
# Columns per docs: ID(1-11), LAT(13-20), LON(22-30), ELEV(32-37), STATE(39-40), NAME(42-71) ...
def parse_stations(path):
    """Read a fixed-width station catalog into a DataFrame.

    Each usable line contributes one row with the columns ``station_core``,
    ``lat``, ``lon``, ``state`` and ``name``, sliced from fixed character
    positions. Lines shorter than 40 characters, and lines whose latitude or
    longitude is not numeric, are skipped.
    """
    columns = ["station_core", "lat", "lon", "state", "name"]
    rows = []
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        for record in handle:
            if len(record) < 40:
                continue
            try:
                latitude = float(record[12:20].strip())
                longitude = float(record[21:30].strip())
            except ValueError:
                continue
            rows.append((
                record[0:11].strip(),
                latitude,
                longitude,
                record[38:40].strip(),
                record[41:71].strip(),
            ))
    return pd.DataFrame(rows, columns=columns)

# Station catalog (id, lat, lon, state, name) parsed from the reference file.
stations = parse_stations(CAT_TXT)

# 2) Load the raw long-form CDO file
# Columns seen in a sample of this file:
# ['attributes','datatype','date','mflag','obs_hhmm','qflag','sflag','station','value','value_scaled']
raw = pd.read_csv(RAW, low_memory=False)

# Normalize station key: raw uses "GHCND:USW00023232" → core "USW00023232"
_station_ids = raw["station"].astype(str)
raw["station_core"] = _station_ids.str.replace("^GHCND:", "", regex=True)

# Pick a numeric value column: value_scaled is preferred when the file has it;
# otherwise the raw value is scaled by datatype (see scaled_val).
have_scaled = "value_scaled" in raw.columns
def scaled_val(row):
    """Return the cleaned numeric value for one raw observation row.

    When the file provides ``value_scaled`` and it is not missing, that value is
    returned as a float. Otherwise ``value`` is coerced to a number; PRCP, TMAX
    and TMIN are multiplied by 0.1 and any other datatype is returned unscaled.
    A non-numeric ``value`` gives NaN.
    """
    if have_scaled:
        prescaled = row["value_scaled"]
        if pd.notna(prescaled):
            return float(prescaled)
    native = pd.to_numeric(row["value"], errors="coerce")
    if pd.isna(native):
        return np.nan
    # NOTE(review): assumes unscaled values are in tenths (mm / °C) — confirm against the source.
    in_tenths = row["datatype"] in ("PRCP", "TMAX", "TMIN")
    return native * 0.1 if in_tenths else native

raw["val_clean"] = raw.apply(scaled_val, axis=1)

# Parse dates as UTC timestamps (unparseable values become NaT), then trim to the
# analysis window when both ends are configured.
raw["date"] = pd.to_datetime(raw["date"], utc=True, errors="coerce")
if "start_date" in CONFIG and "end_date" in CONFIG:
    sd = pd.to_datetime(CONFIG["start_date"], utc=True, errors="coerce")
    ed = pd.to_datetime(CONFIG["end_date"], utc=True, errors="coerce")
    raw = raw[raw["date"].between(sd, ed)]

# 3) Keep only the datatypes we need and one value per (station,date,datatype)
keep_types = {"PRCP": "precipitation_mm", "TMAX": "temperature_max_c", "TMIN": "temperature_min_c"}
raw = raw[raw["datatype"].isin(keep_types.keys())].copy()

# Duplicate rows for the same (station,date,datatype) collapse to their mean
agg = raw.groupby(["station_core", "date", "datatype"], as_index=False)["val_clean"].mean()

# 4) Pivot to wide columns, then switch to the canonical column names
# (rename ignores datatypes that are absent from the pivot)
wide = agg.pivot(index=["station_core", "date"], columns="datatype", values="val_clean")
wide = wide.reset_index()
wide = wide.rename(columns=keep_types)

# 5) Attach lat/lon from station catalog and clip to the configured bbox
wide = wide.merge(stations[["station_core", "lat", "lon"]], on="station_core", how="left")

# CONFIG["bbox"] is given by its north-west and south-east corners.
# Stations without catalog coordinates fail the bounds test and are dropped.
bbox = CONFIG["bbox"]
minx, miny, maxx, maxy = bbox["nwlng"], bbox["selat"], bbox["selng"], bbox["nwlat"]
in_box = wide["lon"].between(minx, maxx) & wide["lat"].between(miny, maxy)
wide = wide[in_box].copy()

# 6) Final tidy columns + sorts; columns missing from the pivot are added as NaN
cols_order = ["station_core", "date", "lat", "lon",
              "precipitation_mm", "temperature_max_c", "temperature_min_c"]
for c in cols_order:
    if c not in wide.columns:
        wide[c] = np.nan
wide = wide[cols_order].sort_values(["station_core", "date"])

# 7) Save for the superset
wide.to_parquet(OUT_PARQ, index=False)
wide.to_csv(OUT_CSV, index=False)
print(f"Saved cleaned CDO daily → {OUT_PARQ} (rows={len(wide)}, stations={wide['station_core'].nunique()})")


Saved cleaned CDO daily → /Users/Shared/blueleaflabs/hydropulse/results/ghcnd_daily_cleaned.parquet (rows=163475, stations=1305)


In [11]:
# === HydroPulse | Final Daily Grid Builder (v0: GHCND PRCP only) ===
# Produces: CONFIG["FINAL_DAILY_FILENAME"] as a canonical daily grid table:
#   grid_id × date → prcp_mm + QC
#
# Inputs:
#   - Grid parquet (EPSG:3310): CONFIG["GRID_FILENAME"]
#   - GHCND cleaned station-day parquet: resolve_out_path(CONFIG["GHCND_CLEAN_PARQUET_NAME"])
#
# Notes for later HeatShield refactor:
#   - This cell establishes the common "final builder" contract: {grid_id, date, variables..., QC...}
#   - Keep the interface stable; only swap/extend source adapters per repo.

import numpy as np
import pandas as pd
import geopandas as gpd
from pathlib import Path
from sklearn.neighbors import BallTree

# -----------------------------
# Paths
# -----------------------------
OUT_DIR = Path(CONFIG["out_dir"])
GRID_PATH = Path(resolve_out_path(CONFIG["GRID_FILENAME"]))
GHCND_PATH = Path(resolve_out_path(CONFIG["GHCND_CLEAN_PARQUET_NAME"]))
FINAL_PATH = Path(resolve_out_path(CONFIG["FINAL_DAILY_FILENAME"]))

# Daily shards live under <out_dir>/derived so a run can resume.
SHARDS_DIR = OUT_DIR.joinpath("derived", "final_daily_shards_prcp")
SHARDS_DIR.mkdir(parents=True, exist_ok=True)

for _label, _location in (
    ("GRID_PATH", GRID_PATH),
    ("GHCND_PATH", GHCND_PATH),
    ("SHARDS_DIR", SHARDS_DIR),
    ("FINAL_PATH", FINAL_PATH),
):
    print(f"{_label}:", _location)

# -----------------------------
# Column contract (confirmed)
# -----------------------------
# Names of the input columns in the cleaned station-day table.
STATION_COL = "station_core"
DATE_COL = "date"
LAT_COL = "lat"
LON_COL = "lon"
PRCP_COL_IN = "precipitation_mm"

# Name of the precipitation column in the final table
PRCP_COL_OUT = "prcp_mm"

# -----------------------------
# Tunables (can later move to config)
# -----------------------------
K = int(CONFIG.get("PRCP_IDW_K", 8))                        # number of nearest stations queried
HARD_CAP_KM = float(CONFIG.get("PRCP_HARD_CAP_KM", 100.0))  # stations farther than this are ignored
POWER = float(CONFIG.get("PRCP_IDW_POWER", 2.0))            # exponent of the inverse-distance weights

OPS_EPSG = int(CONFIG.get("OPS_EPSG", 3310))
WGS84_EPSG = int(CONFIG.get("WGS84_EPSG", 4326))

# -----------------------------
# Helpers (keep these stable across repos)
# -----------------------------
def ensure_epsg(gdf: gpd.GeoDataFrame, epsg: int) -> gpd.GeoDataFrame:
    """Return ``gdf`` expressed in the CRS identified by ``epsg``.

    A frame without any CRS is first labelled as EPSG:WGS84_EPSG (no
    coordinates change in that step). The frame is reprojected only when its
    EPSG code differs from ``epsg``; otherwise it is returned as it is.
    """
    if gdf.crs is None:
        gdf = gdf.set_crs(f"EPSG:{WGS84_EPSG}")
    current_epsg = gdf.crs.to_epsg() or 0
    return gdf if current_epsg == epsg else gdf.to_crs(epsg)

def ensure_grid_id(grid: gpd.GeoDataFrame) -> tuple[gpd.GeoDataFrame, str]:
    """Return ``(grid, id_column_name)``, creating an id column when none exists.

    The first of ``grid_id``, ``cell_id``, ``id`` found among the columns is
    used and the frame is returned unchanged. Otherwise a copy is returned
    with a new int32 ``grid_id`` column numbered from 0.
    """
    existing = next(
        (name for name in ("grid_id", "cell_id", "id") if name in grid.columns),
        None,
    )
    if existing is not None:
        return grid, existing
    with_ids = grid.copy()
    with_ids["grid_id"] = np.arange(len(with_ids), dtype=np.int32)
    return with_ids, "grid_id"

def build_balltree_from_points(geom: gpd.GeoSeries) -> BallTree:
    """Build a Euclidean BallTree over the x/y coordinates of a point series."""
    xs = geom.x.values
    ys = geom.y.values
    coords = np.column_stack([xs, ys])
    return BallTree(coords, metric="euclidean")

def idw(dist_m: np.ndarray, vals: np.ndarray, power: float) -> float:
    """Inverse-distance-weighted mean of ``vals``.

    Each value is weighted by ``1 / dist_m**power``. If any distance is exactly
    zero, the value at the smallest distance is returned instead, which avoids
    a division by zero.
    """
    if np.any(dist_m == 0):
        closest = np.argmin(dist_m)
        return float(vals[closest])
    weights = 1.0 / np.power(dist_m, power)
    weighted_total = (weights * vals).sum()
    return float(weighted_total / weights.sum())

def interpolate_prcp_for_day(grid_centroids: gpd.GeoSeries, stations_day: gpd.GeoDataFrame) -> pd.DataFrame:
    """
    Interpolate one day of station precipitation onto the grid centroids.

    For every centroid the (up to) K nearest stations are looked up in a
    BallTree. Stations farther than HARD_CAP_KM, or whose precipitation is not
    finite, are discarded. A single surviving station is used as-is
    ("nearest"); two or more are combined with `idw` using POWER.

    Distances are Euclidean in the coordinate units of the inputs, and the
    hard cap is applied as HARD_CAP_KM * 1000, i.e. the coordinates are treated
    as metres. Both inputs must therefore be in the same projected CRS.

    Parameters:
      grid_centroids: point GeoSeries, one point per grid cell.
      stations_day:   station points for a single day with a PRCP_COL_IN
                      column; may be None or empty.

    Returns a DF (default RangeIndex) aligned to grid_centroids order with columns:
      prcp_mm    - interpolated value, NaN where no station was usable
      n_used     - number of stations that contributed (0 if none)
      maxdist_km - distance to the farthest contributing station, NaN if none
      method     - "nearest", "idw_k<n_used>", or None if none
    """
    n_cells = len(grid_centroids)

    # Results are accumulated in plain arrays and wrapped in a DataFrame once at
    # the end; per-cell DataFrame.at writes are far slower over a whole grid.
    prcp = np.full(n_cells, np.nan, dtype=float)
    n_used = np.zeros(n_cells, dtype=np.int16)
    maxdist_km = np.full(n_cells, np.nan, dtype=float)
    method = np.full(n_cells, None, dtype=object)

    if stations_day is not None and len(stations_day) > 0:
        # Build tree on station points
        tree = build_balltree_from_points(stations_day.geometry)
        vals = stations_day[PRCP_COL_IN].to_numpy(dtype=float)

        qxy = np.column_stack([grid_centroids.x.values, grid_centroids.y.values])
        k_eff = min(K, len(stations_day))  # the tree cannot return more neighbours than it holds
        dist_m, idx = tree.query(qxy, k=k_eff)

        hard_cap_m = HARD_CAP_KM * 1000.0

        for i, (d, j) in enumerate(zip(dist_m, idx)):
            # Keep neighbours inside the hard cap that carry a finite value
            # (the NaN check is defensive).
            usable = (d <= hard_cap_m) & np.isfinite(vals[j])
            if not usable.any():
                continue

            d_use = d[usable]
            v_use = vals[j[usable]]
            n_good = len(v_use)

            if n_good == 1:
                prcp[i] = v_use[0]
                method[i] = "nearest"
            else:
                prcp[i] = idw(d_use, v_use, power=POWER)
                method[i] = f"idw_k{n_good}"

            n_used[i] = n_good
            maxdist_km[i] = d_use.max() / 1000.0

    return pd.DataFrame({
        PRCP_COL_OUT: prcp,
        "n_used": n_used,
        "maxdist_km": maxdist_km,
        # Explicit object dtype keeps the None/str mix exactly as stored.
        "method": pd.Series(method, dtype=object),
    })

# -----------------------------
# Load grid (reprojected to OPS_EPSG) and compute centroids
# -----------------------------
grid = gpd.read_parquet(GRID_PATH)
grid = ensure_epsg(grid, OPS_EPSG)
grid, GID_COL = ensure_grid_id(grid)
# Centroids are taken after reprojection, so they are planar points in OPS_EPSG.
centroids = grid.geometry.centroid

print(f"Grid: {len(grid)} cells | CRS EPSG: {grid.crs.to_epsg()} | id col: {GID_COL}")

# -----------------------------
# Load cleaned GHCND station-day table
# -----------------------------
cdo = pd.read_parquet(GHCND_PATH)

required = [STATION_COL, DATE_COL, LAT_COL, LON_COL, PRCP_COL_IN]
missing = [c for c in required if c not in cdo.columns]
if missing:
    raise KeyError(f"Missing required columns in {GHCND_PATH.name}: {missing}. Have: {list(cdo.columns)}")

# Normalize date to UTC day (unparseable dates become NaT and are removed by
# the window filter below, since comparisons with NaT are False)
cdo[DATE_COL] = pd.to_datetime(cdo[DATE_COL], utc=True, errors="coerce").dt.normalize()

# Filter date window (inclusive on both ends)
start = pd.to_datetime(CONFIG["start_date"], utc=True).normalize()
end = pd.to_datetime(CONFIG["end_date"], utc=True).normalize()
cdo = cdo[(cdo[DATE_COL] >= start) & (cdo[DATE_COL] <= end)].copy()

# Drop invalid coords / missing precip
cdo = cdo[np.isfinite(cdo[LAT_COL]) & np.isfinite(cdo[LON_COL])].copy()
cdo = cdo[np.isfinite(cdo[PRCP_COL_IN])].copy()

print("Station-day rows:", len(cdo), "| stations:", cdo[STATION_COL].nunique(), "| dates:", cdo[DATE_COL].nunique())

# Stop here if nothing survived the filters: continuing would write an all-NaN
# shard for every day, and the resume logic would then treat those as done.
if cdo.empty:
    raise ValueError(
        f"No usable station-day rows in {GHCND_PATH.name} between "
        f"{start.date()} and {end.date()} (after dropping non-finite coords/precip)."
    )

# GeoDataFrame in OPS_EPSG (points are created from lon/lat, then reprojected)
pts = gpd.GeoDataFrame(
    cdo,
    geometry=gpd.points_from_xy(cdo[LON_COL], cdo[LAT_COL]),
    crs=f"EPSG:{WGS84_EPSG}"
)
pts = ensure_epsg(pts, OPS_EPSG)

# Pre-group by date for speed (avoid repeated boolean filters)
pts_by_date = {d: df for d, df in pts.groupby(DATE_COL)}
# Every calendar day in the window gets a shard, including days without stations.
all_dates = pd.date_range(start=start, end=end, freq="D")

# -----------------------------
# Daily loop with resume
# One parquet shard per day; a day whose shard already exists is skipped.
# -----------------------------
written = 0
skipped = 0

for d in all_dates:
    tag = d.strftime("%Y%m%d")
    shard_path = SHARDS_DIR / f"final_prcp_{tag}.parquet"
    if shard_path.exists():
        skipped += 1
        continue

    # No stations on this day -> .get() yields None and the shard is all-NaN.
    day_pts = pts_by_date.get(d)
    interp = interpolate_prcp_for_day(centroids, day_pts)

    out = pd.DataFrame({
        GID_COL: grid[GID_COL].values,
        "date": np.full(len(grid), d),
    })
    # Both frames carry a default RangeIndex in grid order, so this is a row-wise join.
    out = pd.concat([out, interp], axis=1)

    # Write to a temporary name and rename into place. The existence check above
    # is the only resume signal, so a shard must never be visible half-written:
    # an interrupted in-place write would be skipped as "done" on the next run.
    # The ".tmp" suffix keeps the file out of the "final_prcp_*.parquet" pattern.
    tmp_path = shard_path.with_name(shard_path.name + ".tmp")
    out.to_parquet(tmp_path, index=False)
    tmp_path.replace(shard_path)
    written += 1

print(f"Shards written: {written} | skipped (resume): {skipped} | total days: {len(all_dates)}")

# -----------------------------
# Compose FINAL parquet
# -----------------------------
# Read exactly the shards of this run's date window. SHARDS_DIR is shared
# between runs, so globbing it would also pull in shards left by a run with a
# different window. The file name pattern must match the one used by the daily loop.
shards = [SHARDS_DIR / f"final_prcp_{day.strftime('%Y%m%d')}.parquet" for day in all_dates]
if not shards:
    raise FileNotFoundError(f"No shards found in {SHARDS_DIR}")

missing_shards = [p.name for p in shards if not p.exists()]
if missing_shards:
    raise FileNotFoundError(
        f"{len(missing_shards)} expected shard(s) missing in {SHARDS_DIR}, e.g. {missing_shards[:5]}"
    )

df_final = pd.concat((pd.read_parquet(p) for p in shards), ignore_index=True)

# Re-normalize after the parquet round trip so "date" is a UTC midnight timestamp.
df_final["date"] = pd.to_datetime(df_final["date"], utc=True).dt.normalize()
df_final = df_final.sort_values([GID_COL, "date"]).reset_index(drop=True)

FINAL_PATH.parent.mkdir(parents=True, exist_ok=True)
df_final.to_parquet(FINAL_PATH, index=False)

print("Saved FINAL:", FINAL_PATH)
print("Final rows:", len(df_final), "| cells:", df_final[GID_COL].nunique(), "| dates:", df_final["date"].nunique())
print(df_final.head())

GRID_PATH: /Users/Shared/blueleaflabs/hydropulse/results/grid_3000m_CA_epsg3310.parquet
GHCND_PATH: /Users/Shared/blueleaflabs/hydropulse/results/ghcnd_daily_cleaned.parquet
SHARDS_DIR: /Users/Shared/blueleaflabs/hydropulse/results/derived/final_daily_shards_prcp
FINAL_PATH: /Users/Shared/blueleaflabs/hydropulse/results/final_daily_grid_CA_3000m_epsg3310_20240601_20241031.parquet
Grid: 46495 cells | CRS EPSG: 3310 | id col: grid_id
Station-day rows: 122769 | stations: 1039 | dates: 153
Shards written: 153 | skipped (resume): 0 | total days: 153
Saved FINAL: /Users/Shared/blueleaflabs/hydropulse/results/final_daily_grid_CA_3000m_epsg3310_20240601_20241031.parquet
Final rows: 7113735 | cells: 46495 | dates: 153
             grid_id                      date   prcp_mm  n_used  maxdist_km  \
0  CA3310_3000_0_293 2024-06-01 00:00:00+00:00  0.000000       8   29.701700   
1  CA3310_3000_0_293 2024-06-02 00:00:00+00:00  0.001268       8   29.701700   
2  CA3310_3000_0_293 2024-06-03 00:00:00+