# Notebook 1
This notebook sets up the AWS configuration, extracts data from S3 using S3 Select, and downloads all the relevant files listed in the original README.

In [None]:

from __future__ import annotations

# --- Stdlib ---
import contextlib
import gzip
import json
import logging
import os
import shutil
import time
import requests
from pathlib import Path
from typing import Dict, Iterable, Optional, Tuple

# --- AWS / S3 ---
import boto3
from botocore.config import Config 
from botocore.exceptions import ClientError

# --- Data Science ---
import pandas as pd
import geopandas as gpd

# --- Geo ---
from shapely.geometry import shape
from shapely.geometry.base import BaseGeometry

# --- Testing ---
import ipytest
import pytest

# --- Optional Diagnostics ---
try:
    import psutil as _psutil  # type: ignore
except Exception:
    _psutil = None

try:
    import tracemalloc as _tracemalloc  # type: ignore
except Exception:
    _tracemalloc = None

# --- Config ---
# Configure ipytest so tests can be run from inside the notebook
# (see the `%%ipytest` cell further down).
ipytest.autoconfig()

# Root logging setup: INFO level, timestamped records, level name padded to 8 chars.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)-8s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Module-level logger shared by the helper functions defined in this notebook.
logger = logging.getLogger(__name__)

def ensure_dir(path: str | os.PathLike) -> None:
    os.makedirs(path if isinstance(path, str) else str(path), exist_ok=True)

def parent_dir_of(path: str | os.PathLike) -> str:
    return os.path.dirname(str(path)) or "."

def fmt_bytes(n: int | float) -> str:
    try:
        n = float(n)
    except Exception:
        return str(n)
    for unit in ("B", "KB", "MB", "GB", "TB", "PB"):
        if abs(n) < 1024 or unit == "PB":
            return f"{n:.1f}{unit}"
        n /= 1024

def gunzip_file(src: str, dst: str) -> None:
    """Decompress the gzip file *src* into *dst*, creating *dst*'s parent directory first."""
    ensure_dir(parent_dir_of(dst))
    with gzip.open(src, "rb") as compressed:
        with open(dst, "wb") as plain:
            shutil.copyfileobj(compressed, plain)

def client_error_msg(e) -> str:
    """
    Build a short message from an exception, preferring botocore-style error details.

    If ``e.response`` is a dict with an ``"Error"`` dict holding ``Code`` and/or
    ``Message``, returns ``"[<Code>] <Message>"``. In every other case, including
    a missing ``response`` or one that is not a dict (``None``, an HTTP response
    object, ...), falls back to ``str(e)`` instead of raising.
    """
    response = getattr(e, "response", None)
    error = response.get("Error") if isinstance(response, dict) else None
    if not isinstance(error, dict):
        return str(e)

    code = error.get("Code")
    msg = error.get("Message")
    return f"[{code}] {msg}" if code or msg else str(e)

@contextlib.contextmanager
def timed(label: str, enabled: bool = True):
    """
    Context manager that logs the wall-clock duration of the wrapped block at INFO.

    The duration is logged even when the block raises; nothing is logged
    when *enabled* is false.
    """
    started = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - started
        if enabled:
            logger.info("%s took %.3f s", label, elapsed)

def resource_snapshot(note: str = "") -> None:
    """
    Emit a DEBUG log line with process resource usage, when available.

    Does nothing unless DEBUG is enabled on the module logger. Reports RSS and
    CPU percent via psutil (if it was importable) and current/peak traced
    memory via tracemalloc (if tracing is active). Collection is best-effort:
    failures in either probe are ignored.
    """
    if not logger.isEnabledFor(logging.DEBUG):
        return

    fields: list[str] = []

    if _psutil:
        try:
            proc = _psutil.Process(os.getpid())
            rss = fmt_bytes(proc.memory_info().rss)
            fields.append(f"rss={rss}")
            fields.append(f"cpu%~{proc.cpu_percent(interval=0.0):.1f}")
        except Exception:
            # Diagnostics only: never let a probe failure affect the caller.
            pass

    if _tracemalloc and _tracemalloc.is_tracing():
        try:
            current, peak = _tracemalloc.get_traced_memory()
            fields.append(f"py_mem={fmt_bytes(current)}/{fmt_bytes(peak)}(peak)")
        except Exception:
            pass

    if not fields:
        return
    tag = f"[{note}]" if note else ""
    logger.debug("RES%s %s", tag, " ".join(fields))
def download_file(
    url: str,
    save_as: str,
    overwrite: bool = False,
    dest_dir: str | None = None,
) -> str:
    """
    Fetch *url* over HTTP and store the response body in a local file.

    Args:
        url: The URL to fetch.
        save_as: File name to write inside the destination directory.
        overwrite: When False, an existing file is kept and the download is skipped.
        dest_dir: Destination directory; the current working directory if omitted.

    Returns:
        Absolute path of the local file, as a string.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    base_dir = Path(dest_dir or ".").resolve()
    target = base_dir / save_as

    already_there = target.exists()
    if already_there and not overwrite:
        logger.info("download_file: file already exists → %s (skipped)", target)
        return str(target)

    with timed(f"Downloading {url}"):
        response = requests.get(url, timeout=30)
        # Fail before touching the filesystem when the request was unsuccessful.
        response.raise_for_status()

        ensure_dir(target.parent)
        target.write_bytes(response.content)

    logger.info("Downloaded %s → %s", url, target)
    return str(target)


# Fetch the challenge credentials into the current working directory as
# credentials.json; skipped when that file already exists (overwrite defaults to False).
download_file(
    url="https://cct-ds-code-challenge-input-data.s3.af-south-1.amazonaws.com/ds_code_challenge_creds.json",
    save_as="credentials.json"
)

        
        

# Default local data directory; helpers fall back to it when no destination is passed.
DEST_ROOT = "../data"




Config set up

Functions to load credentials and boto client


In [None]:
def resolve_path(rel: str | None, filename: str) -> str:
    """
    """
    if rel:
        return str(Path(rel) / filename)
    return str(Path(DEST_ROOT) / filename)

def load_creds_from_file(credentials_path: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Read S3 keys from a JSON file shaped like
    ``{"s3": {"access_key": "...", "secret_key": "..."}}``.

    Returns ``(access_key, secret_key)`` when both are present and non-empty,
    otherwise ``(None, None)``. A missing, unreadable or malformed file is
    logged and also yields ``(None, None)``; this function does not raise.
    """
    try:
        with open(credentials_path, "r") as fh:
            payload = json.load(fh)
        s3_section = payload.get("s3", {})
        ak = s3_section.get("access_key")
        sk = s3_section.get("secret_key")
        if not (ak and sk):
            logger.info("Credentials file present but missing keys; falling back to env/role.")
        else:
            logger.info("Loaded S3 credentials from %s", credentials_path)
            return ak, sk
    except FileNotFoundError:
        logger.info("No credentials file at %s; falling back to env/role.", credentials_path)
    except Exception as exc:
        logger.warning("Failed reading credentials file: %s (falling back to env/role)", exc)
    return None, None

def make_s3_client(
    *, region: str,
    credentials_path: str | None = None,
    addressing_style: str = "virtual",
    botocore_extra: Optional[dict] = None,
):
    """
    Create a boto3 S3 client for *region*.

    Args:
        region: Region name passed to boto3 as ``region_name``.
        credentials_path: Optional JSON credentials file; when it yields both keys
            they are passed explicitly, otherwise boto3's own credential lookup applies.
        addressing_style: S3 addressing style used for the default ``Config``.
        botocore_extra: Extra keyword arguments for ``boto3.client``. A truthy
            ``"config"`` entry replaces the default ``Config``; the other entries
            are applied last and so override the values set here.

    Returns:
        The object returned by ``boto3.client("s3", ...)``.
    """
    extras = botocore_extra or {}

    if credentials_path:
        access_key, secret_key = load_creds_from_file(credentials_path)
    else:
        access_key = secret_key = None

    config = extras.get("config")
    if not config:
        config = Config(s3={"addressing_style": addressing_style})

    kwargs = {"region_name": region, "config": config}
    if access_key and secret_key:
        kwargs["aws_access_key_id"] = access_key
        kwargs["aws_secret_access_key"] = secret_key
    for name, value in extras.items():
        if name != "config":
            kwargs[name] = value

    return boto3.client("s3", **kwargs)

S3 functions

In [None]:
def s3_select_to_file(
    *, s3, bucket: str, key: str, sql: str, out_path: str,
    input_is_json_document: bool = True,
    output_record_delimiter: str = "\n",
    log_stats: bool = True,
):
    """
    Run an S3 Select query and stream the JSON result records into a local file.

    Args:
        s3: Client exposing ``select_object_content`` (e.g. a boto3 S3 client).
        bucket: Bucket holding the object to query.
        key: Key of the object to query.
        sql: S3 Select SQL expression.
        out_path: Local file to write; its parent directory is created if needed.
        input_is_json_document: True to read the object as one JSON document,
            False to read it as JSON Lines.
        output_record_delimiter: Delimiter S3 places after each output record.
        log_stats: Whether to log the scanned/processed/returned byte counts.

    Returns:
        dict with ``events`` (event-stream messages seen), ``rows_est`` (records
        written, counted by delimiter), ``bytes_returned`` (from the Stats event,
        or None if none arrived), ``wall_time_s`` and ``out_path``.

    Raises:
        RuntimeError: if the select call fails with a ``ClientError``.
    """
    # S3 Select needs an explicit input format; an empty dict is rejected.
    json_type = "DOCUMENT" if input_is_json_document else "LINES"
    input_ser = {"JSON": {"Type": json_type}}
    output_ser = {"JSON": {"RecordDelimiter": output_record_delimiter}}
    delim = output_record_delimiter.encode("utf-8")
    delim_len = len(delim)

    with timed("S3 Select open"):
        try:
            resp = s3.select_object_content(
                Bucket=bucket, Key=key,
                ExpressionType="SQL", Expression=sql,
                InputSerialization=input_ser,
                OutputSerialization=output_ser,
            )
        except ClientError as e:
            raise RuntimeError(f"S3 Select failed {client_error_msg(e)}") from e

    ensure_dir(parent_dir_of(out_path))
    start = time.perf_counter()
    events = rows = 0
    tail = b""  # bytes after the last complete record seen so far
    bytes_returned = None

    with open(out_path, "wb", buffering=8 * 1024 * 1024) as fout:
        for event in resp["Payload"]:
            events += 1
            rec = event.get("Records")
            if rec:
                # Chunks can split a record, so only flush up to the last delimiter.
                data = tail + rec["Payload"]
                last = data.rfind(delim)
                if last == -1:
                    tail = data
                else:
                    cut = last + delim_len
                    fout.write(memoryview(data)[:cut])
                    # Count up to `cut` so a multi-byte final delimiter is included.
                    rows += data.count(delim, 0, cut)
                    tail = data[cut:]
            elif "Stats" in event:
                d = event["Stats"]["Details"]
                # Always keep the figure for the summary; only the logging is optional.
                bytes_returned = d.get("BytesReturned", bytes_returned)
                if log_stats:
                    logger.info(
                        "Stats: scanned=%s processed=%s returned=%s",
                        fmt_bytes(d.get("BytesScanned", 0)),
                        fmt_bytes(d.get("BytesProcessed", 0)),
                        fmt_bytes(d.get("BytesReturned", 0)),
                    )
            if events % 200 == 0:
                resource_snapshot(f"s3select events={events}")

        if tail:
            # `tail` never contains a full delimiter, so it is one unterminated record.
            fout.write(tail + delim)
            rows += 1

    wall = time.perf_counter() - start
    logger.info("S3 Select complete | events=%d rows~=%.0f time=%.3fs out=%s",
                events, rows, wall, out_path)
    return {"events": events, "rows_est": rows, "bytes_returned": bytes_returned,
            "wall_time_s": wall, "out_path": out_path}


def download_unpack_s3_files(
    *, s3, bucket: str, keys: Iterable[str],
    dest_root: str | None = None,
    overwrite: bool = False,
    preserve_structure: bool = True,
):
    """
    Download specific S3 keys into a local folder and gunzip plain ``.gz`` files.

    Args:
        s3: Client exposing ``download_file(bucket, key, local_path)``.
        bucket: Source bucket.
        keys: Keys to download. Any iterable is accepted (including generators);
            a single string is treated as one key.
        dest_root: Local destination directory; defaults to ``DEST_ROOT``.
        overwrite: When False, keys whose local file already exists are skipped.
        preserve_structure: Keep the key's directory structure under the
            destination; otherwise only the key's base name is used.

    A downloaded file whose name ends with ``.gz`` but not ``.tar.gz`` is
    decompressed next to itself with the ``.gz`` suffix removed, replacing any
    existing file. Skipped files are not decompressed.

    Returns:
        dict with ``downloaded``, ``skipped`` and ``errors`` lists plus
        ``dest_root``. Per-key failures are recorded in ``errors``, not raised.
    """
    # Materialise once: a one-shot iterable has no len(), and a bare string
    # would otherwise be iterated character by character.
    keys = [keys] if isinstance(keys, str) else list(keys)

    root = Path(dest_root or DEST_ROOT)
    ensure_dir(root)

    downloaded, skipped, errors = [], [], []

    logger.info("Starting S3 download | bucket=%s keys=%d dest_root=%s overwrite=%s preserve_structure=%s",
                bucket, len(keys), root, overwrite, preserve_structure)

    for key in keys:
        local_rel = key if preserve_structure else os.path.basename(key)
        local_path = root / local_rel
        ensure_dir(local_path.parent)

        if local_path.exists() and not overwrite:
            skipped.append({"key": key, "path": str(local_path), "reason": "exists"})
            logger.info("Skipping (exists): s3://%s/%s -> %s", bucket, key, local_path)
            continue

        try:
            logger.info("Downloading: s3://%s/%s -> %s", bucket, key, local_path)
            s3.download_file(bucket, key, str(local_path))
            downloaded.append({"key": key, "path": str(local_path)})
        except Exception as e:
            # Best-effort batch: record the failure and carry on with the next key.
            msg = client_error_msg(e)
            logger.error("Download failed: s3://%s/%s -> %s | %s", bucket, key, local_path, msg)
            errors.append({"key": key, "error": msg})
            continue

        name = local_path.name.lower()
        if name.endswith(".gz") and not name.endswith(".tar.gz"):
            target_path = local_path.with_suffix("")  # drops only the trailing .gz
            try:
                logger.info("Decompressing: %s -> %s", local_path, target_path)
                gunzip_file(str(local_path), str(target_path))
            except Exception as e:
                logger.error("Decompression failed for %s: %s", local_path, e)
                errors.append({"key": key, "error": f"gunzip: {e}"})

    logger.info("S3 download complete | downloaded=%d skipped=%d errors=%d dest_root=%s",
                len(downloaded), len(skipped), len(errors), root)

    return {
        "downloaded": downloaded,
        "skipped": skipped,
        "errors": errors,
        "dest_root": str(root),
    }



GeoPandas functions

In [None]:

def _extract_feature_record(obj):
    """Return ``(record, kind)`` for one parsed JSONL object, or ``(None, None)`` if unusable.

    ``kind`` is "feature", "wrapped" or "columnar". A record is only returned
    when it is a dict carrying both "geometry" and "properties".
    """
    if not isinstance(obj, dict):
        return None, None

    def _feature_like(value) -> bool:
        return isinstance(value, dict) and "geometry" in value and "properties" in value

    if obj.get("type") == "Feature" and _feature_like(obj):
        return obj, "feature"
    if _feature_like(obj.get("s")):
        return obj["s"], "wrapped"
    if "_1" in obj and "_2" in obj:
        return {"type": "Feature", "properties": obj["_1"], "geometry": obj["_2"]}, "columnar"
    for value in obj.values():
        if _feature_like(value):
            return value, "wrapped"
    return None, None


def gpd_from_jsonl(
    path: str | None = None,
    *,
    filename: str = "hex8_features.jsonl",
    dest_root: str | None = None,
    crs: str = "EPSG:4326",
    sep: str = ".",
    sample_logs: int = 3,
    log_every_n: int = 500,
) -> gpd.GeoDataFrame:
    """
    Load a GeoDataFrame from JSON Lines where each line is one record in any of:
      • Feature: {"type":"Feature","properties":{...},"geometry":{...}}
      • Wrapped: {"s": {Feature...}} or any wrapper key containing a Feature
      • Columnar: {"_1": properties, "_2": geometry}

    Resolution order for the input:
      1) If `path` is provided, use it (no fallback).
      2) Else use (dest_root or DEST_ROOT)/filename.
      3) Else try ./filename.

    Parameters
    ----------
    path : explicit input file; takes precedence over `filename`/`dest_root`.
    filename, dest_root : used to locate the file when `path` is not given.
    crs : CRS assigned to the resulting GeoDataFrame.
    sep : separator used when flattening nested property keys.
    sample_logs : max number of DEBUG messages per problem type (bad JSON, unusable line).
    log_every_n : emit a DEBUG progress line every N input lines (<= 0 disables).

    Returns
    -------
    GeoDataFrame (CRS=`crs`) with shapely geometry and flattened properties.
    Blank lines, undecodable lines and lines without a usable feature are
    skipped; geometries that are null or cannot be built become None.
    Logs INFO timings and DEBUG progress/resource snapshots (if DEBUG enabled).

    Raises
    ------
    FileNotFoundError if none of the candidate paths exists.
    """
    # --- resolve input path using provided dest_root or global DEST_ROOT fallback ---
    if path:
        candidates = [Path(path)]
    else:
        root = Path(dest_root) if dest_root else Path(globals().get("DEST_ROOT", "../data"))
        candidates = [root / filename, Path(filename)]  # second entry: local fallback

    resolved = next((str(p) for p in candidates if p.exists()), None)
    if not resolved:
        # Name what was actually searched for, not the default filename.
        raise FileNotFoundError(
            f"Could not find {path or filename}. Tried: {', '.join(map(str, candidates))}"
        )

    # --- optional file size for context ---
    try:
        logger.debug("Input JSONL: %s (%s)", resolved, fmt_bytes(Path(resolved).stat().st_size))
    except OSError:
        logger.debug("Input JSONL: %s", resolved)

    # --- timers ---
    t0_wall = time.perf_counter()
    t0_cpu = time.process_time()

    resource_snapshot("gpd_from_jsonl:start")

    feats: list[dict] = []
    wrapped = colstyle = skipped = json_errors = 0
    debug_on = logger.isEnabledFor(logging.DEBUG)
    last_log_wall = time.perf_counter()

    # --- read & parse lines ---
    with open(resolved, "r", encoding="utf-8") as f:
        for ln, raw in enumerate(f, 1):
            s = raw.strip()
            if s:
                try:
                    obj = json.loads(s)
                except (ValueError, RecursionError) as e:
                    json_errors += 1
                    if debug_on and json_errors <= sample_logs:
                        logger.debug("Line %d: JSON decode error: %s", ln, e)
                else:
                    rec, kind = _extract_feature_record(obj)
                    if rec is None:
                        skipped += 1
                        if debug_on and skipped <= sample_logs:
                            logger.debug("Line %d: no usable 'geometry'/'properties' wrapper found", ln)
                    else:
                        feats.append(rec)
                        # Count the shape only for records that were actually kept.
                        if kind == "wrapped":
                            wrapped += 1
                        elif kind == "columnar":
                            colstyle += 1

            # periodic DEBUG progress & resources (on every Nth line, kept or not)
            if debug_on and log_every_n > 0 and ln % log_every_n == 0:
                now = time.perf_counter()
                rate = log_every_n / max(1e-9, (now - last_log_wall))
                last_log_wall = now
                logger.debug("Progress: %d lines, %d features (%.0f l/s)", ln, len(feats), rate)
                resource_snapshot(f"gpd_from_jsonl:line={ln}")

    # --- build geometries ---
    t1 = time.perf_counter()
    geoms = []
    none_geom = 0
    for feat in feats:
        raw_geom = feat.get("geometry")
        geom = None
        if raw_geom is not None:
            try:
                geom = shape(raw_geom)
            except Exception:
                # Malformed geometry: keep the row, but without a geometry.
                geom = None
        if geom is None:
            none_geom += 1
        geoms.append(geom)
    t2 = time.perf_counter()

    # --- flatten properties; null/non-dict properties become an empty record ---
    props = [p if isinstance(p, dict) else {} for p in (feat.get("properties") for feat in feats)]
    df_props = pd.json_normalize(props, sep=sep)
    t3 = time.perf_counter()

    gdf = gpd.GeoDataFrame(df_props, geometry=geoms, crs=crs)
    t4 = time.perf_counter()

    resource_snapshot("gpd_from_jsonl:end-parse+build")

    # --- timings ---
    wall_total = time.perf_counter() - t0_wall
    cpu_total = time.process_time() - t0_cpu
    parse_wall = t1 - t0_wall
    geom_wall = t2 - t1
    norm_wall = t3 - t2
    gdf_wall = t4 - t3

    logger.info(
        "Loaded %d features from %s | wall=%.3fs cpu=%.3fs | parse=%.3fs geom=%.3fs normalize=%.3fs geodf=%.3fs | "
        "wrapped=%d columnar=%d skipped=%d json_errors=%d none_geom=%d",
        len(feats), resolved, wall_total, cpu_total, parse_wall, geom_wall, norm_wall, gdf_wall,
        wrapped, colstyle, skipped, json_errors, none_geom
    )

    return gdf


def _ensure_key_col(gdf: gpd.GeoDataFrame, key: str = "index") -> gpd.GeoDataFrame:
    """
    Return *gdf* with a *key* column.

    If *key* is already a column the frame is returned unchanged; otherwise the
    index is reset and the resulting "index" column is renamed to *key*.
    """
    if key in gdf.columns:
        return gdf
    flattened = gdf.reset_index()
    return flattened.rename(columns={"index": key})

def compare_hex_gdfs_simple(
    left: gpd.GeoDataFrame,
    right: gpd.GeoDataFrame,
    key: str = "index",
    geom_tolerance: float = 0.0,  
    na_equal: bool = True,
) -> Dict[str, pd.DataFrame]:
    """
    Compare two hex GeoDataFrames row by row, matched on *key*.

    Rows present in both frames are compared on ``centroid_lat``,
    ``centroid_lon`` and the active geometry column.

    Args:
        left, right: Frames to compare. Each needs *key* (taken from the index
            if it is not a column), ``centroid_lat``, ``centroid_lon`` and
            ``geometry``.
        key: Name of the join column.
        geom_tolerance: When <= 0, geometries are compared with ``geom_equals``;
            otherwise two geometries match when their ``distance`` is at most
            this value.
        na_equal: Treat a value that is missing on both sides as equal.

    Returns:
        dict of DataFrames:
          - ``only_in_left`` / ``only_in_right``: keys found on one side only
          - ``matches``: keys whose three compared attributes are all equal
          - ``mismatches_long``: one row per differing attribute
            (``key``, ``column``, ``left``, ``right``; geometries as WKT)
          - ``mismatches_wide``: one row per differing key with both sides' values

    Raises:
        ValueError: if either frame lacks a required column.
    """

    left  = _ensure_key_col(left, key)
    right = _ensure_key_col(right, key)

    required = {key, "centroid_lat", "centroid_lon", "geometry"}
    for name, df in (("left", left), ("right", right)):
        missing = sorted(required - set(df.columns))
        if missing:
            raise ValueError(f"{name} GeoDataFrame missing columns: {missing}")


    # Work on trimmed copies holding only the compared columns.
    # NOTE(review): the "<name>_l"/"<name>_r" lookups further down presumably rely on
    # both frames using the same active geometry column name — confirm.
    geom_l = left.geometry.name
    geom_r = right.geometry.name
    L = left[[key, "centroid_lat", "centroid_lon", geom_l]].copy()
    R = right[[key, "centroid_lat", "centroid_lon", geom_r]].copy()

    # Align CRS (reproject right -> left if both set and differ)
    if getattr(left, "crs", None) and getattr(right, "crs", None) and left.crs != right.crs:
        R = gpd.GeoDataFrame(R, geometry=geom_r, crs=right.crs).to_crs(left.crs)

    # Merge on key
    # indicator=True adds the "_merge" column used to split the three cases below.
    m = L.merge(R, on=key, how="outer", suffixes=("_l", "_r"), indicator=True)

    only_in_left  = m.loc[m["_merge"] == "left_only",  [key]].reset_index(drop=True)
    only_in_right = m.loc[m["_merge"] == "right_only", [key]].reset_index(drop=True)
    both = m.loc[m["_merge"] == "both"].copy()

    # Element-wise attr equality
    def _eq(a: pd.Series, b: pd.Series) -> pd.Series:
        # Exact equality; optionally count "missing on both sides" as equal.
        out = (a == b)
        return out | (a.isna() & b.isna()) if na_equal else out

    lat_eq = _eq(both["centroid_lat_l"], both["centroid_lat_r"])
    lon_eq = _eq(both["centroid_lon_l"], both["centroid_lon_r"])

    # Element-wise geometry equality
    g1 = gpd.GeoSeries(both[f"{geom_l}_l"], crs=left.crs)
    g2 = gpd.GeoSeries(both[f"{geom_r}_r"], crs=left.crs)  # already reprojected if needed

    if geom_tolerance <= 0:
        geom_eq = g1.geom_equals(g2)
        if na_equal:
            geom_eq = geom_eq | (g1.isna() & g2.isna())
    else:
        def _within(a: BaseGeometry, b: BaseGeometry) -> bool:
            # Missing geometries only match each other, and only when na_equal is set.
            if a is None or b is None:
                return na_equal and (a is None and b is None)
            try:
                return a.distance(b) <= geom_tolerance
            except Exception:
                # A failed distance computation counts as a mismatch.
                return False
        geom_eq = pd.Series([_within(a, b) for a, b in zip(g1.values, g2.values)], index=both.index)


    # A key matches only when all three attributes are equal.
    all_eq = lat_eq & lon_eq & geom_eq
    matches = both.loc[all_eq, [key]].reset_index(drop=True)
    diffs   = both.loc[~all_eq].copy()


    # Long format: one block of rows per attribute that has at least one difference.
    rows = []
    if (~lat_eq).any():
        t = diffs.loc[diffs.index.intersection(both.index[~lat_eq]), [key, "centroid_lat_l", "centroid_lat_r"]].copy()
        t.insert(1, "column", "centroid_lat")
        t.rename(columns={"centroid_lat_l": "left", "centroid_lat_r": "right"}, inplace=True)
        rows.append(t)
    if (~lon_eq).any():
        t = diffs.loc[diffs.index.intersection(both.index[~lon_eq]), [key, "centroid_lon_l", "centroid_lon_r"]].copy()
        t.insert(1, "column", "centroid_lon")
        t.rename(columns={"centroid_lon_l": "left", "centroid_lon_r": "right"}, inplace=True)
        rows.append(t)
    if (~geom_eq).any():
        t = diffs.loc[diffs.index.intersection(both.index[~geom_eq]), [key, f"{geom_l}_l", f"{geom_r}_r"]].copy()
        t.insert(1, "column", "geometry")
        # Geometries are reported as WKT text in the long table.
        t["left"]  = gpd.GeoSeries(t.pop(f"{geom_l}_l"), crs=left.crs).to_wkt()
        t["right"] = gpd.GeoSeries(t.pop(f"{geom_r}_r"), crs=left.crs).to_wkt()
        rows.append(t[[key, "column", "left", "right"]])

    mismatches_long = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame(columns=[key, "column", "left", "right"])


    # Wide format: one row per differing key with the left/right values side by side.
    if not diffs.empty:
        keep = [key, "centroid_lat_l", "centroid_lat_r", "centroid_lon_l", "centroid_lon_r", f"{geom_l}_l", f"{geom_r}_r"]
        mismatches_wide = diffs[[c for c in keep if c in diffs.columns]].reset_index(drop=True)
    else:
        mismatches_wide = pd.DataFrame(columns=[key])

    return {
        "only_in_left": only_in_left,
        "only_in_right": only_in_right,
        "matches": matches,
        "mismatches_long": mismatches_long,
        "mismatches_wide": mismatches_wide,
    }
import logging

def validate_hex_gdf(gdf, expected_resolution: int = 8):
    """
    Run data quality checks on a GeoDataFrame of H3 polygons.

    Expected columns:
        - index (H3 index)
        - centroid_lat
        - centroid_lon
        - resolution
        - geometry

    Args:
        gdf: GeoDataFrame to check.
        expected_resolution: Resolution value every row should have (default 8).

    Returns:
        dict with ``missing_values`` (per-column NaN counts),
        ``bad_resolution_count``, ``invalid_geometry_count``, ``bad_lat_count``,
        ``bad_lon_count`` and ``duplicate_index_count``.

    Findings are logged through the module logger (WARNING for problems, INFO
    otherwise); nothing is raised for failed checks. A missing expected column
    surfaces as the ``KeyError`` raised by the column lookup.
    """
    logger.info("Running data quality checks...")
    results = {}

    # --- 1) Missing values ---
    n_missing = gdf.isna().sum()
    results["missing_values"] = n_missing.to_dict()
    if n_missing.sum() > 0:
        logger.warning(" Found NaNs:\n%s", n_missing[n_missing > 0])
    else:
        logger.info(" No missing values detected.")

    # --- 2) Resolution consistency ---
    bad_res_count = int((gdf["resolution"] != expected_resolution).sum())
    results["bad_resolution_count"] = bad_res_count
    if bad_res_count:
        logger.warning(" Rows with resolution != %s: %d", expected_resolution, bad_res_count)
    else:
        logger.info(" All rows have resolution = %s.", expected_resolution)

    # --- 3) Geometry validity ---
    invalid_count = int((~gdf.is_valid).sum())
    results["invalid_geometry_count"] = invalid_count
    if invalid_count:
        logger.warning(" Invalid geometries found: %d", invalid_count)
    else:
        logger.info(" All geometries are valid.")

    # --- 4) Centroid plausibility ---
    # NaN centroids compare False here; they are reported under missing values instead.
    lat = gdf["centroid_lat"]
    lon = gdf["centroid_lon"]
    results["bad_lat_count"] = int(((lat < -90) | (lat > 90)).sum())
    results["bad_lon_count"] = int(((lon < -180) | (lon > 180)).sum())

    if results["bad_lat_count"] > 0 or results["bad_lon_count"] > 0:
        logger.warning(
            " Centroids out of bounds: lat=%d, lon=%d",
            results["bad_lat_count"],
            results["bad_lon_count"],
        )
    else:
        logger.info(" All centroids are within valid lat/lon ranges.")

    # --- 5) Duplicate H3 indexes ---
    dupes = int(gdf["index"].duplicated().sum())
    results["duplicate_index_count"] = dupes
    if dupes > 0:
        logger.warning(" Duplicate H3 indexes found: %d", dupes)
    else:
        logger.info(" No duplicate H3 indexes.")

    # --- Summary ---
    logger.info(" Data quality check complete.")
    return results






Local file loading function

In [None]:
def load_project_files(
    file_map: dict[str, str],
    project_root_name: str = "ds_code_challenge",
    inject_globals: bool = True,
):
    """
    Load files from ROOT/data based on a {file_name: variable_name} map.

    ROOT is the nearest ancestor directory (starting from this file's directory,
    or the current working directory when ``__file__`` is not defined) whose
    name equals *project_root_name*. If no ancestor matches, a warning is logged
    and the top-most directory reached is used.

    Supports:
      - .csv / .csv.gz  -> pandas DataFrame
      - .geojson / .geojson.gz -> GeoDataFrame
      - .ods -> pandas DataFrame read with the odf engine

    Args:
        file_map: Maps a file name under ROOT/data to the variable name it is stored under.
        project_root_name: Directory name that identifies the project root.
        inject_globals: Also bind each loaded object to its variable name in
            this module's global namespace.

    Returns:
        dict mapping variable name to the loaded frame. Unsupported file types
        are skipped with a warning; read errors propagate to the caller.
    """
    with timed("load_project_files (resolve ROOT/DATA_DIR)"):
        start_dir = Path(__file__).resolve().parents[0] if "__file__" in globals() else Path().resolve()
        root = start_dir
        while root.name != project_root_name and root.parent != root:
            root = root.parent
        if root.name != project_root_name:
            # The walk hit the filesystem root: make the fallback visible.
            logger.warning(
                "Project root %r not found above %s; using %s as root",
                project_root_name, start_dir, root,
            )
        data_dir = root / "data"

    results: dict[str, pd.DataFrame | gpd.GeoDataFrame] = {}
    unsupported: list[str] = []
    resource_snapshot("load_project_files:start")

    for file_name, var_name in file_map.items():
        file_path = data_dir / file_name
        logger.info("Processing %s...", file_path)

        with timed(f"read:{file_path.name}"):
            # Join all suffixes so compound extensions such as ".csv.gz" are recognised.
            suffix = "".join(file_path.suffixes).lower()

            if suffix.endswith((".csv", ".csv.gz")):
                df = pd.read_csv(file_path)
                results[var_name] = df
                logger.info("→ %s loaded as DataFrame shape=%s", var_name, getattr(df, "shape", None))

            elif suffix.endswith((".geojson", ".geojson.gz")):
                gdf = gpd.read_file(file_path)
                results[var_name] = gdf
                logger.info("→ %s loaded as GeoDataFrame len=%d", var_name, len(gdf))

            elif suffix.endswith(".ods"):
                df = pd.read_excel(file_path, engine="odf")
                results[var_name] = df
                logger.info("→ %s loaded from ODS shape=%s", var_name, getattr(df, "shape", None))

            else:
                logger.warning("Skipping unsupported file type: %s", file_path)
                unsupported.append(file_name)
                continue

            if inject_globals:
                # Deliberate side effect: expose the frame under its mapped name.
                globals()[var_name] = results[var_name]

            resource_snapshot(f"after_load:{file_path.name}")

    if unsupported:
        logger.warning(
            "Loaded %d of %d files; skipped unsupported: %s",
            len(results), len(file_map), ", ".join(unsupported),
        )
    else:
        logger.info("All files loaded successfully.")
    resource_snapshot("load_project_files:end")
    return results



In [None]:
# --- S3 Select run: connection settings, query, and output location ---
REGION = "af-south-1"
BUCKET = "cct-ds-code-challenge-input-data"
FILE_NAME    = "city-hex-polygons-8-10.geojson"  # object key queried with S3 Select
OUT_JSONL = "../data/hex8_features.jsonl"  # local JSON Lines output of the query
CREDENTIALS_PATH = "credentials.json"  # file name the earlier download_file call saves to

ADDRESSING_STYLE = "virtual"

s3 = make_s3_client(region=REGION, credentials_path=CREDENTIALS_PATH, addressing_style=ADDRESSING_STYLE)


# Select each entry of the document's `features` array whose properties.resolution is 8.
SQL = (
    """
    SELECT s
    FROM S3Object[*].features[*] s
    WHERE s.properties.resolution = 8
    """
)
# Stream the query result to OUT_JSONL; the returned summary holds counts and timing.
select_summary = s3_select_to_file(
    s3=s3,
    bucket=BUCKET,
    key=FILE_NAME,
    sql=SQL,
    out_path=OUT_JSONL,
)

In [None]:
# Keys to fetch from BUCKET; plain .gz files are also decompressed after download.
FILES_TO_DOWNLOAD = [
    "city-hex-polygons-8.geojson",
    "city-hex-polygons-8-10.geojson",
    "sr.csv.gz",
    "sr_hex_truncated.csv",
    "sr_hex.csv.gz"
]

# overwrite=False keeps files that already exist locally, so re-running this cell
# does not download them again.
download_summary = download_unpack_s3_files(
    s3=s3,
    bucket=BUCKET,
    keys=FILES_TO_DOWNLOAD,
    dest_root=DEST_ROOT,
    overwrite=False,
    preserve_structure=True,
)

In [None]:
# Reference data: the full resolution-8 GeoJSON, loaded under the name gdf_city_hex_8.
file_map = {
    "city-hex-polygons-8.geojson": "gdf_city_hex_8"
}

# inject_globals defaults to True, so this call defines gdf_city_hex_8 as a global.
load_project_files(file_map)


# Load the S3 Select output (default file hex8_features.jsonl) and compare it
# with the reference frame using exact geometry equality.
gdf = gpd_from_jsonl()
res = compare_hex_gdfs_simple(gdf_city_hex_8, gdf, key="index", geom_tolerance=0.0) 

print("Only in left:", len(res["only_in_left"]))
print("Only in right:", len(res["only_in_right"]))
print("Matches:", len(res["matches"]))



In [None]:
# Run the data quality checks on the S3 Select result; findings are logged and
# the per-check counts are returned as a dict.
validation_results = validate_hex_gdf(gdf)

# Unit tests

In [None]:
%%ipytest -q


# --- Configure logging for the tests ---
# Re-bind the module logger; only call basicConfig when this logger has no
# handlers of its own attached.
logger = logging.getLogger(__name__)
if not logger.handlers:
    logging.basicConfig(level=logging.INFO, format="%(levelname)s | %(message)s")

# --- Helpers ---
def _write_text(p: Path, s: str):
    """Write text *s* to path *p* as UTF-8, creating parent directories as needed."""
    parent = p.parent
    parent.mkdir(parents=True, exist_ok=True)
    p.write_text(s, encoding="utf-8")

def _gz_write_bytes(p: Path, b: bytes):
    """Write bytes *b* gzip-compressed to path *p*, creating parent directories as needed."""
    parent = p.parent
    parent.mkdir(parents=True, exist_ok=True)
    with gzip.open(p, "wb") as gz:
        gz.write(b)

# ---------- Tests ----------

def test_ensure_dir_and_parent_dir_of(tmp_path: Path, caplog):
    """ensure_dir creates nested directories; parent_dir_of returns the containing directory."""
    logging.info("TEST: ensure_dir / parent_dir_of")
    nested = tmp_path / "a" / "b"
    ensure_dir(str(nested))
    assert nested.exists() and nested.is_dir()

    expected_tail = os.sep.join(["a","b"])
    parent = parent_dir_of(str(nested / "file.txt"))
    assert parent.endswith(expected_tail)

@pytest.mark.parametrize(
    "val,expected",
    [
        (0, "0.0B"),
        (1023, "1023.0B"),
        (1024, "1.0KB"),
        (1024**2, "1.0MB"),
        ("oops", "oops"),
    ],
)
def test_fmt_bytes(val, expected):
    """fmt_bytes scales by 1024 and passes non-numeric input through as a string."""
    logging.info("TEST: fmt_bytes")
    formatted = fmt_bytes(val)
    assert formatted == expected

def test_gunzip_file_roundtrip(tmp_path: Path):
    """A gzip-compressed payload comes back byte-for-byte through gunzip_file."""
    logging.info("TEST: gunzip_file")
    payload = b"hello gunzip\n"
    compressed = tmp_path / "data.txt.gz"
    restored = tmp_path / "out.txt"

    _gz_write_bytes(compressed, payload)
    gunzip_file(str(compressed), str(restored))

    assert restored.read_bytes() == payload

def test_client_error_msg_happy_path():
    """An error carrying Code and Message is rendered as "[Code] Message"."""
    logging.info("TEST: client_error_msg")

    class FakeError:
        response = {"Error": {"Code": "NoSuchKey", "Message": "missing"}}

    rendered = client_error_msg(FakeError())
    assert rendered == "[NoSuchKey] missing"

def test_client_error_msg_fallback():
    """Without a `response` attribute the message falls back to str(e)."""
    logging.info("TEST: client_error_msg (fallback)")

    class Plain:
        pass

    err = Plain()
    assert client_error_msg(err) == str(err)

def test_timed_logs(caplog):
    """timed() emits a "<label> took ..." record when enabled."""
    logging.info("TEST: timed")
    caplog.set_level(logging.INFO)

    with timed("quick task", enabled=True):
        time.sleep(0.01)

    messages = [rec.getMessage() for rec in caplog.records]
    assert any("quick task took" in msg for msg in messages)

def test_resource_snapshot_doesnt_crash_and_debugs(caplog, monkeypatch):
    """resource_snapshot logs a DEBUG "RES[<note>]" line when both probes are available."""
    logging.info("TEST: resource_snapshot")
    caplog.set_level(logging.DEBUG)

    class _FakeProcess:
        def __init__(self, *_): pass
        def memory_info(self):
            class M: rss = 123456
            return M()
        def cpu_percent(self, interval=0.0): return 1.23

    class _FakePsutil:
        def Process(self, pid): return _FakeProcess()

    class _FakeTracemalloc:
        def is_tracing(self): return True
        def get_traced_memory(self): return (1000, 2000)

    # Patch the names the function reads from its own globals. monkeypatch
    # restores the originals afterwards, so the fakes do not leak into the
    # rest of the notebook session.
    fn_g = resource_snapshot.__globals__
    monkeypatch.setitem(fn_g, "_psutil", _FakePsutil())
    monkeypatch.setitem(fn_g, "_tracemalloc", _FakeTracemalloc())

    resource_snapshot("unit")
    # Should log DEBUG with "RES[unit]"
    assert any("RES[unit]" in rec.getMessage() for rec in caplog.records if rec.levelno == logging.DEBUG)

def test_resolve_path_uses_rel_or_DEST_ROOT(tmp_path: Path, monkeypatch):
    """resolve_path joins onto `rel` when given and onto DEST_ROOT otherwise."""
    logging.info("TEST: resolve_path")
    # with rel
    p = resolve_path(str(tmp_path), "file.txt")
    assert p == str(tmp_path / "file.txt")

    # without rel -> uses DEST_ROOT. Patch via monkeypatch so the real
    # DEST_ROOT is restored after the test instead of staying pointed at tmp_path.
    monkeypatch.setitem(resolve_path.__globals__, "DEST_ROOT", str(tmp_path / "root"))
    q = resolve_path(None, "x.bin")
    assert q == str((tmp_path / "root" / "x.bin"))

def test_load_creds_from_file_ok(tmp_path: Path, caplog):
    """A well-formed credentials file yields both keys and an INFO record."""
    logging.info("TEST: load_creds_from_file (ok)")
    caplog.set_level(logging.INFO)

    creds_file = tmp_path / "credentials.json"
    _write_text(creds_file, json.dumps({"s3": {"access_key": "AK", "secret_key": "SK"}}))

    ak, sk = load_creds_from_file(str(creds_file))

    assert (ak, sk) == ("AK", "SK")
    messages = [rec.getMessage() for rec in caplog.records]
    assert any("Loaded S3 credentials" in msg for msg in messages)

def test_load_creds_from_file_missing(tmp_path: Path, caplog):
    """A missing credentials file yields (None, None) and an INFO record."""
    logging.info("TEST: load_creds_from_file (missing)")
    caplog.set_level(logging.INFO)

    absent = tmp_path / "nope.json"
    ak, sk = load_creds_from_file(str(absent))

    assert (ak, sk) == (None, None)
    messages = [rec.getMessage() for rec in caplog.records]
    assert any("No credentials file" in msg for msg in messages)

def test_make_s3_client_injects_kwargs(monkeypatch):
    """make_s3_client asks boto3 for an "s3" client with region_name and config set."""
    logging.info("TEST: make_s3_client")
    calls = {}

    class _Dummy:
        pass

    def fake_client(name, **kwargs):
        calls["service"] = name
        calls["kwargs"] = kwargs
        return _Dummy()

    monkeypatch.setattr("boto3.client", fake_client)

    # also provide a fake Config type if the real one isn't present
    fn_globals = make_s3_client.__globals__
    if "Config" not in fn_globals:
        class _C:
            def __init__(self, **kw): self.kw = kw
        fn_globals["Config"] = _C

    c = make_s3_client(region="af-south-1", credentials_path=None, addressing_style="virtual")

    passed = calls["kwargs"]
    assert calls["service"] == "s3"
    assert passed["region_name"] == "af-south-1"
    assert "config" in passed

def test_s3_select_to_file_writes_records(tmp_path: Path, caplog):
    """s3_select_to_file writes all streamed records to disk and reports the row estimate.

    The fake event stream carries five JSON lines split across two Records
    events, followed by a Stats event and an End event.
    """
    logging.info("TEST: s3_select_to_file")
    caplog.set_level(logging.INFO)

    class FakeS3:
        """Stand-in exposing only select_object_content."""

        def select_object_content(self, **kw):
            # Two record chunks (the split falls between lines), then stats and end.
            first_chunk = {"Records": {"Payload": b'{"a":1}\n{"a":2}\n{"a":3}'}}
            second_chunk = {"Records": {"Payload": b'\n{"a":4}\n{"a":5}'}}
            stats = {"Stats": {"Details": {"BytesScanned": 10, "BytesProcessed": 10, "BytesReturned": 10}}}
            return {"Payload": [first_chunk, second_chunk, stats, {"End": {}}]}

    out_file = tmp_path / "out.jsonl"
    result = s3_select_to_file(
        s3=FakeS3(),
        bucket="b",
        key="k",
        sql="SELECT * FROM s3object",
        out_path=str(out_file),
        input_is_json_document=True,
        output_record_delimiter="\n",
    )

    written = out_file.read_text(encoding="utf-8").strip().splitlines()
    assert len(written) == 5
    assert result["rows_est"] == 5

    messages = [rec.getMessage() for rec in caplog.records]
    assert any("Stats: scanned=" in msg for msg in messages)

def test_download_unpack_s3_files_downloads_and_gunzips(tmp_path: Path, caplog):
    """download_unpack_s3_files fetches every key and gunzips plain ``.gz`` files only.

    Three keys are used: a plain file, a ``.gz`` file (expected to be
    decompressed next to the download) and a ``.tar.gz`` archive (expected to
    be left as-is).
    """
    logging.info("TEST: download_unpack_s3_files")
    caplog.set_level(logging.INFO)

    class FakeS3:
        """Stand-in whose download_file writes a local file instead of hitting S3."""

        def __init__(self):
            self.calls = []

        def download_file(self, bucket, key, dest):
            self.calls.append((bucket, key, dest))
            target = Path(dest)
            target.parent.mkdir(parents=True, exist_ok=True)
            # Only plain .gz keys get real gzip content; everything else is text.
            if key.endswith(".tar.gz") or not key.endswith(".gz"):
                _write_text(target, "plain")
            else:
                _gz_write_bytes(target, b"gzdata")

    keys = ["a/b/plain.txt", "a/c/file.txt.gz", "a/d/archive.tar.gz"]
    res = download_unpack_s3_files(s3=FakeS3(), bucket="bucket", keys=keys, dest_root=str(tmp_path))

    # Files exist
    plain_file = tmp_path / "a" / "b" / "plain.txt"
    gunzipped_file = tmp_path / "a" / "c" / "file.txt"
    tar_archive = tmp_path / "a" / "d" / "archive.tar.gz"
    assert plain_file.exists()
    assert gunzipped_file.exists()      # gunzipped
    assert tar_archive.exists()         # not gunzipped due to .tar.gz
    assert len(res["downloaded"]) == 3

    messages = [rec.getMessage() for rec in caplog.records]
    assert any("Decompressing:" in msg for msg in messages)

def test_gpd_from_jsonl_parses_minimal(tmp_path: Path, caplog):
    """Check that gpd_from_jsonl parses the valid lines of a JSONL file.

    The input has three lines: a bare GeoJSON Feature, a Feature nested under
    an ``"s"`` key, and a line that is not JSON. The result must contain the
    expected columns and exactly two rows.
    """
    logging.info("TEST: gpd_from_jsonl")
    # Skip only this test when geopandas is missing. Calling importorskip at
    # decoration time would run during collection and skip (or abort) the
    # whole module, and its return value is never None anyway.
    pytest.importorskip("geopandas", reason="geopandas required")

    data = [
        json.dumps({"type":"Feature","properties":{"index":"hex1","centroid_lat":-33.9,"centroid_lon":18.6,"resolution":8},"geometry":{"type":"Point","coordinates":[18.6,-33.9]}}),
        json.dumps({"s":{"type":"Feature","properties":{"index":"hex2","centroid_lat":-33.8,"centroid_lon":18.7,"resolution":8},"geometry":{"type":"Point","coordinates":[18.7,-33.8]}}}),
        "not json",
    ]
    p = tmp_path / "hex8_features.jsonl"
    _write_text(p, "\n".join(data))
    caplog.set_level(logging.INFO)

    gdf = gpd_from_jsonl(path=str(p))

    assert {"index","centroid_lat","centroid_lon","resolution","geometry"} <= set(gdf.columns)
    assert len(gdf) == 2

def test_compare_hex_gdfs_simple_matches_and_diffs(tmp_path: Path):
    """Check compare_hex_gdfs_simple on two frames that differ in one cell.

    ``right`` is a copy of ``left`` with ``centroid_lon`` changed for ``h2``.
    The result must expose the five expected keys, report ``h1`` as the only
    match, and report ``centroid_lon`` as the only mismatching column.
    """
    logging.info("TEST: compare_hex_gdfs_simple")
    # Skip only this test when geopandas is missing. Calling importorskip at
    # decoration time would run during collection and skip (or abort) the
    # whole module, and its return value is never None anyway.
    gpd = pytest.importorskip("geopandas", reason="geopandas required")
    from shapely.geometry import Polygon

    left = gpd.GeoDataFrame(
        {"index":["h1","h2"], "centroid_lat":[-1.0,-2.0], "centroid_lon":[10.0,20.0]},
        geometry=[Polygon([(0,0),(1,0),(1,1),(0,1)]), Polygon([(2,2),(3,2),(3,3),(2,3)])],
        crs="EPSG:4326",
    )
    right = left.copy()
    # Introduce a difference
    right.loc[1, "centroid_lon"] = 21.0

    res = compare_hex_gdfs_simple(left, right, key="index")

    assert set(res.keys()) == {"only_in_left","only_in_right","matches","mismatches_long","mismatches_wide"}
    assert res["matches"]["index"].tolist() == ["h1"]
    assert res["mismatches_long"]["column"].tolist() == ["centroid_lon"]

def test_validate_hex_gdf_flags_ok(tmp_path: Path, caplog):
    """Check that validate_hex_gdf reports zero problems for a single clean row.

    The frame holds one point feature with resolution 8; the bad-resolution,
    invalid-geometry and duplicate-index counters must all be zero.
    """
    logging.info("TEST: validate_hex_gdf")
    # Skip only this test when geopandas is missing. Calling importorskip at
    # decoration time would run during collection and skip (or abort) the
    # whole module, and its return value is never None anyway.
    gpd = pytest.importorskip("geopandas", reason="geopandas required")
    from shapely.geometry import Point

    gdf = gpd.GeoDataFrame(
        {"index":["x"], "centroid_lat":[-33.9], "centroid_lon":[18.6], "resolution":[8]},
        geometry=[Point(18.6,-33.9)],
        crs="EPSG:4326",
    )
    caplog.set_level(logging.INFO)

    r = validate_hex_gdf(gdf)

    assert r["bad_resolution_count"] == 0
    assert r["invalid_geometry_count"] == 0
    assert r["duplicate_index_count"] == 0

def test_load_project_files_reads_csv_and_geojson(tmp_path: Path, monkeypatch, caplog):
    """Check that load_project_files loads a CSV and, when possible, a GeoJSON file.

    A throwaway project tree is built under ``tmp_path`` and the working
    directory is switched into it before calling load_project_files with a
    ``{file name: result key}`` mapping. The CSV entry must always be present;
    the GeoJSON entry is only checked when it was returned.
    """
    logging.info("TEST: load_project_files")
    caplog.set_level(logging.INFO)

    # Build fake project tree: <tmp>/ds_code_challenge/data/{sr.csv, hex.geojson}
    proj = tmp_path / "ds_code_challenge"
    data = proj / "data"
    data.mkdir(parents=True)
    # Simple CSV
    _write_text(data / "sr.csv", "a,b\n1,2\n")
    # Minimal GeoJSON FeatureCollection
    geojson = {
        "type": "FeatureCollection",
        "features": [
            {"type":"Feature","properties":{"p":1},"geometry":{"type":"Point","coordinates":[0,0]}},
        ],
    }
    _write_text(data / "hex.geojson", json.dumps(geojson))

    # Run from inside the fake project; presumably load_project_files locates
    # the data directory relative to the CWD -- confirm against its implementation.
    monkeypatch.chdir(proj)

    # Only the call itself is guarded: a loader failure that mentions geopandas
    # becomes an xfail, anything else propagates. The assertions live outside
    # the try so a genuine assertion failure can never be downgraded to xfail.
    try:
        out = load_project_files({"sr.csv":"df_sr","hex.geojson":"df_hex"})
    except Exception as e:
        if "geopandas" in str(e).lower():
            pytest.xfail("geopandas not installed; geojson part skipped")
        raise

    assert "df_sr" in out
    # df_hex is only checked when the loader returned it.
    if "df_hex" in out:
        assert hasattr(out["df_hex"], "geometry")
