In [14]:
import logging
# Configure root logging once for the notebook.
# Switch `level` to logging.DEBUG for more detail.
logging.basicConfig(
    format="%(asctime)s | %(levelname)-8s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Logger used by the functions defined in this notebook.
logger = logging.getLogger(__name__)

In [1]:
import pandas as pd
import geopandas as gpd

# File → variable name mapping
file_map = {
    "sr.csv": "df_sr",
    "sr_hex.csv": "df_sr_hex",
    "sr_hex_truncated.csv": "df_sr_hex_truncated",
    "city-hex-polygons-8.geojson": "gdf_city_hex_8"
}


def _load_dataset(file_name: str):
    """Read one input file, picking the reader from the file extension.

    Raises:
        ValueError: if the extension is neither ``.csv`` nor ``.geojson``, so an
            unsupported entry in ``file_map`` cannot be skipped silently.
    """
    if file_name.endswith(".csv"):
        return pd.read_csv(file_name)
    if file_name.endswith(".geojson"):
        return gpd.read_file(file_name)
    raise ValueError(f"Unsupported file type: {file_name}")


# Load the files into a dict keyed by the target variable name
datasets = {}
for file_name, var_name in file_map.items():
    print(f"\nProcessing {file_name}...")
    datasets[var_name] = _load_dataset(file_name)

# Bind the names explicitly instead of writing into globals(), so every variable
# used by later cells is visible as a plain assignment.
df_sr = datasets["df_sr"]
df_sr_hex = datasets["df_sr_hex"]
df_sr_hex_truncated = datasets["df_sr_hex_truncated"]
gdf_city_hex_8 = datasets["gdf_city_hex_8"]

print("\nAll files loaded successfully.")



Processing sr.csv...

Processing sr_hex.csv...

Processing sr_hex_truncated.csv...

Processing city-hex-polygons-8.geojson...

All files loaded successfully.


In [15]:
import time
import numpy as np 
import h3

def assign_h3_level8(
    df: pd.DataFrame,
    lat_col: str = "latitude",
    long_col: str = "longitude",
    threshold: float = 0.05,
    resolution: int = 8,
    early_abort: bool = True,
    out_col: str = "h3_level8_index",
) -> pd.DataFrame:
    """
    Add an H3 cell index column computed from latitude/longitude columns.

    Coordinates are coerced to numbers. Rows whose latitude or longitude is
    missing, non-numeric, non-finite, or outside [-90, 90] / [-180, 180] get
    the sentinel string "0" instead of a cell ID. The input frame is not
    modified; the result is a copy with one extra column.

    Parameters:
      - df: input frame containing `lat_col` and `long_col`.
      - lat_col / long_col: names of the coordinate columns.
      - threshold: maximum tolerated fraction of rows without a cell ID
        (0.05 means 5%).
      - resolution: H3 resolution passed to `h3.latlng_to_cell`.
      - early_abort: when True, raise before computing any cell if the share
        of invalid coordinates already exceeds `threshold`. When False the
        same limit is still enforced after the cells have been computed.
      - out_col: name of the column to write. Defaults to "h3_level8_index";
        pass another name when using a resolution other than 8.

    Returns:
      A copy of `df` with `out_col` added.

    Raises:
      - ValueError: if `lat_col` or `long_col` is not a column of `df`.
      - RuntimeError: if the fraction of rows without a cell ID exceeds
        `threshold`.

    Logs:
      - Row counts and join stats (DEBUG)
      - Total elapsed time from `time.perf_counter` (INFO)
    """
    # ---- Global timer start ----
    t0 = time.perf_counter()

    if lat_col not in df.columns or long_col not in df.columns:
        raise ValueError(f"DataFrame must have '{lat_col}' and '{long_col}' columns.")

    df_out = df.copy()

    # Coerce to plain float64 arrays. Passing dtype/na_value explicitly keeps
    # nullable or extension-typed columns from producing an object array that
    # np.isfinite cannot handle.
    lat = pd.to_numeric(df_out[lat_col], errors="coerce").to_numpy(
        dtype="float64", na_value=np.nan
    )
    long = pd.to_numeric(df_out[long_col], errors="coerce").to_numpy(
        dtype="float64", na_value=np.nan
    )

    valid = (
        np.isfinite(lat) & np.isfinite(long) &
        (lat >= -90.0) & (lat <= 90.0) &
        (long >= -180.0) & (long <= 180.0)
    )
    n = lat.shape[0]
    n_invalid = int((~valid).sum())
    fail_rate_pre = (n_invalid / n) if n > 0 else 0.0

    if early_abort and fail_rate_pre > threshold:
        raise RuntimeError(
            f"Join failure rate {fail_rate_pre:.2%} exceeds threshold {threshold:.2%} "
            f"(invalid coords: {n_invalid}/{n}) after validation."
        )

    # ---- Phase: H3 compute ----
    # Every row starts as the "0" sentinel; only valid rows are overwritten.
    out = np.empty(n, dtype=object)
    out.fill("0")

    if valid.any():
        # tolist() hands native Python floats to h3 rather than NumPy scalars.
        lat_v = lat[valid].tolist()
        long_v = long[valid].tolist()
        out[valid] = [
            h3.latlng_to_cell(la, lo, resolution) for la, lo in zip(lat_v, long_v)
        ]

    # ---- Phase: assign to DataFrame ----
    df_out[out_col] = out

    # ---- Join stats ----
    failed = int((df_out[out_col] == "0").sum())
    success = n - failed
    fail_rate_post = failed / n if n > 0 else 0.0

    logger.debug("Total records: %d", n)
    logger.debug("Assigned hex IDs: %d", success)
    logger.debug("Failed to join: %d (%.2f%%)", failed, fail_rate_post * 100)

    # ---- Timing summary ----
    t1 = time.perf_counter()
    logger.info("Total elapsed: %.3f", t1 - t0)

    # ---- Threshold check ----
    # Enforced regardless of `early_abort`, which only controls the pre-compute check.
    if fail_rate_post > threshold:
        logger.error(
            "Post-compute failure rate %.2f%% exceeds threshold %.2f%%",
            fail_rate_post * 100,
            threshold * 100,
        )
        raise RuntimeError("Too many failed joins")

    return df_out


In [None]:
import pandas as pd

def compare_h3_by_notification(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    key: str = "notification_number",
    col: str = "h3_level8_index",
    coerce_to_str: bool = True,   # normalize dtypes before compare
    na_equal: bool = True,        # treat NaN==NaN as equal
):
    """
    Outer-join two frames on `key` and classify every key by how `col` compares.

    Parameters:
      - df1, df2: frames that both contain `key` and `col`.
      - key: join column.
      - col: column whose values are compared.
      - coerce_to_str: cast non-missing values of `col` to str on both sides
        before comparing; missing values are left as missing.
      - na_equal: count a pair of missing values as a match.

    Returns a dict of DataFrames (each with a fresh 0..n-1 index):
      - only_in_df1: keys found only in df1
      - only_in_df2: keys found only in df2
      - matches: keys found in both whose values agree
      - mismatches: keys found in both whose values differ, with the two
        values in `<col>_left` / `<col>_right`

    Raises ValueError when `key` or `col` is missing from either frame.
    """
    requirements = (
        (key, f"Key '{key}' must exist in both DataFrames."),
        (col, f"Column '{col}' must exist in both DataFrames."),
    )
    for needed, message in requirements:
        if not (needed in df1.columns and needed in df2.columns):
            raise ValueError(message)

    def _prepare(frame: pd.DataFrame) -> pd.DataFrame:
        # Work on a two-column copy so the caller's frame is never modified.
        subset = frame[[key, col]].copy()
        if coerce_to_str:
            values = subset[col]
            # Keep missing entries as-is; stringify everything else.
            subset[col] = values.astype("object").where(values.isna(), values.astype(str))
        return subset

    col_a = f"{col}_df1"
    col_b = f"{col}_df2"

    joined = _prepare(df1).merge(
        _prepare(df2),
        how="outer",
        on=key,
        suffixes=("_df1", "_df2"),
        indicator=True,
    )
    origin = joined["_merge"]

    shared = joined.loc[origin == "both", [key, col_a, col_b]].copy()
    values_a = shared[col_a]
    values_b = shared[col_b]

    same = values_a == values_b
    if na_equal:
        same = same | (values_a.isna() & values_b.isna())

    differing = shared.loc[~same, [key, col_a, col_b]].rename(
        columns={col_a: f"{col}_left", col_b: f"{col}_right"}
    )

    buckets = {
        "only_in_df1": joined.loc[origin == "left_only", [key]],
        "only_in_df2": joined.loc[origin == "right_only", [key]],
        "matches": shared.loc[same, [key]],
        "mismatches": differing,
    }
    return {name: frame.reset_index(drop=True) for name, frame in buckets.items()}


In [21]:
# Compute the hex index on a copy of the service requests, then remove the
# "Unnamed: 0" column from the result.
# NOTE(review): threshold=0.226 presumably reflects the known share of records
# without usable coordinates — confirm, and consider naming it as a constant.
df_sr2 = assign_h3_level8(
    df_sr,
    lat_col="latitude",
    long_col="longitude",
    threshold=0.226,
).drop(columns=["Unnamed: 0"])

2025-08-18 23:50:59 | INFO     | Total elapsed: 0.591


In [13]:


# Compare the freshly computed hex indices with the reference frame, key by key.
result = compare_h3_by_notification(
    df_sr2,
    df_sr_hex,
    key="notification_number",
    col="h3_level8_index",
)

# Report how many keys fell into each comparison bucket.
for label, bucket in (
    ("Only in df1:", "only_in_df1"),
    ("Only in df2:", "only_in_df2"),
    ("Matches    :", "matches"),
    ("Mismatches :", "mismatches"),
):
    print(label, len(result[bucket]))


Only in df1: 0
Only in df2: 0
Matches    : 941634
Mismatches : 0
