In [3]:
from pathlib import Path
import numpy as np
import pandas as pd

# ---------------------------------------------------------
# Input locations
# ---------------------------------------------------------
# All three input CSVs live directly under the repo's "processed" folder.
REPO_ROOT     = Path("/mnt/sda-21.8/bdevoe/greenspace")
PROCESSED_DIR = REPO_ROOT.joinpath("processed")

# Raw NDVI summary, land-only NDVI summary, and NDWI summary, in that order.
NDVI_RAW_CSV, NDVI_LAND_CSV, NDWI_CSV = (
    PROCESSED_DIR.joinpath(filename)
    for filename in (
        "tract_ndvi_summer_2021_2025_summary.csv",
        "tract_ndvi_landonly_ndwi0.20_2021_2025.csv",
        "tract_ndwi_summer_2021_2025_summary.csv",
    )
)

# ---------------------------------------------------------
# Load data
# ---------------------------------------------------------
# Read the ID column as text. Left to type inference, read_csv parses an
# all-digit GEOID as an integer and drops its leading zeros before we ever
# get to cast it back to str. Both spellings are listed because a file may
# use either; a dtype key for a column that is absent is ignored.
GEOID_DTYPES = {"GEOID": str, "geoid": str}

# NOTE(review): assumes these files hold 11-digit census-tract GEOIDs (the
# file names say "tract") -- confirm. Padding to this width also repairs IDs
# that an earlier step already wrote out without their leading zeros, so the
# three files still join on the same key.
TRACT_GEOID_WIDTH = 11

print("Loading raw NDVI:", NDVI_RAW_CSV)
ndvi_raw = pd.read_csv(NDVI_RAW_CSV, dtype=GEOID_DTYPES)

print("Loading land-only NDVI:", NDVI_LAND_CSV)
ndvi_land = pd.read_csv(NDVI_LAND_CSV, dtype=GEOID_DTYPES)

print("Loading NDWI:", NDWI_CSV)
ndwi = pd.read_csv(NDWI_CSV, dtype=GEOID_DTYPES)

# Normalize GEOID: one column name, string dtype, zero-padded to full width.
# The loop variable is deliberately not called `df`, which is the name of the
# merged frame built below.
for table in (ndvi_raw, ndvi_land, ndwi):
    if "GEOID" not in table.columns and "geoid" in table.columns:
        table.rename(columns={"geoid": "GEOID"}, inplace=True)
    table["GEOID"] = (
        table["GEOID"].astype(str).str.strip().str.zfill(TRACT_GEOID_WIDTH)
    )

# Merge all three on GEOID. Inner joins keep only tracts present in every
# file; overlapping non-key columns from the later files get a suffix.
df = (
    ndvi_raw
    .merge(ndvi_land, on="GEOID", how="inner", suffixes=("", "_landfile"))
    .merge(ndwi, on="GEOID", how="inner", suffixes=("", "_ndwifile"))
)

print("\nMerged tracts:", len(df))

# ---------------------------------------------------------
# Helper for pairwise correlation + difference summary
# ---------------------------------------------------------
def _ordinal(n):
    """Return *n* with its English ordinal suffix (1 -> '1st', 25 -> '25th')."""
    if 10 <= n % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"


def corr_and_diff(df, col_raw, col_land, label):
    """Print and return agreement statistics for a raw/land-only column pair.

    Rows where either column is missing are dropped first. The report covers
    the number of remaining rows, the Pearson correlation between the two
    columns, and the mean, standard deviation and selected percentiles of
    the difference ``col_land - col_raw``.

    Args:
        df: DataFrame holding both columns.
        col_raw: Name of the raw-summary column.
        col_land: Name of the land-only-summary column.
        label: Heading used in the printed report.

    Returns:
        A dict with keys ``label``, ``n``, ``pearson_r``, ``mean_diff``,
        ``sd_diff`` and ``percentiles`` (percentile -> value), or ``None``
        when the pair is skipped because a column is absent or no row has
        both values.
    """
    if col_raw not in df.columns or col_land not in df.columns:
        print(f"[skip] {label}: missing {col_raw} or {col_land}")
        return None

    sub = df[[col_raw, col_land]].dropna()
    if sub.empty:
        print(f"[skip] {label}: no non-missing data")
        return None

    diff = sub[col_land] - sub[col_raw]

    # One vectorized call for all percentiles instead of one call each.
    quantiles = [1, 5, 25, 50, 75, 95, 99]
    pct_values = np.nanpercentile(diff.to_numpy(), quantiles)

    stats = {
        "label": label,
        "n": len(sub),
        "pearson_r": sub[col_raw].corr(sub[col_land]),
        "mean_diff": diff.mean(),
        "sd_diff": diff.std(),
        "percentiles": {q: float(v) for q, v in zip(quantiles, pct_values)},
    }

    print(f"\n=== {label} ===")
    print(f"Columns: raw = {col_raw}, land = {col_land}")
    print(f"  N tracts:              {stats['n']:d}")
    print(f"  Pearson r(raw, land):  {stats['pearson_r']:.4f}")
    print(f"  Mean diff (land-raw):  {stats['mean_diff']:.4f}")
    print(f"  SD diff (land-raw):    {stats['sd_diff']:.4f}")
    for q, value in stats["percentiles"].items():
        # Width 4 keeps the column alignment of the old "%2dth" label while
        # printing a correct ordinal ("1st" rather than "1th").
        print(f"  {_ordinal(q):>4} pct diff:         {value:.4f}")

    return stats

# ---------------------------------------------------------
# 1) Raw vs land-only NDVI: key stats
# ---------------------------------------------------------
print("\n" + "-" * 29)
print("NDVI raw vs land-only (per-stat correlations)")
print("-" * 29)

# One (raw column, land-only column, report label) triple per summary
# statistic; the land-only column is the raw name with a "_land" suffix.
pairs = [
    (f"{stat}_ndvi", f"{stat}_ndvi_land", f"{stat.capitalize()} NDVI")
    for stat in ("mean", "median", "min", "max", "std")
]

for col_raw, col_land, label in pairs:
    corr_and_diff(df, col_raw, col_land, label)

# ---------------------------------------------------------
# 2) Small correlation matrix across key summaries
# ---------------------------------------------------------
# Candidate columns: raw and land-only NDVI summaries, plus NDWI summaries.
# Any that are absent from the merged frame are simply left out below.
candidate_cols = [
    "mean_ndvi",
    "mean_ndvi_land",
    "median_ndvi",
    "median_ndvi_land",
    "std_ndvi",
    "std_ndvi_land",
    "min_ndvi",
    "min_ndvi_land",
    "max_ndvi",
    "max_ndvi_land",
    "mean_ndwi",
    "median_ndwi",
    "std_ndwi",
]

key_cols = [c for c in candidate_cols if c in df.columns]

print("\n-----------------------------")
print("Correlation matrix for key summaries")
print("-----------------------------")
print("Columns included:", key_cols)

if key_cols:
    corr_mat = df[key_cols].corr()
    # Emit the blank line separately: print("\n", corr_mat) would put the
    # default separator space in front of the frame and push its header row
    # one character out of line with the data rows.
    print()
    print(corr_mat)
else:
    print("No key summary columns found; nothing to correlate.")

print("\nDone.")


Loading raw NDVI: /mnt/sda-21.8/bdevoe/greenspace/processed/tract_ndvi_summer_2021_2025_summary.csv
Loading land-only NDVI: /mnt/sda-21.8/bdevoe/greenspace/processed/tract_ndvi_landonly_ndwi0.20_2021_2025.csv
Loading NDWI: /mnt/sda-21.8/bdevoe/greenspace/processed/tract_ndwi_summer_2021_2025_summary.csv

Merged tracts: 85187

-----------------------------
NDVI raw vs land-only (per-stat correlations)
-----------------------------

=== Mean NDVI ===
Columns: raw = mean_ndvi, land = mean_ndvi_land
  N tracts:              83509
  Pearson r(raw, land):  0.9888
  Mean diff (land-raw):  0.0062
  SD diff (land-raw):    0.0299
   1th pct diff:         0.0000
   5th pct diff:         0.0000
  25th pct diff:         0.0000
  50th pct diff:         0.0000
  75th pct diff:         0.0005
  95th pct diff:         0.0286
  99th pct diff:         0.1427

=== Median NDVI ===
Columns: raw = median_ndvi, land = median_ndvi_land
  N tracts:              83509
  Pearson r(raw, land):  0.9880
  Mean diff 