In [2]:

"""
Aggregate NDVI & EVI (2010) to Census tracts (New England)
- Computes mean NDVI and EVI
- Computes NDVI threshold metric: % pixels with NDVI >= 0.65
- Handles EE Int16 x10000 rasters automatically
- Outputs CSV + simple histograms to data/processed

Install once:
    pip install geopandas rasterio rasterstats shapely pyproj tqdm matplotlib
"""

from pathlib import Path
import re
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterstats import zonal_stats
from tqdm import tqdm
import matplotlib.pyplot as plt

# -------------------- CONFIG --------------------
BASE_DIR = Path(r"C:\Users\bdevoe\Desktop\Greenspace (GIS)\data")
YEAR = 2024
THRESH_NDVI = 0.65        # threshold for "percent of pixels above"
ALL_TOUCHED = True        # True = inclusive; False = pixel-center-in-polygon
USPS_STATES = ["CT", "MA", "ME", "NH", "RI", "VT"]  # New England
FIPS_BY_USPS = {"CT":"09","ME":"23","MA":"25","NH":"33","RI":"44","VT":"50"}

# Paths
RAW = BASE_DIR / "raw"
NDVI_DIR = RAW / "NDVI" / f"{YEAR}"
EVI_DIR  = RAW / "EVI"  / f"{YEAR}"
SHP_BASE = RAW / "CENSUS_SHP"
OUT_DIR  = BASE_DIR / "processed"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# -------------------- HELPERS --------------------
def find_geoid_column(gdf: gpd.GeoDataFrame) -> str:
    """Ensure a 'GEOID' string col exists; return its name."""
    for c in ["GEOID", "GEOID10", "GEOID20"]:
        if c in gdf.columns:
            gdf["GEOID"] = gdf[c].astype(str)
            return "GEOID"
    # Try to build from parts
    parts = [("STATEFP","STATEFP10","STATEFP20"), ("COUNTYFP","COUNTYFP10","COUNTYFP20"), ("TRACTCE","TRACTCE10","TRACTCE20")]
    cols = []
    for group in parts:
        for c in group:
            if c in gdf.columns:
                cols.append(c)
                break
    if len(cols) == 3:
        s,c,t = cols
        gdf["GEOID"] = (
            gdf[s].astype(str).str.zfill(2) +
            gdf[c].astype(str).str.zfill(3) +
            gdf[t].astype(str).str.zfill(6)
        )
        return "GEOID"
    raise ValueError("Could not find or construct GEOID in shapefile.")

def dtype_scale_from_raster(ds: rasterio.io.DatasetReader) -> int:
    """Scale factor to convert stored values to real. EE Int16 -> 10000 else 1."""
    return 10000 if ds.dtypes[0].startswith("int") else 1

def match_raster(dirpath: Path, usps: str, kind: str) -> Path:
    """
    Find a raster by pattern. kind in {'NDVI','EVI'}.
    Accepts files like: L57_Summer_NDVI_2010_CT.tif  /  L8_Summer_EVI_2024_MA.tif
    """
    pats = [fr"*_{kind}_{YEAR}_{usps}.tif", fr"*{kind}_{YEAR}_{usps}.tif"]
    for p in pats:
        matches = list(dirpath.glob(p))
        if matches:
            return matches[0]
    raise FileNotFoundError(f"No {kind} raster found for {usps} in {dirpath}")

def build_shp_path(usps: str) -> Path:
    fips = FIPS_BY_USPS[usps]
    shpdir = SHP_BASE / f"tl_{YEAR}_{fips}_tract10"
    shp = shpdir / f"tl_{YEAR}_{fips}_tract10.shp"
    if not shp.exists():
        raise FileNotFoundError(f"Missing shapefile for {usps} at {shp}")
    return shp

def pct_above_threshold_factory(thresh_scaled: float):
    """Return a function compatible with rasterstats(add_stats=...) computing % > threshold."""
    def pct_func(arr):
        a = np.asarray(arr, dtype="float64")
        # rasterstats passes NaN for nodata if nodata kwarg is provided
        valid = ~np.isnan(a)
        if valid.sum() == 0:
            return float("nan")
        return float((a[valid] > thresh_scaled).mean() * 100.0)  # percentage 0..100
    return pct_func

# -------------------- MAIN --------------------
all_frames = []

for usps in USPS_STATES:
    print(f"\n=== {usps} ===")
    shp = build_shp_path(usps)
    gdf = gpd.read_file(shp)
    geocol = find_geoid_column(gdf)

    # NDVI raster
    ndvi_path = match_raster(NDVI_DIR, usps, "NDVI")
    with rasterio.open(ndvi_path) as ds_nd:
        # Reproject tracts to raster CRS
        if gdf.crs != ds_nd.crs:
            gdf = gdf.to_crs(ds_nd.crs)
        nd_nodata = ds_nd.nodata
        nd_scale = dtype_scale_from_raster(ds_nd)
        nd_thresh_scaled = THRESH_NDVI * nd_scale

    # EVI raster
    evi_path = match_raster(EVI_DIR, usps, "EVI")
    with rasterio.open(evi_path) as ds_ev:
        if gdf.crs != ds_ev.crs:
            gdf = gdf.to_crs(ds_ev.crs)
        ev_nodata = ds_ev.nodata
        ev_scale = dtype_scale_from_raster(ds_ev)  # likely 10000 too

    # ---- Zonal stats: NDVI (mean + threshold %) ----
    nd_add = {"pct_ndvi_ge_065": pct_above_threshold_factory(nd_thresh_scaled)}
    nd_kwargs = dict(
        vectors=gdf.geometry,
        raster=str(ndvi_path),
        stats=["mean"],
        add_stats=nd_add,
        all_touched=ALL_TOUCHED,
        geojson_out=False,
        categorical=False,
    )
    if nd_nodata is not None:
        nd_kwargs["nodata"] = nd_nodata

    nd_stats = zonal_stats(**nd_kwargs)
    nd_df = pd.DataFrame(nd_stats)
    # Unscale mean
    nd_df["ndvi_mean"] = nd_df["mean"].astype(float) / nd_scale
    nd_df["ndvi_pct_above_065"] = nd_df["pct_ndvi_ge_065"]  # already 0..100
    nd_df = nd_df[["ndvi_mean", "ndvi_pct_above_065"]]

    # ---- Zonal stats: EVI (mean only) ----
    ev_kwargs = dict(
        vectors=gdf.geometry,
        raster=str(evi_path),
        stats=["mean"],
        all_touched=ALL_TOUCHED,
        geojson_out=False,
        categorical=False,
    )
    if ev_nodata is not None:
        ev_kwargs["nodata"] = ev_nodata

    ev_stats = zonal_stats(**ev_kwargs)
    ev_df = pd.DataFrame(ev_stats)
    ev_df["evi_mean"] = ev_df["mean"].astype(float) / ev_scale
    ev_df = ev_df[["evi_mean"]]

    # Combine
    out = pd.concat([gdf[[geocol]].rename(columns={geocol:"GEOID"}).reset_index(drop=True),
                     nd_df.reset_index(drop=True),
                     ev_df.reset_index(drop=True)], axis=1)
    out["state"] = usps
    out["year"] = YEAR

    # Save per-state CSV
    per_csv = OUT_DIR / f"TRACT_AGG_{YEAR}_{usps}.csv"
    out.to_csv(per_csv, index=False)
    print(f"Saved {len(out)} rows → {per_csv}")
    all_frames.append(out)

# ---- Save combined CSV ----
combined = pd.concat(all_frames, ignore_index=True)
combined_csv = OUT_DIR / f"TRACT_AGG_{YEAR}_NE_ALL.csv"
combined.to_csv(combined_csv, index=False)
print(f"\nCombined saved → {combined_csv}")

# ---- Quick diagnostic plots ----
# 1) Histogram of ndvi_mean
plt.figure()
combined["ndvi_mean"].plot.hist(bins=30)
plt.title(f"NDVI Mean per Tract ({YEAR}, New England)")
plt.xlabel("NDVI mean")
plt.ylabel("Count")
plt.tight_layout()
hist1 = OUT_DIR / f"hist_ndvi_mean_{YEAR}.png"
plt.savefig(hist1)
plt.close()

# 2) Histogram of % pixels with NDVI >= 0.65
plt.figure()
combined["ndvi_pct_above_065"].plot.hist(bins=30)
plt.title(f"% Pixels with NDVI ≥ 0.65 per Tract ({YEAR})")
plt.xlabel("Percent of pixels (0–100)")
plt.ylabel("Count")
plt.tight_layout()
hist2 = OUT_DIR / f"hist_ndvi_pct_ge_065_{YEAR}.png"
plt.savefig(hist2)
plt.close()

print(f"Plots saved → {hist1} and {hist2}")



=== CT ===


FileNotFoundError: Missing shapefile for CT at C:\Users\bdevoe\Desktop\Greenspace (GIS)\data\raw\CENSUS_SHP\tl_2024_09_tract10\tl_2024_09_tract10.shp

In [4]:


"""
Flexible aggregation of NDVI & EVI to Census tracts (New England)
v2 — robust raster matching + graceful skipping of missing states

- Independent control of RASTER_YEAR (NDVI/EVI) and TRACT_YEAR (tracts)
- Mean NDVI, mean EVI, % tract pixels with NDVI >= THRESH_NDVI
- Auto-unscale EE Int16 x10000
- Tolerates filename variations like "ND VI", "ND_VI", ".tiff"
- Skips states cleanly if rasters are missing, and prints a summary table
"""

from pathlib import Path
import re
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterstats import zonal_stats
import matplotlib.pyplot as plt

# -------------------- CONFIG --------------------
BASE_DIR    = Path(r"C:\Users\bdevoe\Desktop\Greenspace (GIS)\data")

RASTER_YEAR = 2024     # NDVI/EVI year
TRACT_YEAR  = 2010     # tract vintage (2010 or 2020)

THRESH_NDVI = 0.65
ALL_TOUCHED = True

USPS_STATES = ["CT", "MA", "ME", "NH", "RI", "VT"]
FIPS_BY_USPS = {"CT":"09","ME":"23","MA":"25","NH":"33","RI":"44","VT":"50"}

SKIP_MISSING = True  # True: continue if a state is missing a raster; False: raise

# -------------------- PATHS --------------------
RAW      = BASE_DIR / "raw"
NDVI_DIR = RAW / "NDVI" / f"{RASTER_YEAR}"
EVI_DIR  = RAW / "EVI"  / f"{RASTER_YEAR}"
SHP_BASE = RAW / "CENSUS_SHP"
OUT_DIR  = BASE_DIR / "processed"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# -------------------- HELPERS --------------------
def geoid_column(gdf: gpd.GeoDataFrame) -> str:
    for c in ["GEOID","GEOID10","GEOID20"]:
        if c in gdf.columns:
            gdf["GEOID"] = gdf[c].astype(str)
            return "GEOID"
    candidates = [
        ("STATEFP","COUNTYFP","TRACTCE"),
        ("STATEFP10","COUNTYFP10","TRACTCE10"),
        ("STATEFP20","COUNTYFP20","TRACTCE20"),
    ]
    for s,c,t in candidates:
        if all(col in gdf.columns for col in [s,c,t]):
            gdf["GEOID"] = (
                gdf[s].astype(str).str.zfill(2) +
                gdf[c].astype(str).str.zfill(3) +
                gdf[t].astype(str).str.zfill(6)
            )
            return "GEOID"
    raise ValueError("Could not find/construct GEOID in shapefile.")

def tract_suffix_from_year(year: int) -> str:
    return "tract20" if year >= 2020 else "tract10"

def tract_shapefile(usps: str) -> Path:
    fips = FIPS_BY_USPS[usps]
    suffix = tract_suffix_from_year(TRACT_YEAR)
    shp_dir = SHP_BASE / f"tl_{TRACT_YEAR}_{fips}_{suffix}"
    shp = shp_dir / f"tl_{TRACT_YEAR}_{fips}_{suffix}.shp"
    if not shp.exists():
        raise FileNotFoundError(f"Missing shapefile for {usps}: {shp}")
    return shp

def normalize_alpha(s: str) -> str:
    """Keep letters only and uppercase: 'L57_Summer_ND VI_2010_CT' -> 'LSUMMERNDVICT' (year removed later)."""
    return "".join(ch for ch in s.upper() if ch.isalpha())

def smart_find_raster(dirpath: Path, usps: str, kind: str, year: int) -> Path | None:
    """
    Find a raster matching (state, kind, year) even if the filename has odd separators.
    kind should be 'NDVI' or 'EVI'.
    Returns a Path or None if not found.
    """
    if not dirpath.exists():
        return None
    target_kind = normalize_alpha(kind)
    target_usps = usps.upper()
    target_year = str(year)

    # First pass: strict patterns
    strict_patterns = [
        f"*_{kind}_{year}_{usps}.tif",
        f"*{kind}_{year}_{usps}.tif",
        f"*_{kind}_{year}_{usps}.tiff",
        f"*{kind}_{year}_{usps}.tiff",
    ]
    for pat in strict_patterns:
        hits = list(dirpath.glob(pat))
        if hits:
            return hits[0]

    # Second pass: tolerant scan
    candidates = []
    for p in dirpath.glob("*.*"):
        name = p.name
        if target_year not in name:
            continue
        if target_usps not in name.upper():
            continue
        norm = normalize_alpha(name)  # letters only
        if target_kind in norm:
            candidates.append(p)

    if candidates:
        # Prefer shorter names (less likely to be wrong) and .tif over .tiff
        candidates.sort(key=lambda x: (len(x.name), 0 if x.suffix.lower()==".tif" else 1, x.name))
        return candidates[0]

    return None

def scale_from_dtype(ds: rasterio.io.DatasetReader) -> int:
    return 10000 if ds.dtypes[0].startswith("int") else 1

def pct_above_factory(thresh_scaled: float):
    def fn(arr):
        a = np.asarray(arr, dtype="float64")
        good = ~np.isnan(a)
        if good.sum() == 0:
            return float("nan")
        return float((a[good] > thresh_scaled).mean() * 100.0)
    return fn

# -------------------- MAIN --------------------
print(f"Aggregating RASTER_YEAR={RASTER_YEAR} vs TRACT_YEAR={TRACT_YEAR}")
summary_rows = []
frames = []

for usps in USPS_STATES:
    nd_path = smart_find_raster(NDVI_DIR, usps, "NDVI", RASTER_YEAR)
    ev_path = smart_find_raster(EVI_DIR,  usps, "EVI",  RASTER_YEAR)

    summary_rows.append({"state": usps, "ndvi_found": nd_path is not None, "evi_found": ev_path is not None})

    if nd_path is None or ev_path is None:
        msg = f"[WARN] {usps}: missing " + ("NDVI " if nd_path is None else "") + ("EVI" if ev_path is None else "")
        if SKIP_MISSING:
            print(msg + " → skipping.")
            continue
        else:
            raise FileNotFoundError(msg)

    print(f"\n=== {usps} ===\nNDVI: {nd_path.name}\nEVI : {ev_path.name}")

    # Load tracts
    tr = gpd.read_file(tract_shapefile(usps))
    geocol = geoid_column(tr)

    # NDVI meta
    with rasterio.open(nd_path) as ds_nd:
        if tr.crs != ds_nd.crs:
            tr = tr.to_crs(ds_nd.crs)
        nd_scale = scale_from_dtype(ds_nd)
        nd_nodata = ds_nd.nodata
    nd_thresh_scaled = THRESH_NDVI * nd_scale

    # NDVI zonal
    nd_stats = zonal_stats(
        vectors=tr.geometry,
        raster=str(nd_path),
        stats=["mean"],
        add_stats={"pct_ndvi_ge_065": pct_above_factory(nd_thresh_scaled)},
        all_touched=ALL_TOUCHED,
        geojson_out=False,
        categorical=False,
        nodata=nd_nodata,
    )
    nd = pd.DataFrame(nd_stats)
    nd["ndvi_mean"] = nd["mean"].astype(float) / nd_scale
    nd["ndvi_pct_above_065"] = nd["pct_ndvi_ge_065"]
    nd = nd[["ndvi_mean", "ndvi_pct_above_065"]]

    # EVI meta + zonal
    with rasterio.open(ev_path) as ds_ev:
        ev_scale  = scale_from_dtype(ds_ev)
        ev_nodata = ds_ev.nodata

    ev_stats = zonal_stats(
        vectors=tr.geometry,
        raster=str(ev_path),
        stats=["mean"],
        all_touched=ALL_TOUCHED,
        geojson_out=False,
        categorical=False,
        nodata=ev_nodata,
    )
    ev = pd.DataFrame(ev_stats)
    ev["evi_mean"] = ev["mean"].astype(float) / ev_scale
    ev = ev[["evi_mean"]]

    # Merge & save per-state
    out = pd.concat(
        [tr[[geocol]].rename(columns={geocol: "GEOID"}).reset_index(drop=True),
         nd.reset_index(drop=True),
         ev.reset_index(drop=True)],
        axis=1,
    )
    out["state"] = usps
    out["raster_year"] = RASTER_YEAR
    out["tract_year"]  = TRACT_YEAR

    csv_state = OUT_DIR / f"TRACT_AGG_R{RASTER_YEAR}_T{TRACT_YEAR}_{usps}.csv"
    out.to_csv(csv_state, index=False)
    print(f"Saved {len(out)} rows → {csv_state}")
    frames.append(out)

# Summary table
summary = pd.DataFrame(summary_rows)
sum_csv = OUT_DIR / f"RASTER_AVAILABILITY_R{RASTER_YEAR}.csv"
summary.to_csv(sum_csv, index=False)
print("\nRaster availability summary:")
print(summary.to_string(index=False))
print(f"Saved → {sum_csv}")

# Combined outputs
if frames:
    combo = pd.concat(frames, ignore_index=True)
    csv_all = OUT_DIR / f"TRACT_AGG_R{RASTER_YEAR}_T{TRACT_YEAR}_NE_ALL.csv"
    combo.to_csv(csv_all, index=False)
    print(f"\nCombined saved → {csv_all}")

    # Histograms
    plt.figure()
    combo["ndvi_mean"].plot.hist(bins=30)
    plt.title(f"NDVI Mean per Tract (rasters {RASTER_YEAR}, tracts {TRACT_YEAR})")
    plt.xlabel("NDVI mean"); plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(OUT_DIR / f"hist_ndvi_mean_R{RASTER_YEAR}_T{TRACT_YEAR}.png"); plt.close()

    plt.figure()
    combo["ndvi_pct_above_065"].plot.hist(bins=30)
    plt.title(f"% Pixels with NDVI ≥ 0.65 per Tract (rasters {RASTER_YEAR}, tracts {TRACT_YEAR})")
    plt.xlabel("Percent (0–100)"); plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(OUT_DIR / f"hist_ndvi_pct_ge_065_R{RASTER_YEAR}_T{TRACT_YEAR}.png"); plt.close()

    print("Plots saved.")
else:
    print("\nNo states processed (missing rasters?). Adjust years or filenames and rerun.")


Aggregating RASTER_YEAR=2024 vs TRACT_YEAR=2010

=== CT ===
NDVI: L8_Summer_NDVI_30m_2024_CT.tif
EVI : L8_Summer_EVI_30m_2024_CT.tif
Saved 833 rows → C:\Users\bdevoe\Desktop\Greenspace (GIS)\data\processed\TRACT_AGG_R2024_T2010_CT.csv

=== MA ===
NDVI: L8_Summer_NDVI_30m_2024_MA.tif
EVI : L8_Summer_EVI_30m_2024_MA.tif
Saved 1478 rows → C:\Users\bdevoe\Desktop\Greenspace (GIS)\data\processed\TRACT_AGG_R2024_T2010_MA.csv

=== ME ===
NDVI: L8_Summer_NDVI_30m_2024_CT.tif
EVI : L8_Summer_EVI_30m_2024_CT.tif
Saved 358 rows → C:\Users\bdevoe\Desktop\Greenspace (GIS)\data\processed\TRACT_AGG_R2024_T2010_ME.csv

=== NH ===
NDVI: L8_Summer_NDVI_30m_2024_NH.tif
EVI : L8_Summer_EVI_30m_2024_NH.tif
Saved 295 rows → C:\Users\bdevoe\Desktop\Greenspace (GIS)\data\processed\TRACT_AGG_R2024_T2010_NH.csv

=== RI ===
NDVI: L8_Summer_NDVI_30m_2024_RI.tif
EVI : L8_Summer_EVI_30m_2024_RI.tif
Saved 244 rows → C:\Users\bdevoe\Desktop\Greenspace (GIS)\data\processed\TRACT_AGG_R2024_T2010_RI.csv

=== VT ===
NDVI