In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import datetime as dt

# ---------- paths ----------
ol_path = "/Users/amfox/Desktop/GEOSldas_diagnostics/test_data/land_sweeper/LS_OLv8_M36/output/SMAP_EASEv2_M36_GLOBAL/cat/OLv8_land_variables_2000_2024_compressed.nc"
da_path = "/Users/amfox/Desktop/GEOSldas_diagnostics/test_data/M21C_land_sweeper/LS_DAv8_M36_v2/LS_DAv8_M36/output/SMAP_EASEv2_M36_GLOBAL/cat/DAv8_land_variables_2000_2024_compressed.nc"

# ---------- load ----------
ds_ol = xr.decode_cf(xr.open_dataset(ol_path))
ds_da = xr.decode_cf(xr.open_dataset(da_path))

# ---------- align months robustly with PeriodIndex ----------
def _first_record_per_month(ds):
    """Sort ``ds`` by time and keep the first record of every calendar month.

    Returns the reduced dataset together with the monthly ``PeriodIndex`` of
    its remaining time steps.  Sorting first matters: the common timestamps
    assigned further down are in chronological order, so the records must be
    in chronological order too or values would be paired with the wrong month.
    """
    ds = ds.sortby('time')
    months = pd.to_datetime(ds['time'].values).to_period('M')
    keep = np.flatnonzero(~months.duplicated(keep='first'))
    return ds.isel(time=keep), months[keep]

ds_ol, idx_ol = _first_record_per_month(ds_ol)
ds_da, idx_da = _first_record_per_month(ds_da)

# intersect months ('YYYY-MM' strings sort chronologically)
commonM = np.intersect1d(idx_ol.astype(str), idx_da.astype(str))
if commonM.size == 0:
    # Explicit raise instead of assert so the check survives `python -O`.
    raise ValueError("No overlapping months between OL and DA.")

m_ol = np.isin(idx_ol.astype(str), commonM)
m_da = np.isin(idx_da.astype(str), commonM)

# Work in float64 so the statistics below do not accumulate float32 error.
ol = ds_ol['SFMC'].isel(time=np.where(m_ol)[0]).astype('float64')
da = ds_da['SFMC'].isel(time=np.where(m_da)[0]).astype('float64')

# assign identical end-of-month timestamps so OL and DA share one time index
common_ts = pd.PeriodIndex(commonM, freq='M').to_timestamp('M')
ol = ol.assign_coords(time=common_ts)
da = da.assign_coords(time=common_ts)

# ---------- optional: restrict to cells finite in both across all months ----------
# After this step every cell is either finite at all times or NaN at all times.
common_mask = np.isfinite(ol).all('time') & np.isfinite(da).all('time')
ol = ol.where(common_mask)
da = da.where(common_mask)

# ---------- fixed climatology baseline (2001–2020) ----------
clim_base = slice('2001-01-01','2020-12-31')
ol_clim = ol.sel(time=clim_base).groupby('time.month').mean('time', skipna=True)
da_clim = da.sel(time=clim_base).groupby('time.month').mean('time', skipna=True)

# anomalies: each run relative to its own monthly climatology
ol_anom = (ol.groupby('time.month') - ol_clim)
da_anom = (da.groupby('time.month') - da_clim)

# ---------- metrics ----------
def _describe(arr, name, long_name, units):
    """Attach a variable name plus long_name/units attributes to ``arr`` in place."""
    arr.name = name
    arr.attrs.update({"long_name": long_name, "units": units})


# Time-mean difference of the raw (non-anomaly) series, DA minus OL.
da_time_mean = da.mean('time')
ol_time_mean = ol.mean('time')
mean_diff = (da_time_mean - ol_time_mean).astype('float64')
_describe(mean_diff, "mean_diff", "DA minus OL mean SFMC", "m3 m-3")

# Anomaly standard deviations; ratios are only kept where both are positive.
std_ol = ol_anom.std('time')
std_da = da_anom.std('time')
safe_std = (std_ol > 0) & (std_da > 0)

# "variance_ratio" is, as its long_name says, a ratio of standard deviations.
std_ratio = std_da / std_ol
vr = std_ratio.where(safe_std)
_describe(vr, "variance_ratio", "Std(DA_anom) / Std(OL_anom)", "1")

# Amplitude change in percent, derived from the ratio above.
pct_raw = (vr - 1.0) * 100.0
pct_amp_change = pct_raw.where(np.isfinite(vr))
_describe(pct_amp_change, "pct_amp_change",
          "Percent change in anomaly std (DA vs OL)", "%")

# Anomaly correlation from raw second moments (no time mean is subtracted).
# NOTE(review): this equals Pearson r only if both anomaly series have zero
# mean over the full record — confirm that is acceptable given the 2001–2020
# baseline is shorter than the record.
cov = (da_anom * ol_anom).mean('time')
da_power = (da_anom**2).mean('time')
ol_power = (ol_anom**2).mean('time')
den = np.sqrt(da_power * ol_power)
den_ok = np.isfinite(den) & (den>0)
anomR = (cov / den).where(den_ok)
_describe(anomR, "anom_correlation", "Anomaly correlation (DA vs OL)", "1")

# RMS difference of the two anomaly series.
anom_gap = da_anom - ol_anom
ubRMSE = np.sqrt((anom_gap**2).mean('time'))
ubRMSE = ubRMSE.where(np.isfinite(ubRMSE))
_describe(ubRMSE, "ubRMSE_anom", "Unbiased RMSE of anomalies (DA vs OL)", "m3 m-3")

# Mean absolute raw difference, scaled by the OL anomaly std.
mean_abs_gap = np.abs(da - ol).mean('time')
nad = (mean_abs_gap / std_ol).where(std_ol > 0)
_describe(nad, "normalized_abs_diff", "Mean |DA-OL| normalized by Std(OL_anom)", "1")

# trend difference (simple OLS slope in m3/m3 per year)
# Decimal-year time axis with every month placed at its midpoint.
t_years = (ol['time'].dt.year + (ol['time'].dt.month - 0.5)/12).astype('float64')

def slope_along_time(a, t=t_years):
    """Return the least-squares slope of ``a`` against ``t`` along 'time'.

    Parameters
    ----------
    a : DataArray with a 'time' dimension matching ``t``.
    t : time axis in years; defaults to the module-level ``t_years``.

    Returns
    -------
    DataArray of slopes (units of ``a`` per year) with 'time' reduced away.
    Any series containing a NaN yields NaN.

    Notes
    -----
    ``t`` is centred, so ``sum(a * t0) == sum((a - mean(a)) * t0)`` and the
    mean of ``a`` need not be removed.  That identity only holds when every
    time step contributes.  The numerator therefore must NOT skip NaNs:
    the default NaN-skipping sum turned fully masked cells into a slope of
    exactly 0, which then looked like "no trend change" in the statistics.
    """
    t0 = t - t.mean().item()
    num = (a * t0).sum('time', skipna=False)
    den = (t0**2).sum('time')
    return num / den

beta_ol = slope_along_time(ol)
beta_da = slope_along_time(da)
dtrend  = (beta_da - beta_ol)
dtrend.name = "delta_trend"
dtrend.attrs.update(long_name="Trend difference: slope_DA - slope_OL", units="m3 m-3 yr-1")

# lag-1 autocorrelation difference on anomalies
def lag1_acorr(a):
    """Return the lag-1 autocorrelation of ``a`` along 'time'.

    Computed as mean(a[t-1] * a[t]) / sqrt(mean(a[t-1]**2) * mean(a[t]**2))
    from raw second moments (no time mean is subtracted).

    The lagged series is built with ``shift`` so that both operands carry the
    same time labels.  Multiplying two plain slices (``a[:-1] * a[1:]``) does
    not work in xarray: arithmetic aligns on the 'time' index with an inner
    join, so each value would be multiplied by itself and the result would
    be ~1 everywhere.
    """
    a1 = a.isel(time=slice(1, None))                   # a[t]
    a0 = a.shift(time=1).isel(time=slice(1, None))     # a[t-1], labelled like a1
    num = (a0 * a1).mean('time')
    den = np.sqrt((a0**2).mean('time') * (a1**2).mean('time'))
    return num / den

rho1_ol = lag1_acorr(ol_anom)
rho1_da = lag1_acorr(da_anom)
drho1 = (rho1_da - rho1_ol)
drho1.name = "delta_rho1"
drho1.attrs.update(long_name="Lag-1 autocorrelation difference (DA-OL) on anomalies", units="1")

# ---------- lat/lon coords if present ----------
# Figure out dims (tile vs y,x)
dims = mean_diff.dims  # expect ('tile',) here

coords = {}
if 'tile' in dims and 'lat' in ds_ol.coords and 'lon' in ds_ol.coords:
    # 1-D coords over 'tile'
    coords['lat'] = ('tile', ds_ol['lat'].values,
                     {'standard_name':'latitude','units':'degrees_north'})
    coords['lon'] = ('tile', ds_ol['lon'].values,
                     {'standard_name':'longitude','units':'degrees_east'})

# Build the output dataset on the *tile* dimension.
# Every variable is stored as float32; astype keeps the long_name/units attrs.
ds_metrics = xr.Dataset(
    data_vars={
        "mean_diff":            mean_diff.astype("float32"),
        "variance_ratio":       vr.astype("float32"),
        # Reuse the already-labelled array: recomputing it inline dropped
        # its long_name/units from the output file.
        "pct_amp_change":       pct_amp_change.astype("float32"),
        "anom_correlation":     anomR.astype("float32"),
        "ubRMSE_anom":          ubRMSE.astype("float32"),
        "normalized_abs_diff":  nad.astype("float32"),
        "delta_trend":          dtrend.astype("float32"),
        "delta_rho1":           drho1.astype("float32"),
    },
    coords=coords,
    attrs={
        "title": "DA vs OL monthly SFMC comparison (tile space, M36 land tiles)",
        "source": "GEOS-LDAS OLv8 and DAv8 monthly SFMC (time x tile)",
        "climatology": "Anomalies relative to each run’s 2001–2020 monthly climatology",
        # Timezone-aware replacement for the deprecated datetime.utcnow().
        "created_utc": f"{dt.datetime.now(dt.timezone.utc):%Y-%m-%dT%H:%MZ}",
        # CF defines this global attribute with a capital "C".
        "Conventions": "CF-1.8",
        "notes": "Metrics computed per tile; lat/lon are 1-D tile coordinates.",
    },
)

# Compression
enc = {v: {"zlib": True, "complevel": 4, "_FillValue": np.float32(np.nan)}
       for v in ds_metrics.data_vars}
for coord_name in ("lat", "lon"):
    if coord_name in ds_metrics.coords:
        enc[coord_name] = {"zlib": True, "complevel": 4}

out_nc = "DA_vs_OL_SFMC_metrics_TILE.nc"
ds_metrics.to_netcdf(out_nc, format="NETCDF4", encoding=enc)
print(f"Wrote {out_nc} (tile-space)")


In [None]:
# ======== HUMAN-READABLE SUMMARY + HOTSPOTS (prints + CSVs) ========
import numpy as np
import pandas as pd

def _flatten_with_coords(da):
    """Flatten a metric DataArray and fetch matching lat/lon arrays.

    Accepts an array with both 'y' and 'x' dims (values and coordinates are
    ravelled) or one with a 'tile' dim (values and coordinates used as-is).
    Latitude and longitude come from the module-level ``ds_metrics``; when
    a coordinate is absent there, a NaN array shaped like the values is
    returned in its place.

    Returns
    -------
    (vals, latv, lonv)

    Raises
    ------
    ValueError
        If ``da`` has neither ('y', 'x') nor 'tile' among its dims.
    """
    dim_names = da.dims
    if "y" in dim_names and "x" in dim_names:
        gridded = True
        vals = da.values.ravel()
    elif "tile" in dim_names:
        gridded = False
        vals = da.values
    else:
        raise ValueError("Unexpected dims; need (y,x) or (tile).")

    def _coord_values(coord_name):
        # Fall back to NaNs when the output dataset carries no such coordinate.
        if coord_name not in ds_metrics.coords:
            return np.full_like(vals, np.nan, dtype=float)
        raw = ds_metrics[coord_name].values
        return raw.ravel() if gridded else raw

    latv = _coord_values("lat")
    lonv = _coord_values("lon")
    return vals, latv, lonv

def _nanmean(a):
    """NaN-ignoring mean as a Python float; NaN for zero-size input."""
    if not np.size(a):
        return np.nan
    return float(np.nanmean(a))


def _nanmedian(a):
    """NaN-ignoring median as a Python float; NaN for zero-size input."""
    if not np.size(a):
        return np.nan
    return float(np.nanmedian(a))


def _nanpct(a, cond):
    """Percentage of entries where ``cond`` holds, counted over finite ``a`` only.

    Entries where ``a`` (or ``cond``) is not finite are excluded from both
    numerator and denominator.  Returns NaN when nothing is left to count.
    """
    values = np.asarray(a)
    flags = np.asarray(cond)
    usable = np.isfinite(values) & np.isfinite(flags)
    if not usable.any():
        return np.nan
    return float(100.0 * flags[usable].mean())

# Metric variables as stored in the output dataset (float32).
md   = ds_metrics["mean_diff"]
vr   = ds_metrics["variance_ratio"]
pct  = ds_metrics["pct_amp_change"]
ar   = ds_metrics["anom_correlation"]
ub   = ds_metrics["ubRMSE_anom"]
nadv = ds_metrics["normalized_abs_diff"]
dtr  = ds_metrics["delta_trend"]
dr1  = ds_metrics["delta_rho1"]

# 1-D views; lat/lon are taken once, from the first call.
md_v, lat_v, lon_v = _flatten_with_coords(md)
vr_v  = _flatten_with_coords(vr)[0]
pct_v = _flatten_with_coords(pct)[0]
ar_v  = _flatten_with_coords(ar)[0]
ub_v  = _flatten_with_coords(ub)[0]
nad_v = _flatten_with_coords(nadv)[0]
dtr_v = _flatten_with_coords(dtr)[0]
dr1_v = _flatten_with_coords(dr1)[0]

# Thresholds you can tweak.
# NOTE: the "% ..." labels below spell these values out literally, so update
# the labels too if a threshold changes.
eps_mean   = 0.005      # m3/m3 "no-meaningful-mean-change"
vr_tol     = 0.10       # ±10% amplitude change
r_good     = 0.90       # anomaly correlation good
ub_lim     = 0.02       # m3/m3 ubRMSE "small"
trend_tol  = 5e-4       # m3/m3/yr
rho1_tol   = 0.05       # lag-1 change

abs_md_v = np.abs(md_v)

# Central statistics first ...
summary = {
    "mean(|DA-OL|) [m3/m3]": _nanmean(abs_md_v),
    "median(|DA-OL|) [m3/m3]": _nanmedian(abs_md_v),
    "mean(variance_ratio)": _nanmean(vr_v),
    "median(variance_ratio)": _nanmedian(vr_v),
    "mean(pct_amp_change) [%]": _nanmean(pct_v),
    "median(pct_amp_change) [%]": _nanmedian(pct_v),
    "mean(anom_correlation)": _nanmean(ar_v),
    "median(anom_correlation)": _nanmedian(ar_v),
    "mean(ubRMSE_anom) [m3/m3]": _nanmean(ub_v),
    "mean(NAD)": _nanmean(nad_v),
    "mean(delta_trend) [m3/m3/yr]": _nanmean(dtr_v),
    "mean(delta_rho1)": _nanmean(dr1_v),
}
# ... then the share of finite cells that fall inside each tolerance.
summary.update({
    "% |mean_diff| <= 0.005": _nanpct(md_v, abs_md_v <= eps_mean),
    "% |VR-1| <= 0.1": _nanpct(vr_v, np.abs(vr_v - 1.0) <= vr_tol),
    "% anomR >= 0.9": _nanpct(ar_v, ar_v >= r_good),
    "% ubRMSE < 0.02": _nanpct(ub_v, ub_v < ub_lim),
    "% |Δtrend| <= 5e-4": _nanpct(dtr_v, np.abs(dtr_v) <= trend_tol),
    "% |Δrho1| <= 0.05": _nanpct(dr1_v, np.abs(dr1_v) <= rho1_tol),
})

# Text summary, one right-aligned label per line.
print("\n=== DA vs OL SFMC: Text Summary ===")
for label, value in summary.items():
    print(f"{label:>30s}: {value:.4g}")


def _hotspot_rows(reason, order):
    """Build one hotspot record (all metrics plus lat/lon) per index in ``order``."""
    return [
        (reason, float(md_v[j]), float(vr_v[j]), float(ar_v[j]),
         float(ub_v[j]), float(pct_v[j]), float(dtr_v[j]), float(dr1_v[j]),
         float(lat_v[j]), float(lon_v[j]))
        for j in order
    ]


# Up to 20 cells per criterion. argsort puts NaNs last, so they only show up
# when fewer than 20 finite values exist.
idx_md = np.argsort(-abs_md_v)[:20]          # largest |mean diff|
idx_ar = np.argsort(ar_v)[:20]               # lowest anomaly correlation
idx_pct = np.argsort(-np.abs(pct_v))[:20]    # largest |% amplitude change|

hotspots = (
    _hotspot_rows("abs_mean_diff", idx_md)
    + _hotspot_rows("low_anomR", idx_ar)
    + _hotspot_rows("large_pct_amp_change", idx_pct)
)

hotspot_columns = [
    "reason","mean_diff","variance_ratio","anom_correlation","ubRMSE_anom",
    "pct_amp_change","delta_trend","delta_rho1","lat","lon"
]
hot_df = pd.DataFrame(hotspots, columns=hotspot_columns)

# Save CSVs
summary_df = pd.DataFrame([summary])
summary_df.to_csv("DA_vs_OL_SFMC_global_summary.csv", index=False)
hot_df.to_csv("DA_vs_OL_SFMC_hotspots.csv", index=False)

print("\nWrote:")
print(" - DA_vs_OL_SFMC_global_summary.csv  (one-line global metrics)")
print(" - DA_vs_OL_SFMC_hotspots.csv        (top changes with lat/lon)")
print("Done.")


In [None]:
import struct

def read_tilecoord(fname):
    """Read GEOS-LDAS tilecoord Fortran binary (little-endian).

    The file is read as a sequence of records, each wrapped in a leading and
    trailing 4-byte integer marker: first a single int32 ``N_tile``, then
    fifteen arrays of ``N_tile`` 4-byte values.  Markers are checked against
    the payload size so a byte-swapped or unrelated file fails immediately
    instead of returning garbage.

    Parameters
    ----------
    fname : path of the tilecoord binary file.

    Returns
    -------
    dict with key ``'N_tile'`` (int) and one array per field; integer fields
    are int32, all other fields are widened to float64.

    Raises
    ------
    ValueError
        If a record marker does not match the expected payload size.
    struct.error
        If the file ends before a marker or the tile count can be read.
    """
    int_fields = {'tile_id', 'typ', 'pfaf', 'i_indg', 'j_indg'}
    fields = ['tile_id','typ','pfaf','com_lon','com_lat','min_lon','max_lon',
              'min_lat','max_lat','i_indg','j_indg','frac_cell','frac_pfaf',
              'area','elev']
    int32_le = struct.Struct('<i')

    def read_int(ifp):
        return int32_le.unpack(ifp.read(int32_le.size))[0]

    def expect_marker(ifp, expected, what):
        found = read_int(ifp)
        if found != expected:
            raise ValueError(
                f"{fname}: bad record marker for {what} "
                f"(expected {expected}, found {found}); the file may be "
                f"big-endian or not a tilecoord file."
            )

    tile_coord = {}
    with open(fname, 'rb') as ifp:
        expect_marker(ifp, int32_le.size, 'N_tile (leading)')
        n_tile = read_int(ifp)
        expect_marker(ifp, int32_le.size, 'N_tile (trailing)')
        tile_coord['N_tile'] = n_tile

        nbytes = n_tile * 4  # every field stores 4 bytes per tile
        for field in fields:
            expect_marker(ifp, nbytes, f'{field} (leading)')
            if field in int_fields:
                raw = np.frombuffer(ifp.read(nbytes), dtype='<i4')
                arr = raw.astype(np.int32)
            else:
                raw = np.frombuffer(ifp.read(nbytes), dtype='<f4')
                arr = raw.astype(np.float64)
            # astype returns a writable copy, detached from the read buffer.
            tile_coord[field] = arr
            expect_marker(ifp, nbytes, f'{field} (trailing)')
    return tile_coord


In [None]:
from geospatial_plotting import plot_region, REGION_BOUNDS

ftc = '/Users/amfox/Desktop/GEOSldas_diagnostics/test_data/land_sweeper/LS_OLv8_M36/output/SMAP_EASEv2_M36_GLOBAL/rc_out/LS_OLv8_M36.ldas_tilecoord.bin'
tc = read_tilecoord(ftc)
print(f"N_tile = {tc['N_tile']}")

n_tile = tc['N_tile']
lat = tc['com_lat']
lon = tc['com_lon']

# Metric shown on the map and the matching colour-bar label.  Keeping the two
# side by side avoids the earlier mismatch, where the anomaly correlation was
# plotted under a "Normalized Abs Diff" label.
plot_var = "anom_correlation"
plot_label = "Anomaly correlation"

# Columns: [value, lon, lat], one row per tile.
# NOTE(review): assumes the tile order of ds_metrics matches the tilecoord
# file; a length mismatch raises on the assignment below — confirm ordering.
map_array = np.full((n_tile, 3), np.nan)
map_array[:, 0] = ds_metrics[plot_var].values
map_array[:, 1] = lon
map_array[:, 2] = lat

maxval = np.nanmax(map_array[:, 0])
minval = np.nanmin(map_array[:, 0])

# Plot group map
fig, ax = plot_region(
    map_array,
    region_bounds=REGION_BOUNDS['global'],
    meanflag=True,
    plot_title=(f'DAv8_M36 - OLv8_M36\n'
                f'(Max: {maxval:.3g} Min: {minval:.3g})'),
    units=plot_label,
    cmin=0.2,
    cmax=1.0,

)
fig.tight_layout()