
# Aridity Indices from GEOS‑LDAS Daily Output (FAO‑56 Penman–Monteith)

This notebook computes annual **UNEP Aridity Index (AI = P / PET)**, **Budyko dryness index (φ = PET / P)**,
**Climatic Moisture Index (CMI = (P − PET)/PET)**, and optional **De Martonne** index from
GEOS‑LDAS daily tile outputs (tavg24). It:
1. Walks a `Y####/M##` directory tree for a specified date range
2. Computes **reference PET (ET₀)** via FAO‑56 Penman–Monteith using daily means
3. Aggregates to **monthly** and **annual** totals
4. Writes results to a single NetCDF file

> Variables expected in the daily `lfs` files: `Tair`, `Qair`, `Wind`, `RefH`, `Psurf`, `SWdown`, `LWdown`, `HLWUP`, `RainfSnowf`.
> Variables expected in the daily `lnd` files: `GRN`, `LAI`, `SWLAND`, `LWLAND`.


In [None]:

# === Parameters (edit these) ===

from pathlib import Path
import pandas as pd

# Directory holding the Y####/M## sub-directories with the daily output files.
BASE = Path("/discover/nobackup/projects/land_da/CYGNSS_Experiments/OLv8_M36_cd/OLv8_M36_cd/output/SMAP_EASEv2_M36_GLOBAL/cat/ens_avg")

# First and last day (inclusive) of the period to process.
START, END = pd.Timestamp("2018-08-01"), pd.Timestamp("2024-06-30")

# Glob patterns applied inside every month directory; the trailing "*.*" keeps
# the match broad enough to cover both .nc and .nc4 names.
LFS_FILE_GLOB = "OLv8_M36_cd.tavg24_1d_lfs*.*"
LND_FILE_GLOB = "OLv8_M36_cd.tavg24_1d_lnd*.*"

# Dask chunk sizes handed to xarray (tune to the memory of the machine).
CHUNKS = dict(time=64, tile=10000)

# Numerical thresholds and physical constants.
EPS_MM = 1e-3     # smallest annual total (mm) accepted as a divisor
ALPHA_REF = 0.23  # albedo of the FAO-56 grass reference surface

# True  -> net radiation is taken from the model output
# False -> net radiation is rebuilt from the model shortwave + longwave terms
USE_MODEL_NET = True

# The output name records which net-radiation choice produced the file.
OUTFILE = BASE / (
    f"aridity_indices_model_net_rad_{START:%Y%m%d}_{END:%Y%m%d}.nc4"
    if USE_MODEL_NET
    else f"aridity_indices_{START:%Y%m%d}_{END:%Y%m%d}.nc4"
)

print("\n".join(map(str, (BASE, START, END, OUTFILE))))


In [None]:

# === Imports & helpers ===
import re
import os
import numpy as np
import xarray as xr
import pandas as pd
import dask

# Matches the 8-digit YYYYMMDD stamp sitting between a "." and a "_" in a file
# name, e.g. the "20221222" in ".20221222_".
DATE_RE = re.compile(r"\.(\d{8})_")

# Values at or above this threshold are treated as fill values.
FV = 1e15


def clean(ds, name):
    """Return ``ds[name]`` with fill values masked.

    Every element that is not strictly below ``FV`` is replaced by the missing
    value that ``.where`` inserts; large fill values of this kind are common in
    GEOS-LDAS outputs.
    """
    values = ds[name]
    is_valid = values < FV
    return values.where(is_valid)

def collect_files(base: Path, start: pd.Timestamp, end: pd.Timestamp, file_glob: str):
    """List the files under ``base/Y*/M*`` dated within ``[start, end]``.

    Each entry matching ``file_glob`` inside a month directory is kept when
    ``DATE_RE`` finds a YYYYMMDD stamp in its name and that date lies in the
    inclusive range. Names without a stamp are skipped.

    Returns the selected paths as strings, sorted.
    """
    month_dirs = [
        month_dir
        for year_dir in sorted(base.glob("Y*/"))
        for month_dir in sorted(year_dir.glob("M*/"))
    ]

    selected = []
    for month_dir in month_dirs:
        for candidate in month_dir.glob(file_glob):
            match = DATE_RE.search(candidate.name)
            if match is None:
                continue  # no date stamp in the name
            stamp = pd.to_datetime(match.group(1), format="%Y%m%d")
            if start <= stamp <= end:
                selected.append(str(candidate))
    return sorted(selected)


In [None]:

# === Collect files in the date range ===
# Two daily streams are read later on: the LFS files (forcing and precipitation)
# and the LND files (GRN, LAI, SWLAND, LWLAND).
lfs_files = collect_files(BASE, START, END, LFS_FILE_GLOB)
print(f"Found {len(lfs_files)} daily LFS files between {START.date()} and {END.date()}.")
if len(lfs_files) == 0:
    raise SystemExit("No LFS files found. Check BASE/START/END/LFS_FILE_GLOB.")

lnd_files = collect_files(BASE, START, END, LND_FILE_GLOB)
print(f"Found {len(lnd_files)} daily LND files between {START.date()} and {END.date()}.")
if len(lnd_files) == 0:
    raise SystemExit("No LND files found. Check BASE/START/END/LND_FILE_GLOB.")

# The two streams are combined day by day, so they must at least have the same length.
if len(lnd_files) != len(lfs_files):
    raise SystemExit(
        f"LFS ({len(lfs_files)}) and LND ({len(lnd_files)}) file counts do not match. "
        "Check LFS_FILE_GLOB/LND_FILE_GLOB."
    )

# A bare expression is rendered only when it is the last statement of a cell,
# so the previews are printed explicitly.
print("First LFS files:", *lfs_files[:5], sep="\n  ")
print("First LND files:", *lnd_files[:5], sep="\n  ")


In [None]:
# === Open datasets with dask chunking ===
def _open_daily_files(files):
    """Open a list of daily files lazily as one dataset concatenated along "time".

    The files are concatenated in the order given (``combine="nested"``) and
    chunked according to ``CHUNKS``.
    """
    return xr.open_mfdataset(
        files,
        combine="nested",
        concat_dim="time",
        parallel=True,
        decode_times=True,
        chunks=CHUNKS,
        engine="netcdf4",
    )


ds = _open_daily_files(lfs_files)
ds_lnd = _open_daily_files(lnd_files)

# Quick peek at both datasets. display() is used because a bare expression in the
# middle of a cell is never rendered.
display(ds, ds_lnd)


In [None]:
# === Attach a time axis taken from the file names ===
# Every file contributes the date written in its own name. Building the axis as
# "first date + 0..N-1 days" would silently shift all later dates as soon as a
# single daily file is missing.

def _dates_from_files(files):
    """Return a DatetimeIndex with the YYYYMMDD stamp parsed from each file name.

    The stamp is located with ``DATE_RE``, the same pattern used when the files
    were collected. The result is in the same order as ``files``.

    Raises
    ------
    ValueError
        If a file name carries no stamp, or if two files share the same date.
    """
    stamps = []
    for path in files:
        fname = os.path.basename(path)
        m = DATE_RE.search(fname)
        if not m:
            raise ValueError(f"No YYYYMMDD date found in {fname}")
        stamps.append(m.group(1))
    dates = pd.to_datetime(stamps, format="%Y%m%d")
    if not dates.is_unique:
        dupes = dates[dates.duplicated()].unique()
        raise ValueError(
            f"{len(dupes)} date(s) occur in more than one file, e.g. {dupes[0].date()}"
        )
    return dates


all_dates = _dates_from_files(lfs_files)   # LFS dates, in file order
lnd_dates = _dates_from_files(lnd_files)   # LND dates, in file order
start_date = all_dates.min()

# The two streams are combined day by day, so they must cover the same days.
_unpaired = all_dates.symmetric_difference(lnd_dates)
if len(_unpaired) > 0:
    raise ValueError(
        f"LFS and LND files do not cover the same days: {len(_unpaired)} date(s) "
        f"appear in only one stream, e.g. {_unpaired[0].date()}"
    )

# Gaps are tolerated (the dates stay correct) but reported, because monthly and
# annual totals built from these days will be short by the missing ones.
_missing = pd.date_range(all_dates.min(), all_dates.max(), freq="D").difference(all_dates)
if len(_missing) > 0:
    print(
        f"WARNING: {len(_missing)} day(s) between {all_dates.min().date()} and "
        f"{all_dates.max().date()} have no file (first: {_missing[0].date()})."
    )

# Assign to the datasets (as datetime64[ns]) and make sure they are time-ordered
ds = ds.assign_coords(time=("time", all_dates.values)).sortby("time")
ds_lnd = ds_lnd.assign_coords(time=("time", lnd_dates.values)).sortby("time")

# Verify both time coordinates
display(ds.time, ds_lnd.time)

In [None]:

# === Daily reference ET (FAO-56 Penman–Monteith) and daily precipitation ===

# --- Fields read from the LFS dataset (daily means, fill values masked) ---
Tair    = clean(ds, "Tair") - 273.15        # K -> °C
qair    = clean(ds, "Qair")                 # specific humidity, kg/kg
wind_z  = clean(ds, "Wind")                 # wind speed at height RefH, m/s
zref    = clean(ds, "RefH").fillna(2.0)     # reference height, m (2 m where missing)
ps_kpa  = clean(ds, "Psurf") / 1000.0       # Pa -> kPa
sw_down = clean(ds, "SWdown")               # W/m2
lw_down = clean(ds, "LWdown")               # W/m2
lw_up   = clean(ds, "HLWUP")                # W/m2
P_rate  = clean(ds, "RainfSnowf")           # kg m-2 s-1 (== mm s-1)

# --- Fields read from the LND dataset ---
greeness = clean(ds_lnd, "GRN")             # unitless [0,1]
lai      = clean(ds_lnd, "LAI")             # m2/m2
net_sw   = clean(ds_lnd, "SWLAND")          # W/m2
net_lw   = clean(ds_lnd, "LWLAND")          # W/m2

# --- Net radiation ---
# In W/m2 at this point: either the model's own net terms, or rebuilt from the
# downward fluxes with the reference albedo (treats LWdown as downward and HLWUP
# as upward emitted longwave).
Rn_W = (
    net_sw + net_lw
    if USE_MODEL_NET
    else (1.0 - ALPHA_REF) * sw_down + lw_down - lw_up
)
Rn_MJ = (Rn_W * 86400.0) / 1e6              # W/m2 -> MJ m-2 day-1

# --- Humidity terms ---
es  = 0.6108 * np.exp(17.27 * Tair / (Tair + 237.3))   # saturation vapour pressure, kPa
ea  = (qair * ps_kpa) / (0.622 + 0.378 * qair)         # actual vapour pressure, kPa
vpd = (es - ea).clip(min=0.0)                          # vapour pressure deficit, kPa

delta = 4098.0 * es / (Tair + 237.3) ** 2              # slope of the es curve, kPa / °C
gamma = 0.000665 * ps_kpa                              # psychrometric constant, kPa / °C

# --- Wind speed brought to 2 m (FAO-56 logarithmic profile) ---
# Heights of 0.5 m or less are replaced by 2 m before the logarithm is taken.
z = xr.where(zref > 0.5, zref, 2.0)
wind_factor = 4.87 / np.log(67.8 * z - 5.42)
u2 = wind_z * wind_factor

# --- FAO-56 Penman–Monteith (mm/day); daily ground heat flux taken as ~0 ---
radiation_term = 0.408 * delta * (Rn_MJ)
aerodynamic_term = gamma * (900.0 / (Tair + 273.0)) * u2 * vpd
pm_denominator = delta + gamma * (1.0 + 0.34 * u2)
ET0_day = (radiation_term + aerodynamic_term) / pm_denominator

# Negative values are set to zero for both daily series.
PET_day = ET0_day.clip(min=0.0)              # mm/day
P_day   = (P_rate * 86400.0).clip(min=0.0)   # mm s-1 -> mm/day

PET_day, P_day


In [None]:
# Sanity check on the daily precipitation: dimension names, then the dtype and
# first three values of its time coordinate.
p_day_time = P_day["time"]
print(P_day.dims)
print(p_day_time.dtype, p_day_time.values[:3])


In [None]:

# === Aggregate to monthly and annual totals, then compute indices ===

# Calendar years that are only partly covered by the daily data (for example the
# first and last year of a record that starts or ends mid-year) yield totals that
# are too small and means that are seasonally biased. Such years are set to NaN
# in every annual field, so they are also left out of the climatologies.
# Set to False to keep them.
REQUIRE_FULL_YEARS = True

# Days of data available per calendar year versus days that year actually has.
_days_present = xr.DataArray(
    np.ones(P_day.sizes["time"], dtype="int64"),
    coords={"time": P_day["time"].values},
    dims="time",
).resample(time="YS").sum()
_year_index = pd.DatetimeIndex(_days_present["time"].values)
_days_in_year = np.where(_year_index.is_leap_year, 366, 365)
full_year = _days_present >= _days_in_year

_years = _year_index.year.to_numpy()
print("Years with complete daily coverage:", _years[full_year.values].tolist())
if REQUIRE_FULL_YEARS and not bool(full_year.all()):
    print("Partial years masked in annual fields:", _years[~full_year.values].tolist())


def _full_years_only(annual):
    """Return ``annual`` with partly covered calendar years set to NaN (if requested)."""
    return annual.where(full_year) if REQUIRE_FULL_YEARS else annual


# Monthly totals. min_count=1 keeps a tile NaN when it has no valid day at all,
# instead of reporting a misleading total of 0 mm.
P_mon   = P_day.resample(time="MS").sum(min_count=1)
PET_mon = PET_day.resample(time="MS").sum(min_count=1)

# Annual (calendar year) totals
P_ann   = _full_years_only(P_mon.resample(time="YS").sum(min_count=1))
PET_ann = _full_years_only(PET_mon.resample(time="YS").sum(min_count=1))

# Indices (annual); each ratio is masked where its divisor is not above EPS_MM
AI_ann  = (P_ann / PET_ann).where(PET_ann > EPS_MM).rename("AI")
PHI_ann = (PET_ann / P_ann).where(P_ann > EPS_MM).rename("Budyko_phi")
CMI_ann = ((P_ann - PET_ann) / PET_ann.where(PET_ann > EPS_MM)).rename("CMI")

# Climatologies (mean over available years)
AI_clim  = AI_ann.mean("time").rename("AI_clim")
PHI_clim = PHI_ann.mean("time").rename("Budyko_phi_clim")
CMI_clim = CMI_ann.mean("time").rename("CMI_clim")

# Optional De Martonne: P / (T + 10), with T the annual mean temperature in °C.
# The divisor is zero or negative for T <= -10 °C, so those tiles are masked.
T_ann = _full_years_only((clean(ds, "Tair") - 273.15).resample(time="YS").mean())
IDM_ann  = (P_ann / (T_ann + 10.0)).where(T_ann > -10.0).rename("DeMartonne")
IDM_clim = IDM_ann.mean("time").rename("DeMartonne_clim")

# Annual greeness and LAI (preserve annual time dimension)
greeness_ann = _full_years_only(greeness.resample(time="YS").mean())
lai_ann = _full_years_only(lai.resample(time="YS").mean())
max_lai_ann = _full_years_only(lai.resample(time="YS").max())

# Climatological mean of annual greeness and LAI (mean over years -> dims: tile)
mean_greeness_clim = greeness_ann.mean(dim="time").rename("mean_greeness_clim")
mean_lai_clim = lai_ann.mean(dim="time").rename("mean_lai_clim")
max_lai_clim = max_lai_ann.mean(dim="time").rename("max_lai_clim")

AI_ann, PHI_ann, CMI_ann


In [None]:
# Run this once the annual fields, their climatologies and P_ann / PET_ann exist.
# dask.persist evaluates the lazy results and hands back equivalents that hold
# the computed data; each name is rebound to its persisted counterpart.
(
    AI_ann, PHI_ann, CMI_ann, IDM_ann,
    AI_clim, PHI_clim, CMI_clim, IDM_clim,
    P_ann, PET_ann,
    mean_greeness_clim, mean_lai_clim, max_lai_clim,
) = dask.persist(
    AI_ann, PHI_ann, CMI_ann, IDM_ann,
    AI_clim, PHI_clim, CMI_clim, IDM_clim,
    P_ann, PET_ann,
    mean_greeness_clim, mean_lai_clim, max_lai_clim,
)

In [None]:

# === Build output dataset and save ===

def _per_tile(name):
    """Return ``ds[name]`` as a field without a time dimension.

    Depending on how the daily files were concatenated, a static per-tile
    variable may have been stacked along "time"; in that case its first time
    slice is used.
    NOTE(review): assumes these fields do not change from day to day - confirm.
    """
    field = ds[name]
    if "time" in field.dims:
        field = field.isel(time=0, drop=True)
    return field


out = xr.Dataset(
    data_vars=dict(
        AI=AI_ann, Budyko_phi=PHI_ann, CMI=CMI_ann,
        DeMartonne=IDM_ann,
        AI_clim=AI_clim, Budyko_phi_clim=PHI_clim, CMI_clim=CMI_clim,
        DeMartonne_clim=IDM_clim,
        P_annual=P_ann.rename("P_annual"),
        PET_annual=PET_ann.rename("PET_annual"),
        mean_greeness_clim=mean_greeness_clim,
        mean_lai_clim=mean_lai_clim,
        max_lai_clim=max_lai_clim
    ),
    coords=dict(
        time=AI_ann.time,
        # ds.sizes is the mapping from dimension name to length.
        tile=ds["tile"] if "tile" in ds.coords else np.arange(ds.sizes["tile"]),
        lat=_per_tile("lat"), lon=_per_tile("lon")
    ),
    attrs=dict(
        description="Aridity indices from GEOS-LDAS daily tile output using FAO-56 Penman–Monteith",
        period_start=str(START.date()), period_end=str(END.date()),
        albedo_reference=float(ALPHA_REF), pet_method="FAO-56 Penman–Monteith",
        # Record which net-radiation formulation went into PET, so the file is
        # self-describing independently of its name.
        net_radiation_source=(
            "model SWLAND + LWLAND" if USE_MODEL_NET
            else "(1 - albedo_reference) * SWdown + LWdown - HLWUP"
        ),
        precip_var="RainfSnowf", note="Annual sums/means over resampled calendar years."
    )
).assign_coords(year=("time", AI_ann["time.year"].values))

# Compress every data variable.
encoding = {v: {"zlib": True, "complevel": 3} for v in out.data_vars}
out.to_netcdf(OUTFILE, encoding=encoding)
print("Wrote:", OUTFILE)
out


In [None]:

# === Quick checks (optional) ===

# 1) Mean over all tiles, per year, for each index (NaN tiles are skipped)
yearly_means = {
    "AI_mean": out["AI"].mean(dim="tile", skipna=True),
    "phi_mean": out["Budyko_phi"].mean(dim="tile", skipna=True),
    "CMI_mean": out["CMI"].mean(dim="tile", skipna=True),
}
ai_mean, phi_mean, cmi_mean = yearly_means.values()
for column, yearly in yearly_means.items():
    display(yearly.to_dataframe(name=column).head())

# 2) Number of tiles with a non-null AI in each year
valid_tiles = out["AI"].notnull().sum(dim="tile")
display(valid_tiles.to_dataframe(name="valid_tiles").head())
