
# GEOS-LDAS Land Variables: Annual Means & Monthly Climatologies

This notebook reads a collection of **GEOS-LDAS land (`lnd`) NetCDF** files, extracts selected variables, computes **annual means**, **monthly climatologies**, and the **mean and standard deviation across the annual means**, and writes the results to NetCDF files.

**Notes**

- The variables are treated as **means over time** (not totals). For fluxes (e.g., `kg m-2 s-1`, `W m-2`), an *annual mean* is computed; if you need *annual totals*, integrate by time instead of a plain mean.
- Before computing statistics, the notebook masks as missing any value that is non-finite or whose magnitude is `>= 1e14` (to catch large fill values that decoding did not already mask).
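- Outputs are written as:
  - `annual_means.nc`: one file with a `year` dimension
  - `monthly_climatology.nc`: one file with a `month` dimension
  - `grand_mean.nc`: one file holding, for each variable, the mean (`<VAR>_mean`) and sample standard deviation (`<VAR>_std`) across the annual means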


In [None]:

# === User parameters ===

# Glob pattern that selects the input LDAS lnd files (edit this)
INPUT_GLOB = "/discover/nobackup/projects/land_da/CYGNSS_Experiments/OLv8_M36_cd/OLv8_M36_cd/output/SMAP_EASEv2_M36_GLOBAL/cat/ens_avg/Y????/M??/OLv8_M36_cd.tavg24_1d_lnd*.*"  # e.g., "/discover/nobackup/.../lnd/Y*/M*/*.nc4"

# Folder that receives the output files (created if it doesn't exist)
OUT_DIR = "/discover/nobackup/projects/land_da/CYGNSS_Experiments/OLv8_M36_cd/OLv8_M36_cd/output/SMAP_EASEv2_M36_GLOBAL/cat/ens_avg"

# Inclusive time window; set either bound to None to leave that side unfiltered.
START_DATE = "2018-08-01"  # or None
END_DATE = "2024-06-30"  # or None

# Names of the variables to extract and process (one per line)
VARS = [
    "GRN",
    "LAI",
    "GWETPROF",
    "GWETROOT",
    "GWETTOP",
    "PRMC",
    "RZMC",
    "SFMC",
    "PRECTOTCORRLAND",
    "QINFILLAND",
    "SHLAND",
    "LHLAND",
    "EVLAND",
]

# Chunk sizes for xarray/dask, keyed by dimension name
# (None = let xarray decide; or provide a dict like {"time": 64, "tile": 4096})
CHUNKS = dict(time=64, tile=32768)

# Compression settings applied to the NetCDF output (zlib=True gives smaller files)
ENCODING_COMP = dict(zlib=True, complevel=4, shuffle=True)


In [None]:

import os
from glob import glob
import numpy as np
import xarray as xr

# Make sure the output folder exists before anything is written into it.
os.makedirs(OUT_DIR, exist_ok=True)

# Display how many files the pattern currently matches, next to the pattern itself.
(len(glob(INPUT_GLOB)), INPUT_GLOB)


In [None]:

# Resolve the input pattern once; sorting gives a reproducible file order.
files = sorted(glob(INPUT_GLOB))
if not files:
    raise FileNotFoundError(f"No input files found for pattern: {INPUT_GLOB}")

# Open all files as a single dataset, combined along their coordinates.
ds = xr.open_mfdataset(
    files,
    combine="by_coords",
    parallel=True,
    chunks=CHUNKS,
    decode_times=True,
    decode_cf=True,
)

# Keep only the variables of interest that are actually present
vars_present = [v for v in VARS if v in ds.data_vars]
if not vars_present:
    raise ValueError("None of the requested variables are present in the input files.")

# Report requested-but-absent variables instead of dropping them silently.
vars_missing = [v for v in VARS if v not in vars_present]
if vars_missing:
    print(f"Requested variables not found in the input files (skipped): {vars_missing}")

ds = ds[vars_present]

# Optional time subset. A None bound leaves that side of the slice open, so a
# single selection covers "start only", "end only" and "both".
if START_DATE is not None or END_DATE is not None:
    ds = ds.sel(time=slice(START_DATE, END_DATE))

# Mask ridiculously large fill values manually (in case decoding didn't mask all).
# DataArray.where keeps each variable's attributes, so the later keep_attrs=True
# reductions still have them to carry over. The magnitude test alone is enough:
# it is False for NaN and +/-inf, so those are masked as well.
FILL_MAGNITUDE_LIMIT = 1e14
for v in vars_present:
    ds[v] = ds[v].where(np.abs(ds[v]) < FILL_MAGNITUDE_LIMIT)

ds


In [None]:

# Annual means (calendar-year means): group the time steps by calendar year and
# average each group, skipping missing values and keeping variable attributes.
# The grouped result already carries a "year" dimension, so no rename is needed.
# NOTE(review): a year that the subsetted record covers only partly (with the
# START_DATE/END_DATE set in the parameters cell, presumably the first and last
# year) is averaged over the time steps present, i.e. it is a partial-year
# mean -- confirm that this is intended.
annual_means = ds.groupby("time.year").mean("time", skipna=True, keep_attrs=True)
annual_means.attrs.update(ds.attrs)
annual_means


In [None]:

# Monthly climatology across all available years: group the time steps by
# calendar month and average each group, skipping missing values and keeping
# variable attributes. The grouped result already carries a "month" dimension,
# so no rename is needed.
monthly_climo = ds.groupby("time.month").mean("time", skipna=True, keep_attrs=True)
monthly_climo.attrs.update(ds.attrs)
monthly_climo


In [None]:
# === Across-year statistics derived from annual_means ===
# Mean over the "year" dimension, and the sample standard deviation (ddof=1).
grand_mean = annual_means.mean(dim="year", skipna=True, keep_attrs=True)
grand_std = annual_means.std(dim="year", skipna=True, ddof=1, keep_attrs=True)

# Collect both statistics in one Dataset: every variable contributes a
# "<name>_mean" entry followed by a "<name>_std" entry.
gm = xr.Dataset()
for name in annual_means.data_vars:
    for suffix, stats in (("mean", grand_mean), ("std", grand_std)):
        gm[f"{name}_{suffix}"] = stats[name]

# Copy the dataset-level attributes of annual_means onto the combined Dataset.
gm.attrs.update(annual_means.attrs)

In [None]:

def build_encoding(ds_like):
    """Return a per-variable NetCDF encoding dict for ``ds_like``.

    Every name in ``ds_like.data_vars`` is mapped to its own copy of
    ``ENCODING_COMP`` extended with a ``float32`` dtype and a ``_FillValue``
    of ``np.float32(1.0e15)``.
    """
    return {
        name: {
            **ENCODING_COMP,
            "dtype": "float32",
            "_FillValue": np.float32(1.0e15),
        }
        for name in ds_like.data_vars
    }

annual_path = os.path.join(OUT_DIR, "annual_means.nc")
climo_path = os.path.join(OUT_DIR, "monthly_climatology.nc")
grand_path = os.path.join(OUT_DIR, "grand_mean.nc")

# Write every product as float32 NETCDF4. The encoding is always built from the
# dataset that is being written, so its keys match that dataset's variable
# names. This matters for gm, whose variables are named "<name>_mean" and
# "<name>_std" rather than the plain names found in grand_mean.
for out_ds, out_path in (
    (annual_means, annual_path),
    (monthly_climo, climo_path),
    (gm, grand_path),
):
    out_ds.astype("float32").to_netcdf(
        out_path,
        format="NETCDF4",
        encoding=build_encoding(out_ds),
    )

annual_path, climo_path, grand_path



## (Optional) Annual totals for flux variables

If you need **annual totals** instead of **annual means** for flux variables
(e.g., `kg m-2 s-1` or `W m-2`), integrate over time with actual time-step lengths, e.g.:


In [None]:

# Example (commented out; remove the leading "# " on the lines below to use it):
# Each listed flux variable is multiplied by the length of its time step in
# seconds (the difference between consecutive time stamps; the last step reuses
# the length of the one before it), the products are summed per calendar year,
# and the result is written to annual_totals_fluxes.nc in OUT_DIR.
# import numpy as np
# flux_vars = ["PRECTOTCORRLAND", "QINFILLAND", "EVLAND", "SHLAND", "LHLAND"]
# sec_per_timestep = np.diff(ds.time.values).astype("timedelta64[s]").astype(int)
# sec_per_timestep = xr.DataArray(np.append(sec_per_timestep, sec_per_timestep[-1]),
#                                 dims=["time"], coords={"time": ds.time})
# tot = {v: (ds[v]*sec_per_timestep).groupby("time.year").sum("time", skipna=True)
#        for v in flux_vars if v in ds}
# annual_totals = xr.Dataset(tot)
# annual_totals.astype("float32").to_netcdf(os.path.join(OUT_DIR, "annual_totals_fluxes.nc"),
#                                           format="NETCDF4",
#                                           encoding=build_encoding(annual_totals))
# annual_totals
