In [None]:
import xarray as xr
import numpy as np
import os
from datetime import datetime

import sys;       sys.path.append('../util/shared/python/')
from read_GEOSldas          import read_tilecoord

from geospatial_plotting import plot_region, REGION_BOUNDS

In [None]:
def build_lsm_dataset(root_dir, file_prefix, varnames, start_year=2000, end_year=2024):
    """Stack monthly ``tavg24_1d_lnd_Nt`` files into one dataset along "time".

    One file per calendar month is looked up at
    ``<root_dir>/Y<yyyy>/M<mm>/<file_prefix>.tavg24_1d_lnd_Nt.monthly.<yyyymm>.nc4``.
    Months whose file does not exist are skipped.

    Parameters
    ----------
    root_dir : str
        Directory containing the ``Y<yyyy>/M<mm>`` sub-directories.
    file_prefix : str
        Leading part of every file name.
    varnames : list of str
        Variables to keep; names absent from a file are ignored for that file.
    start_year, end_year : int
        First and last year to scan (both inclusive).

    Returns
    -------
    xarray.Dataset
        The requested variables concatenated along "time". The "time"
        coordinate is set to the first day of each month that was found,
        "lat"/"lon" from the first file are attached along the "tile"
        dimension, and values that are not below 1e10 are replaced by NaN.

    Raises
    ------
    FileNotFoundError
        If no monthly file exists in the requested year range.
    """
    all_files = []
    all_dates = []

    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            filename = f"{file_prefix}.tavg24_1d_lnd_Nt.monthly.{year:04d}{month:02d}.nc4"
            fpath = os.path.join(
                root_dir,
                f"Y{year:04d}",
                f"M{month:02d}",
                filename
            )
            if os.path.exists(fpath):
                all_files.append(fpath)
                all_dates.append(np.datetime64(f"{year:04d}-{month:02d}-01"))

    print(f"Found {len(all_files)} files.")

    # Fail with a clear message instead of an IndexError on all_files[0] below.
    if not all_files:
        raise FileNotFoundError(
            f"No '{file_prefix}.tavg24_1d_lnd_Nt.monthly.*.nc4' files found under "
            f"{root_dir!r} for years {start_year}-{end_year}"
        )

    # Read the static lat/lon values from the first file while it is still
    # open, so they do not depend on the file handle after the `with` exits.
    with xr.open_dataset(all_files[0]) as ds_static:
        lat_values = ds_static["lat"].values
        lon_values = ds_static["lon"].values

    # Keep only the requested variables that are present in each file
    def _preprocess(ds):
        return ds[[v for v in varnames if v in ds.variables]]

    # Load all time-varying data, concatenated in the order of all_files
    ds = xr.open_mfdataset(
        all_files,
        combine="nested",
        concat_dim="time",
        parallel=True,
        engine="netcdf4",
        preprocess=_preprocess
    )

    # Assign fixed lat/lon and real time.
    # NOTE(review): this assumes every file contributes exactly one time step,
    # so that len(all_dates) matches the concatenated "time" length - confirm.
    ds = ds.assign_coords({
        "time": ("time", np.array(all_dates, dtype="datetime64[ns]")),
        "lat": ("tile", lat_values),
        "lon": ("tile", lon_values)
    })

    # Mask invalid fill values (anything not below 1e10 becomes NaN)
    for var in varnames:
        if var in ds:
            ds[var] = ds[var].where(ds[var] < 1e10)

    return ds

In [None]:
# Variables extracted from both experiments.
lsm_varnames = ["SFMC", "RZMC", "PRECTOTCORRLAND", "FRLANDSNO"]

# Open-loop (OL) experiment.
ds_ol = build_lsm_dataset(
    root_dir="/discover/nobackup/projects/land_da/Experiment_archive/M21C_land_sweeper_OLv8_M36/LS_OLv8_M36/output/SMAP_EASEv2_M36_GLOBAL/cat/ens_avg",
    file_prefix="LS_OLv8_M36",
    varnames=list(lsm_varnames)
)

# Data-assimilation (DA) experiment. Built once only: the original cell
# repeated this call with identical arguments, doubling the load time.
ds_da = build_lsm_dataset(
    root_dir="/discover/nobackup/projects/land_da/M21C_land_sweeper/LS_DAv8_M36_v2/LS_DAv8_M36/output/SMAP_EASEv2_M36_GLOBAL/cat/ens_avg",
    file_prefix="LS_DAv8_M36",
    varnames=list(lsm_varnames)
)

In [None]:
# zlib compression applied to every data variable (level 4, the author's
# chosen size/speed compromise)
comp = {"zlib": True, "complevel": 4}

# Write each experiment to its own compressed netCDF file
for dataset, out_name in (
    (ds_ol, "OLv8_land_variables_2000_2024_compressed.nc"),
    (ds_da, "DAv8_land_variables_2000_2024_compressed.nc"),
):
    encoding = dict.fromkeys(dataset.data_vars, comp)
    dataset.to_netcdf(out_name, encoding=encoding)

In [None]:
# Locations of the previously saved, compressed OL and DA datasets
ol_nc_path = "/Users/amfox/Desktop/GEOSldas_diagnostics/test_data/land_sweeper/LS_OLv8_M36/output/SMAP_EASEv2_M36_GLOBAL/cat/OLv8_land_variables_2000_2024_compressed.nc"
da_nc_path = "/Users/amfox/Desktop/GEOSldas_diagnostics/test_data/M21C_land_sweeper/LS_DAv8_M36_v2/LS_DAv8_M36/output/SMAP_EASEv2_M36_GLOBAL/cat/DAv8_land_variables_2000_2024_compressed.nc"

ds_ol = xr.open_dataset(ol_nc_path)
ds_da = xr.open_dataset(da_nc_path)

In [None]:
# Text summary of each loaded dataset, OL first and then DA
for loaded in (ds_ol, ds_da):
    print(loaded)

In [None]:
# Inspect the time coordinate of the OL dataset
print(ds_ol["time"])


In [None]:
# Reference period used for the monthly climatology (mean and std).
clim_period = slice("2001-01-01", "2020-12-31")
months = np.arange(1, 13)

variables = ["SFMC", "RZMC", "PRECTOTCORRLAND", "FRLANDSNO"]

# Per-variable results, keyed by variable name.
clim_ol = {}
clim_da = {}
std_ol = {}
std_da = {}
z_var_ol = {}
z_var_da = {}
z_var_diff = {}
z_var_diff_percent = {}


def _interannual_z_variance(da, clim_mean, clim_std):
    """Variance across years of the annual-mean standardized monthly anomaly.

    Parameters
    ----------
    da : xarray.DataArray
        Monthly series with a "time" dimension.
    clim_mean, clim_std : xarray.DataArray
        Monthly climatological mean and standard deviation, indexed by "month".

    Returns
    -------
    xarray.DataArray
        ``da`` standardized month by month, averaged per calendar year, then
        reduced to its variance over the years.
    """
    # A zero climatological std becomes NaN so the division cannot yield inf.
    safe_std = clim_std.where(clim_std > 0)

    z_by_month = []
    for m in months:
        in_month = da.sel(time=da.time.dt.month == m)
        z_by_month.append((in_month - clim_mean.sel(month=m)) / safe_std.sel(month=m))

    # Combine across months and restore chronological order
    z_monthly = xr.concat(z_by_month, dim="time").sortby("time")

    # Annual means via the calendar year rather than resample(time="1Y"):
    # the "Y" offset alias is deprecated in recent pandas, while grouping on
    # "time.year" gives the same per-year means on any version.
    z_annual = z_monthly.groupby("time.year").mean("time")
    return z_annual.var("year")


for var in variables:
    ref_ol = ds_ol[var].sel(time=clim_period)
    ref_da = ds_da[var].sel(time=clim_period)

    clim_ol[var] = ref_ol.groupby("time.month").mean("time")
    clim_da[var] = ref_da.groupby("time.month").mean("time")

    std_ol[var] = ref_ol.groupby("time.month").std("time")
    std_da[var] = ref_da.groupby("time.month").std("time")

    # Standardization uses the reference-period statistics but is applied to
    # the full record of each experiment.
    z_var_ol[var] = _interannual_z_variance(ds_ol[var], clim_ol[var], std_ol[var])
    z_var_da[var] = _interannual_z_variance(ds_da[var], clim_da[var], std_da[var])
    z_var_diff[var] = z_var_da[var] - z_var_ol[var]

    # Percent change is undefined where the OL variance is zero; mask those
    # tiles so the result is NaN instead of +/-inf.
    z_var_diff_percent[var] = (
        100 * z_var_diff[var] / z_var_ol[var].where(z_var_ol[var] != 0)
    )


In [None]:
var = "RZMC"

# Shape plus NaN-aware max, min and mean of the DA - OL variance difference
diff_values = z_var_diff[var].values
print(diff_values.shape)
for nan_stat in (np.nanmax, np.nanmin, np.nanmean):
    print(nan_stat(diff_values))

In [None]:

# Tile coordinate file of the OL experiment
ftc = '/Users/amfox/Desktop/GEOSldas_diagnostics/test_data/land_sweeper/LS_OLv8_M36/output/SMAP_EASEv2_M36_GLOBAL/rc_out/LS_OLv8_M36.ldas_tilecoord.bin'
tc = read_tilecoord(ftc)
n_tile, lat, lon = tc['N_tile'], tc['com_lat'], tc['com_lon']

# One row per tile: column 0 is left as NaN here, column 1 holds lon,
# column 2 holds lat
map_array = np.full((n_tile, 3), np.nan)
map_array[:, 2] = lat
map_array[:, 1] = lon


In [None]:
var = "RZMC"

# One entry per figure:
# (per-variable field, title label, colour min, colour max, units, extra kwargs).
# The two difference panels pass no cmap, so plot_region uses its default.
panel_specs = [
    (z_var_ol, 'Monthly anomaly variance (standardized) (OL)', 0, 4, 'z-score units', {'cmap': 'Blues'}),
    (z_var_da, 'Monthly anomaly variance (standardized) (DA)', 0, 4, 'z-score units', {'cmap': 'Blues'}),
    (z_var_diff, 'Δ Monthly anomaly variance (standardized) (DA - OL)', -2, 2, 'z-score units', {}),
    (z_var_diff_percent, 'Δ Monthly anomaly variance (standardized) (DA - OL)', -100, 100, '%', {}),
]

for z_field, panel_label, panel_cmin, panel_cmax, panel_units, panel_kwargs in panel_specs:
    # Column 0 of map_array carries the values to plot
    map_array[:, 0] = z_field[var].values

    # Get statistics
    maxval = np.nanmax(map_array[:, 0])
    minval = np.nanmin(map_array[:, 0])

    fig, ax = plot_region(
        map_array,
        region_bounds=REGION_BOUNDS['global'],
        meanflag=True,
        plot_title=f'{var}: {panel_label} \n (Max: {maxval:.3g} Min: {minval:.3g})',
        cmin=panel_cmin,
        cmax=panel_cmax,
        units=panel_units,
        **panel_kwargs
    )
    fig.tight_layout()

In [None]:
var = "SFMC"

# One entry per figure:
# (per-variable field, title label, colour min, colour max, units, extra kwargs).
# The two difference panels pass no cmap, so plot_region uses its default.
panel_specs = [
    (z_var_ol, 'Monthly anomaly variance (standardized) (OL)', 0, 4, 'z-score units', {'cmap': 'Blues'}),
    (z_var_da, 'Monthly anomaly variance (standardized) (DA)', 0, 4, 'z-score units', {'cmap': 'Blues'}),
    (z_var_diff, 'Δ Monthly anomaly variance (standardized) (DA - OL)', -4, 4, 'z-score units', {}),
    (z_var_diff_percent, 'Δ Monthly anomaly variance (standardized) (DA - OL)', -100, 100, '%', {}),
]

for z_field, panel_label, panel_cmin, panel_cmax, panel_units, panel_kwargs in panel_specs:
    # Column 0 of map_array carries the values to plot
    map_array[:, 0] = z_field[var].values

    # Get statistics
    maxval = np.nanmax(map_array[:, 0])
    minval = np.nanmin(map_array[:, 0])

    fig, ax = plot_region(
        map_array,
        region_bounds=REGION_BOUNDS['global'],
        meanflag=True,
        plot_title=f'{var}: {panel_label} \n (Max: {maxval:.3g} Min: {minval:.3g})',
        cmin=panel_cmin,
        cmax=panel_cmax,
        units=panel_units,
        **panel_kwargs
    )
    fig.tight_layout()


In [None]:
var = "FRLANDSNO"

# One entry per figure:
# (per-variable field, title label, colour min, colour max, units, extra kwargs).
# The two difference panels pass no cmap, so plot_region uses its default.
panel_specs = [
    (z_var_ol, 'Monthly anomaly variance (standardized) (OL)', 0, 4000, 'z-score units', {'cmap': 'Blues'}),
    (z_var_da, 'Monthly anomaly variance (standardized) (DA)', 0, 4000, 'z-score units', {'cmap': 'Blues'}),
    (z_var_diff, 'Δ Monthly anomaly variance (standardized) (DA - OL)', -4000, 4000, 'z-score units', {}),
    (z_var_diff_percent, 'Δ Monthly anomaly variance (standardized) (DA - OL)', -100, 100, '%', {}),
]

for z_field, panel_label, panel_cmin, panel_cmax, panel_units, panel_kwargs in panel_specs:
    # Column 0 of map_array carries the values to plot
    map_array[:, 0] = z_field[var].values

    # Get statistics
    maxval = np.nanmax(map_array[:, 0])
    minval = np.nanmin(map_array[:, 0])

    fig, ax = plot_region(
        map_array,
        region_bounds=REGION_BOUNDS['global'],
        meanflag=True,
        plot_title=f'{var}: {panel_label} \n (Max: {maxval:.3g} Min: {minval:.3g})',
        cmin=panel_cmin,
        cmax=panel_cmax,
        units=panel_units,
        **panel_kwargs
    )
    fig.tight_layout()


In [None]:
var = "PRECTOTCORRLAND"

# Column 0 of map_array carries the values to plot
map_array[:, 0] = z_var_ol[var].values

# Range of the plotted values, shown in the title
maxval, minval = np.nanmax(map_array[:, 0]), np.nanmin(map_array[:, 0])

ol_plot_options = dict(
    region_bounds=REGION_BOUNDS['global'],
    meanflag=True,
    plot_title=f'{var}: Monthly anomaly variance (standardized) (OL) \n (Max: {maxval:.3g} Min: {minval:.3g})',
    cmap='Blues',
    cmin=-0,
    cmax=1,
    units='z-score units',
)
fig, ax = plot_region(map_array, **ol_plot_options)
fig.tight_layout()