In [1]:
import os
import glob
import xarray as xr
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [2]:

# Experiment identifier; it appears in the archive path below and is reused
# when naming output files.
expt_name = 'LS_OLv8_M36'

# First and last dates of the processing window.
start_date, end_date = datetime(2000, 6, 1), datetime(2007, 6, 1)

# Compact YYYYMMDD labels for the two window bounds.
start_date_str = f"{start_date:%Y%m%d}"
end_date_str = f"{end_date:%Y%m%d}"

# Root of the experiment output tree that is processed.
# Alternative location of a local test copy (kept for reference):
# root_directory = f'/Users/amfox/Desktop/GEOSldas_diagnostics/test_data/land_sweeper/{expt_name}/output/SMAP_EASEv2_M36_GLOBAL/cat/ens_avg'
root_directory = f'/discover/nobackup/projects/land_da/Experiment_archive/M21C_land_sweeper_OLv8_M36/{expt_name}/output/SMAP_EASEv2_M36_GLOBAL/cat/ens_avg'

In [3]:
%%time

# Monthly subsetting loop: for every month from start_date through end_date
# (inclusive), open that month's tavg24_1d_lnd_Nt files, average the selected
# variables over time, and write one single-time-step NetCDF file per month.

# Descriptive name -> variable name in the input files. Only the values are
# used for the selection below; the keys document what each variable is.
# Defined once here because it does not change between iterations.
variables = {
    'sm_surface': 'SFMC',
    'sm_rootzone': 'RZMC',
    'sm_profile': 'PRMC',
    'precipitation_total_surface_flux': 'PRECTOTCORRLAND',
    'vegetation_greenness_fraction': 'GRN',
    'leaf_area_index': 'LAI',
    'snow_mass': 'SNOMASLAND',
    'surface_temperature_of_land_incl_snow': 'TSURFLAND',
    'soil_temperature_layer_1': 'TSOIL1',
    'snowfall_land': 'PRECSNOCORRLAND',
    'snow_depth_within_snow_covered_area_fraction_on_land': 'SNODPLAND',
    'snowpack_evaporation_latent_heat_flux_on_land': 'LHLANDSBLN',
    'overland_runoff_including_throughflow': 'RUNSURFLAND',
    'baseflow_flux_land': 'BASEFLOWLAND',
    'snowmelt_flux_land': 'SMLAND',
    'total_evaporation_land': 'EVLAND',
    'net_shortwave_flux_land': 'SWLAND',
    'total_water_storage_land': 'TWLAND',
    'fractional_area_of_snow_on_land': 'FRLANDSNO'  # New variable added
}
variable_names = list(variables.values())

# Every monthly output file is written directly under the experiment root,
# so the directory only needs to be created once.
output_directory = root_directory
os.makedirs(output_directory, exist_ok=True)

current_date = start_date
while current_date <= end_date:
    year_month_directory = os.path.join(root_directory,
                                        f"Y{current_date.year}",
                                        f"M{current_date.month:02d}")

    # Find the files. glob returns matches in arbitrary order, so sort them to
    # make the concatenation order along "time" deterministic.
    files = sorted(glob.glob(f"{year_month_directory}/*tavg24_1d_lnd_Nt*.nc4"))
    if not files:
        # Fail with the offending directory in the message instead of the
        # generic "no files to open" error raised by xarray.
        raise FileNotFoundError(
            f"No '*tavg24_1d_lnd_Nt*.nc4' files found in {year_month_directory}"
        )

    output_filename = os.path.join(output_directory, f"{expt_name}.tavg24_1d_lnd_Nt.subsetted.{current_date.strftime('%Y%m')}.nc")

    # Open the month's files as one dataset. The context manager closes the
    # underlying files at the end of each iteration so handles do not pile up
    # over the full date range.
    with xr.open_mfdataset(files, combine='nested', concat_dim="time") as data:
        # Extract the variables and calculate the mean along the time dimension
        data_extracted = data[variable_names].mean(dim='time')

        # Add a time dimension (stamped with the first day of the month)
        data_extracted = data_extracted.expand_dims(time=[current_date])

        # Save the data. This must happen inside the `with` block: the write
        # is what reads the values, so the source files still need to be open.
        data_extracted.to_netcdf(output_filename)

    # Increment the date
    current_date += relativedelta(months=1)

CPU times: user 4min 25s, sys: 1min 22s, total: 5min 48s
Wall time: 32min 36s
