In [1]:
import os, sys, glob, subprocess, shlex
import xarray as xr
from pathlib import Path

In [None]:
# ==========================
# PATHS AND SELECTION
# ==========================
# Input: directory holding the hourly files (one NetCDF per month).
HOURLY_DIR = Path("/vast/bzaitch1/trp_climate_model_data/era5land_1970_2024_hourly")

# Output: directory receiving the quarterly means; created here if absent.
QUARTER_DIR = Path("/vast/bzaitch1/trp_climate_model_data/era5land_1970_2024_qtrmean")
QUARTER_DIR.mkdir(parents=True, exist_ok=True)

# Years to process. Only 1980 is selected here, as a test run.
YEARS = range(1980, 1981)

# Zero-padded month strings grouped by calendar quarter:
# [("01", "02", "03"), ("04", "05", "06"), ("07", "08", "09"), ("10", "11", "12")]
QUARTERS = [
    tuple(f"{month:02d}" for month in range(first_month, first_month + 3))
    for first_month in (1, 4, 7, 10)
]

# Hourly file naming scheme: one file per month, named YYYY_MM.nc.
# `y` is the year and `m` the zero-padded month string.
PATTERN_PER_MONTH = "{y}_{m}.nc"


# ==========================
# ENVIRONMENT
# ==========================
# Both variables are set to "FALSE" (by their names, they turn off HDF5 file
# locking). The first keeps any value already present in the environment;
# the second is overwritten unconditionally.
os.environ.setdefault("HDF5_USE_FILE_LOCKING", "FALSE")
os.environ["NETCDF_HDF5_FILE_LOCKING"] = "FALSE"


In [None]:
# --------------------------
# PROCESS HOURLY → QUARTERLY
# --------------------------
def _mean_over_hourly_files(paths, time_dim="valid_time"):
    """Return the mean over every time step contained in ``paths``.

    Each file is reduced to a per-grid-cell sum and a count of non-missing
    values along ``time_dim``; the totals are divided once at the end. Every
    time step therefore carries the same weight, whichever file it is stored
    in. (Averaging per-month means without weights does not give this,
    because months differ in length.)

    Parameters
    ----------
    paths : iterable of path-like
        NetCDF files to combine; each must have a ``time_dim`` dimension.
    time_dim : str, optional
        Name of the dimension to average over.

    Returns
    -------
    xarray.Dataset
        Mean of all data variables with ``time_dim`` removed. Cells with no
        valid value in any file are NaN.

    Raises
    ------
    ValueError
        If ``paths`` is empty.
    """
    total_sum = None
    total_count = None

    # Arithmetic between datasets drops attributes by default; keep them so
    # the variable metadata of the inputs carries through to the output.
    with xr.set_options(keep_attrs=True):
        for path in paths:
            # The context manager closes the file handle even if a reduction
            # raises. .load() guarantees the results are in memory before
            # the file is closed.
            with xr.open_dataset(path, engine='netcdf4') as ds:
                print(f"Original shape: {dict(ds.sizes)}")
                file_sum = ds.sum(dim=time_dim, skipna=True, keep_attrs=True).load()
                file_count = ds.count(dim=time_dim).load()

            if total_sum is None:
                total_sum, total_count = file_sum, file_count
            else:
                total_sum = total_sum + file_sum
                total_count = total_count + file_count

        if total_sum is None:
            raise ValueError("no input files to average")

        # Counts are cast to float32 so that float32 inputs are not promoted
        # to float64 by the division. Where the count is 0 the result is NaN.
        return total_sum / total_count.astype("float32")


print("Starting hourly to quarterly processing...")
print(f"Input directory: {HOURLY_DIR}")
print(f"Output directory: {QUARTER_DIR}")

total_processed = 0
total_skipped = 0   # no input files, or output already present
total_failed = 0    # an exception was raised while processing

for y in YEARS:
    y = int(y)
    for q_idx, months in enumerate(QUARTERS, start=1):
        # Output file, plus a temporary sibling. The result is written to
        # the temporary name and renamed only on success, so an interrupted
        # write never leaves a truncated file that the "already exists"
        # check below would later mistake for a finished result.
        q_out = QUARTER_DIR / f"{y}_Q{q_idx}_qmean.nc"
        tmp_out = q_out.with_name(q_out.name + ".tmp")

        try:
            # Gather monthly hourly files for the quarter
            files_to_process = []
            for month in months:
                monthly_file = HOURLY_DIR / PATTERN_PER_MONTH.format(y=y, m=month)
                if monthly_file.exists():
                    files_to_process.append(monthly_file)
                else:
                    print(f"  [warning] Missing: {monthly_file.name}")

            if not files_to_process:
                print(f"[skip] No hourly files for {y} Q{q_idx}")
                total_skipped += 1
                continue

            # If already exists, skip (comment out to reprocess)
            if q_out.exists():
                print(f"[exists] {q_out.name} - skipping")
                total_skipped += 1
                continue

            print(f"[processing] {y} Q{q_idx} - combining {len(files_to_process)} file(s)")
            if len(files_to_process) < len(months):
                # Best effort: the mean is still written, but it only covers
                # the months that were found.
                print(f"  [warning] {y} Q{q_idx} is incomplete: only "
                      f"{len(files_to_process)} of {len(months)} month(s) available")

            # Mean over all hours in the quarter
            quarterly_mean = _mean_over_hourly_files(files_to_process)
            print(f"Quarterly shape: {dict(quarterly_mean.sizes)}")

            # Add metadata
            quarterly_mean.attrs['description'] = f'Quarterly mean for {y} Q{q_idx}'
            quarterly_mean.attrs['source'] = 'Computed from hourly ERA5-Land data'

            # Save to NetCDF with compression, then move into place
            encoding = {var: {'zlib': True, 'complevel': 4} for var in quarterly_mean.data_vars}
            quarterly_mean.to_netcdf(tmp_out, encoding=encoding)
            tmp_out.replace(q_out)

            print(f"  ✓ Saved: {q_out.name}")
            total_processed += 1

        except Exception as e:
            # One failing quarter must not stop the remaining ones.
            print(f"[ERROR] Failed to process {y} Q{q_idx}: {e}")
            total_failed += 1

        finally:
            # Remove a leftover temporary file from a failed or interrupted write.
            if tmp_out.exists():
                tmp_out.unlink()

print(f"\n{'='*60}")
print("Processing complete!")
print(f"  Processed: {total_processed}")
print(f"  Skipped: {total_skipped}")
print(f"  Failed: {total_failed}")
print(f"  Output directory: {QUARTER_DIR}")
print(f"{'='*60}")