In [1]:
import xarray as xr
import os
from pathlib import Path
# Configure Dask with MULTICORE processing for maximum speed
import dask
from dask.distributed import Client, LocalCluster
from dask.diagnostics import ProgressBar
import multiprocessing

# Get number of CPU cores
n_cores = multiprocessing.cpu_count()
print(f"Detected {n_cores} CPU cores")

# Leave 1 core free for the system, but never request zero workers on a
# single-core machine.
n_workers = max(1, n_cores - 1)

# Worker processes receive their environment when they are spawned, so the
# HDF5 locking switch must be exported BEFORE the cluster starts; setting it
# afterwards only affects this notebook process.
os.environ.setdefault("HDF5_USE_FILE_LOCKING", "FALSE")

# Configure Dask BEFORE creating the cluster: the scheduler and the worker
# processes pick up their configuration when they start, so values set after
# LocalCluster() has been created do not reach them.
dask.config.set({
    'array.slicing.split_large_chunks': True,
    'distributed.worker.memory.target': 0.7,   # start spilling to disk at 70 % of the limit
    'distributed.worker.memory.spill': 0.8,    # spill more aggressively at 80 %
    'distributed.worker.memory.pause': 0.9,    # pause task execution at 90 %
    'distributed.scheduler.worker-saturation': 1.1,  # Allow slight oversubscription
})

# Create a local cluster with multiple worker processes.
# Adjust n_workers and threads_per_worker based on your system.
# NOTE(review): n_workers * threads_per_worker exceeds the core count
# (62 threads on a 32-core host) - confirm this oversubscription is intended.
cluster = LocalCluster(
    n_workers=n_workers,
    threads_per_worker=2,    # 2 threads per worker
    memory_limit='auto',     # Auto-detect memory per worker
    processes=True,          # Use processes (not threads) for true parallelism
    dashboard_address=':8787'  # Optional: view dashboard at localhost:8787
)

# Connect to the cluster
client = Client(cluster)

print(f"Dask Client initialized with {n_workers} workers")
print(f"Dashboard available at: {client.dashboard_link}")
print(client)

print("\nDask cluster ready for parallel processing!")

Detected 32 CPU cores
Dask Client initialized with 31 workers
Dashboard available at: http://127.0.0.1:8787/status
<Client: 'tcp://127.0.0.1:61614' processes=31 threads=62, memory=123.56 GiB>

Dask cluster ready for parallel processing!


In [2]:
HOURLY_DIR = Path(r"E:\backup\era5land_1970_2024_hourly")      # input: directory with hourly files
#MONTH_DIR = Path(rf"E:\backup\trp_climate_model_data\era5land_1970_2024_monthmean")
QUARTER_DIR = Path(r"E:\backup\trp_climate_model_data\era5land_1970_2024_qtrmean")
#MONTH_DIR.mkdir(parents=True, exist_ok=True)    # output: monthly means (disabled)
QUARTER_DIR.mkdir(parents=True, exist_ok=True)    # output: quarterly means

# Year range to process. range() excludes its stop value, so this covers
# 2020 through 2024 inclusive; widen the start to process more of the archive.
YEARS = range(2020, 2025)

# To test with a smaller range, use e.g.: YEARS = range(2020, 2021)
# Month numbers (zero-padded, as they appear in the file names) per quarter.
QUARTERS = [
    ("01", "02", "03"),  # Q1
    ("04", "05", "06"),  # Q2
    ("07", "08", "09"),  # Q3
    ("10", "11", "12"),  # Q4
]

# Hourly filename pattern:
# One file per month:    YYYY_MM.nc
PATTERN_PER_MONTH = "{y}_{m}.nc"   # for one file per month
# Disable HDF5 file locking unless a value is already present in the environment.
os.environ.setdefault("HDF5_USE_FILE_LOCKING", "FALSE")

'FALSE'

In [None]:
# DIAGNOSTIC: inspect one hourly file to see which variables/coordinates would
# break a time-mean (e.g. the scalar 'number' and the string 'expver'), then
# try a monthly resample on the cleaned dataset.

import traceback

# Open a sample file to inspect
sample_file = 'E:\\backup\\download\\2000_01.nc'

# dtype kinds that cannot be averaged: object, unicode string, byte string
non_numeric_kinds = ('O', 'U', 'S')
# coordinates that are allowed to lack the time dimension
kept_coords = ('latitude', 'longitude', 'valid_time')

# The context manager guarantees the file handle is released even if one of
# the inspection steps below raises.
with xr.open_dataset(sample_file, engine='netcdf4') as ds_inspect:
    print("="*60)
    print("DATASET INSPECTION")
    print("="*60)
    # .sizes is the name -> length mapping (avoids the deprecated mapping use of .dims)
    print(f"\nDimensions: {dict(ds_inspect.sizes)}")
    print(f"\nCoordinates:")
    for coord in ds_inspect.coords:
        dtype = ds_inspect.coords[coord].dtype
        shape = ds_inspect.coords[coord].shape
        print(f"  - {coord}: {dtype}, shape={shape}")

    print(f"\nData Variables:")
    for var in ds_inspect.data_vars:
        dtype = ds_inspect[var].dtype
        shape = ds_inspect[var].shape
        print(f"  - {var}: {dtype}, shape={shape}")

    print(f"\nAttributes: {list(ds_inspect.attrs.keys())}")

    # Check for problematic variables that can't be averaged
    print("\n" + "="*60)
    print("CHECKING FOR PROBLEMATIC VARIABLES")
    print("="*60)

    # Names to drop before resampling
    vars_to_drop = []

    # Check all coordinates; the order of the tests decides which reason is reported
    for coord in ds_inspect.coords:
        coord_var = ds_inspect.coords[coord]

        # Drop if it's an object/string type (like 'expver')
        if coord_var.dtype.kind in non_numeric_kinds:
            vars_to_drop.append(coord)
            print(f"  Will drop coordinate: {coord} (object/string type, can't average)")
        # Drop if it's scalar (like 'number')
        elif coord_var.ndim == 0:
            vars_to_drop.append(coord)
            print(f"  Will drop coordinate: {coord} (scalar, can't resample)")
        # Drop if it doesn't have the valid_time dimension (except lat/lon)
        elif 'valid_time' not in coord_var.dims and coord not in kept_coords:
            vars_to_drop.append(coord)
            print(f"  Will drop coordinate: {coord} (no time dimension)")

    # Check data variable types
    for var in ds_inspect.data_vars:
        if ds_inspect[var].dtype.kind in non_numeric_kinds:
            vars_to_drop.append(var)
            print(f"  Will drop variable: {var} (object type, can't average)")

    if vars_to_drop:
        print(f"\nDropping {len(vars_to_drop)} problematic variables: {vars_to_drop}")
        ds_clean = ds_inspect.drop_vars(vars_to_drop)
    else:
        print(f"\nNo problematic variables found")
        ds_clean = ds_inspect

    # Check after resampling
    print("\n" + "="*60)
    print("AFTER RESAMPLING")
    print("="*60)

    try:
        # NOTE(review): recent pandas releases deprecate the 'M' alias in
        # favour of 'ME' - switch once the installed version is confirmed.
        monthly_inspect = ds_clean.resample(valid_time='M').mean()

        print(f"\nDimensions: {dict(monthly_inspect.sizes)}")
        print(f"\nCoordinates:")
        for coord in monthly_inspect.coords:
            print(f"  - {coord}: {monthly_inspect.coords[coord].dtype}")

        print(f"\nData Variables:")
        for var in monthly_inspect.data_vars:
            print(f"  - {var}: {monthly_inspect[var].dtype}")

        print("\n✓ Resampling successful!")

        # Clean up
        monthly_inspect.close()

    except Exception as e:
        # A failure here is the diagnostic result itself, so report it in full.
        print(f"\n✗ ERROR during resampling: {e}")
        traceback.print_exc()

    ds_clean.close()

print("\n" + "="*60)

In [3]:
# COMPLETE WORKFLOW: process all configured YEARS and QUARTERS.
# For every quarter the monthly hourly files are opened lazily, variables that
# cannot be averaged (e.g. 'number', 'expver') are dropped, the time mean is
# computed on the Dask cluster and the in-memory result is written to NetCDF.

import time
import traceback


def _vars_blocking_time_mean(ds, time_dim='valid_time', spatial=('latitude', 'longitude')):
    """Return the names of coordinates/variables that must be dropped before averaging.

    A coordinate is listed when it is non-numeric (object, unicode or byte
    string dtype), when it is a scalar, or when it lacks ``time_dim`` and is
    not one of the ``spatial`` coordinates. A data variable is listed when it
    is non-numeric. Names are returned in dataset order, coordinates first.
    """
    non_numeric = ('O', 'U', 'S')
    blocked = []
    for name in ds.coords:
        coord = ds.coords[name]
        if coord.dtype.kind in non_numeric:
            blocked.append(name)            # e.g. 'expver'
        elif coord.ndim == 0:
            blocked.append(name)            # e.g. 'number'
        elif time_dim not in coord.dims and name != time_dim and name not in spatial:
            blocked.append(name)
    for name in ds.data_vars:
        if ds[name].dtype.kind in non_numeric:
            blocked.append(name)
    return blocked


# --------------------------
# PROCESS HOURLY → QUARTERLY
# --------------------------
print("Starting hourly to quarterly processing...")
print(f"Input directory: {HOURLY_DIR}")
print(f"Output directory: {QUARTER_DIR}")
print(f"Processing years: {list(YEARS)}")
print(f"{'='*60}\n")

total_processed = 0
total_skipped = 0     # quarters whose output file already exists
total_missing = 0     # quarters with no input file at all
total_errors = 0
overall_start = time.time()

for y in YEARS:
    y = int(y)
    for q_idx, months in enumerate(QUARTERS, start=1):
        # Final output and the temporary name it is written under first.
        q_out = QUARTER_DIR / f"{y}_Q{q_idx}_qmean.nc"
        q_tmp = q_out.with_name(q_out.name + ".tmp")
        try:
            # Gather monthly hourly files for the quarter
            files_to_process = []
            months_found = []
            for month in months:
                monthly_file = HOURLY_DIR / PATTERN_PER_MONTH.format(y=y, m=month)
                if monthly_file.exists():
                    files_to_process.append(str(monthly_file))
                    months_found.append(month)
                else:
                    print(f"  [warning] Missing: {monthly_file.name}")

            if not files_to_process:
                print(f"[skip] No hourly files for {y} Q{q_idx}")
                total_missing += 1
                continue

            # If already exists, skip (comment out to reprocess)
            if q_out.exists():
                print(f"[exists] {q_out.name} - skipping")
                total_skipped += 1
                continue

            if len(files_to_process) < len(months):
                # Still processed (best effort), but make it loud: the result
                # is NOT a full quarterly mean.
                print(f"  [warning] Only {len(files_to_process)}/{len(months)} monthly files found - "
                      f"the mean will cover a partial quarter")

            print(f"[processing] {y} Q{q_idx} - combining {len(files_to_process)} file(s)")
            start_time = time.time()

            # Open all files lazily; the context manager releases every file
            # handle even when resampling or computing fails.
            with xr.open_mfdataset(
                files_to_process,
                engine='netcdf4',
                chunks={'valid_time': 100, 'latitude': 600, 'longitude': 600},
                parallel=True,
                combine='by_coords',
                coords='minimal',
                compat='override'
            ) as combined_ds:

                print(f"  Loaded: {combined_ds.sizes}")

                # Drop everything that cannot be averaged over time
                vars_to_drop = _vars_blocking_time_mean(combined_ds)
                if vars_to_drop:
                    print(f"  Dropping {len(vars_to_drop)} problematic variable(s): {vars_to_drop}")
                    combined_ds = combined_ds.drop_vars(vars_to_drop)

                # Resample to quarterly
                # NOTE(review): recent pandas releases deprecate the 'Q' alias
                # in favour of 'QE' - switch once the installed version is confirmed.
                print(f"  Resampling to quarterly frequency...")
                quarterly_avg = combined_ds.resample(valid_time='Q').mean()

                # Add metadata
                quarterly_avg.attrs['year'] = y
                quarterly_avg.attrs['quarter'] = q_idx
                quarterly_avg.attrs['description'] = f'Quarterly mean for {y} Q{q_idx}'
                quarterly_avg.attrs['source'] = 'ERA5-Land hourly data'
                # Record which months actually contributed so that a partial
                # quarter can be recognised from the file alone.
                quarterly_avg.attrs['months_included'] = ",".join(months_found)

                print(f"  Quarterly shape: {quarterly_avg.sizes}")
                print(f"  Computing in parallel...")

                # Compute into memory BEFORE saving (must happen while the
                # source files are still open).
                quarterly_avg_computed = quarterly_avg.compute()

            # Build encoding dict - ONLY for data variables, NOT coordinates
            encoding = {
                var: {
                    'zlib': True,
                    'complevel': 4,
                    'dtype': 'float32',
                    '_FillValue': -9999.0
                }
                for var in quarterly_avg_computed.data_vars
            }

            # Write under a temporary name and rename afterwards: an
            # interrupted save then never leaves a truncated file that a later
            # run would mistake for a finished quarter.
            print(f"  Saving to disk...")
            quarterly_avg_computed.to_netcdf(q_tmp, encoding=encoding)
            quarterly_avg_computed.close()
            q_tmp.replace(q_out)

            elapsed = time.time() - start_time
            total_processed += 1

            print(f"  ✓ Completed: {q_out.name} ({elapsed:.2f}s)\n")

        except Exception as e:
            print(f"[ERROR] Failed to process {y} Q{q_idx}: {e}")
            traceback.print_exc()
            total_errors += 1
            # Best-effort removal of a half-written temporary file.
            try:
                if q_tmp.exists():
                    q_tmp.unlink()
            except OSError as cleanup_error:
                print(f"  [warning] Could not remove {q_tmp.name}: {cleanup_error}")
            continue

# Summary
overall_elapsed = time.time() - overall_start
print(f"\n{'='*60}")
print(f"PROCESSING COMPLETE!")
print(f"{'='*60}")
print(f"Total time: {overall_elapsed:.2f} seconds ({overall_elapsed/60:.2f} minutes)")
print(f"Processed: {total_processed} quarters")
print(f"Skipped (existing): {total_skipped} quarters")
print(f"Skipped (no input files): {total_missing} quarters")
print(f"Errors: {total_errors} quarters")
if total_processed > 0:
    print(f"Average time per quarter: {overall_elapsed/total_processed:.2f} seconds")
print(f"Output directory: {QUARTER_DIR}")
print(f"{'='*60}")

Starting hourly to quarterly processing...
Input directory: E:\backup\era5land_1970_2024_hourly
Output directory: E:\backup\trp_climate_model_data\era5land_1970_2024_qtrmean
Processing years: [2020, 2021, 2022, 2023, 2024]

[exists] 2020_Q1_qmean.nc - skipping
[exists] 2020_Q2_qmean.nc - skipping
[exists] 2020_Q3_qmean.nc - skipping
[exists] 2020_Q4_qmean.nc - skipping
[exists] 2021_Q1_qmean.nc - skipping
[exists] 2021_Q2_qmean.nc - skipping
[exists] 2021_Q3_qmean.nc - skipping
[exists] 2021_Q4_qmean.nc - skipping
[exists] 2022_Q1_qmean.nc - skipping
[exists] 2022_Q2_qmean.nc - skipping
[exists] 2022_Q3_qmean.nc - skipping
[exists] 2022_Q4_qmean.nc - skipping
[processing] 2023 Q1 - combining 3 file(s)
  Loaded: Frozen({'valid_time': 2160, 'latitude': 1801, 'longitude': 3600})
  Dropping 2 problematic variable(s): ['number', 'expver']
  Resampling to quarterly frequency...
  Quarterly shape: Frozen({'valid_time': 1, 'latitude': 1801, 'longitude': 3600})
  Computing in parallel...
  Savi

## Test file integrity

In [5]:
# --------------------------
# FILE INTEGRITY CHECKER
# --------------------------

def _inspect_open_dataset(ds, results, expected_vars, expected_dims):
    """Run the content checks on an already opened dataset, filling ``results`` in place."""
    # Check if dataset has data
    if len(ds.data_vars) == 0:
        results['errors'].append("No data variables found")
    else:
        results['has_data'] = True
        results['info']['num_vars'] = len(ds.data_vars)
        results['info']['variables'] = list(ds.data_vars)

    # Dimension name -> length
    results['info']['dimensions'] = dict(ds.sizes)

    # Check for expected variables
    if expected_vars:
        missing_vars = set(expected_vars) - set(ds.data_vars)
        if missing_vars:
            results['warnings'].append(f"Missing expected variables: {missing_vars}")
        else:
            results['has_expected_vars'] = True

    # Check for expected dimensions
    if expected_dims:
        missing_dims = set(expected_dims) - set(ds.sizes)
        if missing_dims:
            results['warnings'].append(f"Missing expected dimensions: {missing_dims}")
        else:
            results['has_expected_dims'] = True

    # Check for NaN or infinite values in each variable
    for var in ds.data_vars:
        try:
            data = ds[var]
            if data.dtype == 'object':
                results['warnings'].append(f"Variable '{var}' has object dtype (cannot be averaged)")
                continue

            # Only float / signed int / unsigned int data is scanned
            if data.dtype.kind in ['f', 'i', 'u']:
                # NOTE: this loads the ENTIRE variable into memory; for very
                # large files consider checking a slice instead.
                sample_values = data.values

                if np.all(np.isnan(sample_values)):
                    results['warnings'].append(f"Variable '{var}' appears to be all NaN")

                if np.any(np.isinf(sample_values)):
                    results['warnings'].append(f"Variable '{var}' contains infinite values")

        except Exception as e:
            # A single unreadable variable is a warning, not a failure
            results['warnings'].append(f"Could not check variable '{var}': {str(e)}")

    # Check time dimension if present
    if 'valid_time' in ds.sizes:
        results['info']['time_steps'] = ds.sizes['valid_time']
        try:
            time_values = ds['valid_time'].values
            results['info']['time_range'] = {
                'start': str(time_values[0]),
                'end': str(time_values[-1])
            }
        except Exception:
            results['warnings'].append("Could not read time values")


def _collect_integrity_results(file_path, results, expected_vars, expected_dims):
    """Fill ``results`` for ``file_path``; returns early after a fatal problem."""
    file_path = Path(file_path)
    if not file_path.exists():
        results['errors'].append("File does not exist")
        return
    results['exists'] = True

    # Check file size
    file_size = file_path.stat().st_size
    results['info']['file_size_mb'] = round(file_size / (1024 * 1024), 2)
    if file_size == 0:
        results['errors'].append("File is empty (0 bytes)")
        return

    # Try to open the file
    try:
        ds = xr.open_dataset(file_path)
    except Exception as e:
        results['errors'].append(f"Cannot open file: {str(e)}")
        return
    results['readable'] = True

    # Context manager: the handle is released even if a check raises
    with ds:
        _inspect_open_dataset(ds, results, expected_vars, expected_dims)


def check_file_integrity(file_path, expected_vars=None, expected_dims=None, verbose=True):
    """
    Check the integrity of a NetCDF file.
    
    Parameters:
    -----------
    file_path : Path or str
        Path to the NetCDF file
    expected_vars : list, optional
        List of expected variable names
    expected_dims : list, optional
        List of expected dimension names
    verbose : bool
        Print detailed information
    
    Returns:
    --------
    dict : Dictionary with integrity check results. It always contains a
        'status' key ('PASS', 'PASS_WITH_WARNINGS' or 'FAIL'), including for
        files that are missing, empty or unreadable, together with the
        'errors', 'warnings' and 'info' collected along the way.
    """
    results = {
        'file': str(file_path),
        'exists': False,
        'readable': False,
        'has_data': False,
        'has_expected_vars': False,
        'has_expected_dims': False,
        'errors': [],
        'warnings': [],
        'info': {}
    }

    try:
        _collect_integrity_results(file_path, results, expected_vars, expected_dims)
    except Exception as e:
        results['errors'].append(f"Unexpected error: {str(e)}")

    # Overall status - set on every path so callers can rely on the key
    if results['errors']:
        results['status'] = 'FAIL'
    elif results['warnings']:
        results['status'] = 'PASS_WITH_WARNINGS'
    else:
        results['status'] = 'PASS'

    # Print results if verbose (also for failed files)
    if verbose:
        print(f"\n{'='*70}")
        print(f"File: {results['file']}")
        print(f"Status: {results['status']}")
        print(f"{'='*70}")

        if results['info']:
            print("\nInfo:")
            for key, val in results['info'].items():
                print(f"  {key}: {val}")

        if results['warnings']:
            print("\nWarnings:")
            for warn in results['warnings']:
                print(f"  ⚠ {warn}")

        if results['errors']:
            print("\nErrors:")
            for err in results['errors']:
                print(f"  ✗ {err}")

        if results['status'] == 'PASS':
            print("\n✓ File integrity check PASSED")

    return results


def check_multiple_files(file_pattern, expected_vars=None, expected_dims=None):
    """
    Check integrity of multiple files matching a pattern.
    
    Parameters:
    -----------
    file_pattern : str
        Glob pattern for files to check (e.g., "data/*.nc")
    expected_vars : list, optional
        List of expected variable names
    expected_dims : list, optional
        List of expected dimension names
    
    Returns:
    --------
    dict : Mapping of file path -> result dict from check_file_integrity.
        Empty when no file matches the pattern. The pass/fail counts are
        printed, not returned.
    """
    import glob

    files = sorted(glob.glob(str(file_pattern)))

    if not files:
        print(f"No files found matching pattern: {file_pattern}")
        return {}

    print(f"Checking {len(files)} file(s)...")

    results = {}
    statuses = {}
    summary = {
        'total': len(files),
        'passed': 0,
        'passed_with_warnings': 0,
        'failed': 0
    }

    for file in files:
        result = check_file_integrity(file, expected_vars, expected_dims, verbose=False)
        results[file] = result

        # A result without a 'status' key (a check that bailed out early)
        # counts as a failure instead of raising KeyError.
        status = result.get('status', 'FAIL')
        statuses[file] = status

        if status == 'PASS':
            summary['passed'] += 1
        elif status == 'PASS_WITH_WARNINGS':
            summary['passed_with_warnings'] += 1
        else:
            summary['failed'] += 1

    # Print summary
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}")
    print(f"Total files: {summary['total']}")
    print(f"Passed: {summary['passed']}")
    print(f"Passed with warnings: {summary['passed_with_warnings']}")
    print(f"Failed: {summary['failed']}")

    # Show failed files
    if summary['failed'] > 0:
        print(f"\nFailed files:")
        for file, result in results.items():
            if statuses[file] not in ('PASS', 'PASS_WITH_WARNINGS'):
                print(f"  ✗ {Path(file).name}")
                for err in result.get('errors', []):
                    print(f"      - {err}")

    # Show warnings
    files_with_warnings = [f for f, r in results.items() if r.get('warnings')]
    if files_with_warnings:
        print(f"\nFiles with warnings: {len(files_with_warnings)}")
        for file in files_with_warnings[:5]:  # Show first 5
            print(f"  ⚠ {Path(file).name}")
            for warn in results[file]['warnings'][:2]:  # Show first 2 warnings
                print(f"      - {warn}")

    return results

import numpy as np

print("File integrity checker functions loaded!")

File integrity checker functions loaded!


In [6]:

# --------------------------
# USAGE EXAMPLES FOR FILE INTEGRITY CHECKER
# --------------------------
# Only Example 4 is active; uncomment one of the others to run it instead.

#Example 1: Check a single file
#result = check_file_integrity(HOURLY_DIR / "1980_01.nc")

#Example 2: Check a single file with expected variables and dimensions
# (the time dimension in these files is named 'valid_time', not 'time')
# result = check_file_integrity(
#     HOURLY_DIR / "1980_01.nc",
#     expected_vars=['t2m', 'tp'],  # variable short names - adjust to the file's contents
#     expected_dims=['valid_time', 'latitude', 'longitude']
# )

#Example 3: Check all quarterly output files for a specific year
#results = check_multiple_files(str(QUARTER_DIR / "2004_*.nc"))

#Example 4: Check all quarterly output files
results = check_multiple_files(str(QUARTER_DIR / "*_qmean.nc"))

#Example 5: Check all hourly files in the directory
#results = check_multiple_files(str(HOURLY_DIR / "*.nc"))

#print("Uncomment the examples above to run integrity checks")

Checking 180 file(s)...

SUMMARY
Total files: 180
Passed: 180
Failed: 0


### Plotting

In [11]:
# Open one quarterly output for inspection. The path is built from QUARTER_DIR
# so it follows the configuration cell instead of repeating the absolute path.
# The dataset is left open on purpose: it is displayed lazily below.
ds = xr.open_dataset(QUARTER_DIR / "2022_Q4_qmean.nc", engine='netcdf4')
ds

In [31]:
# Paths of the three monthly hourly files (October-December 2004) that make up Q4.
Q4 = [f'E:/backup/download/2004_{month}.nc' for month in ('10', '11', '12')]

In [40]:
# Force a full read of variable 'e' from the December 2004 file.
# The context manager closes the file even if 'e' is missing or the read
# fails, and the loaded array is kept in e_values instead of being discarded.
# NOTE: this rebinds `ds`, replacing any dataset an earlier cell stored under that name.
with xr.open_dataset(r'E:/backup/download/2004_12.nc', engine='netcdf4') as ds:
    e_values = ds['e'].values