In [None]:
# Mount Google Drive at /content/drive so that the project folder and data
# stored under MyDrive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install required libraries
!pip install -q xarray netCDF4 cftime
!pip install -q xesmf  # Optional, will fallback to basic interpolation if fails

In [None]:
# Prerequisite: the project's `src` folder must already be uploaded to the
# 'Downscaling ML CEP' folder on Google Drive.
import sys
# Put the project folder at the front of the import search path so that the
# `src` package is resolved from the copy on Drive.
sys.path.insert(0, '/content/drive/MyDrive/Downscaling ML CEP')

# Project-local class used by the processing cells below.
from src.data.preprocessors import ClimateDataPreprocessor

In [None]:
# Drive locations used by the rest of the notebook:
#   BASE_PATH  - folder handed to the preprocessor as its data root
#   OUTPUT_DIR - folder the processed files are written to and later inspected in
_DRIVE_PROJECT = '/content/drive/MyDrive/Downscaling ML CEP'
BASE_PATH = _DRIVE_PROJECT + '/AI_GCMs'
OUTPUT_DIR = _DRIVE_PROJECT + '/data/processed/train'

# Echo both locations so a wrong Drive layout is spotted before processing starts
print(f"Data path: {BASE_PATH}", f"Output path: {OUTPUT_DIR}", sep="\n")

## Option 1: Process ALL GCMs (Recommended)

This processes all 9 GCMs and takes roughly 10-15 minutes on Colab.

In [None]:
# Build the preprocessor for the 1980-2014 period, reading from BASE_PATH
preprocessor = ClimateDataPreprocessor(base_path=BASE_PATH, start_year=1980, end_year=2014)

# No gcm_models argument is passed, which is the "process all GCMs" mode
output_path = preprocessor.process_and_save(output_dir=OUTPUT_DIR)

# Report where the results ended up (value returned by process_and_save)
print("\n✓ All preprocessing complete!", f"✓ Files saved to: {output_path}", sep="\n")

## Option 2: Process Single GCM (Quick Test)

Use this to test the pipeline on a single GCM before committing to the full run in Option 1. It takes roughly 2-3 minutes.

In [None]:
# Build the preprocessor for the 1980-2014 period, reading from BASE_PATH
preprocessor = ClimateDataPreprocessor(base_path=BASE_PATH, start_year=1980, end_year=2014)

# Quick test: restrict the run to a single model, BCC-CSM2-MR, via gcm_models
output_path = preprocessor.process_and_save(output_dir=OUTPUT_DIR, gcm_models=['BCC-CSM2-MR'])

# Report where the results ended up (value returned by process_and_save)
print("\n✓ Single GCM test complete!", f"✓ Files saved to: {output_path}", sep="\n")

## Verify Output Files

In [None]:
import xarray as xr
from pathlib import Path

# Directory the preprocessing cells were asked to write into
output_path = Path(OUTPUT_DIR)

print("\n" + "="*80)
print("PROCESSED FILES")
print("="*80)

# List all NetCDF files directly inside the output directory (non-recursive)
nc_files = sorted(output_path.glob('*.nc'))

if not nc_files:
    # Make an empty or missing output directory obvious instead of silently
    # reporting a count of zero.
    print(f"\nNo .nc files found in {output_path} - has preprocessing been run?")

for i, file in enumerate(nc_files, 1):
    print(f"\n[{i}] {file.name}")

    # Quick inspection. The context manager closes the file even if one of the
    # lines below raises, so a malformed file cannot leave a handle open.
    with xr.open_dataset(file) as ds:
        print(f"    Variables: {list(ds.data_vars)}")
        # ds.sizes is the dimension-name -> length mapping; using ds.dims as a
        # mapping is deprecated in recent xarray releases.
        print(f"    Dimensions: {dict(ds.sizes)}")

        # Only index into the time axis when it exists and is non-empty
        if ds.sizes.get("time", 0) > 0:
            time_values = ds["time"].values
            print(f"    Time range: {time_values[0]} to {time_values[-1]}")
        else:
            print("    Time range: n/a (no time values found)")

        # Check shapes
        for var in ds.data_vars:
            print(f"    {var}: {ds[var].shape}")

print("\n" + "="*80)
print(f"Total files: {len(nc_files)}")
print("="*80)

## Next Steps

After preprocessing completes successfully:

1. **Feature Engineering**: Run `src/data/loaders.py` to create training DataFrames
2. **Model Training**: Run `src/models/train.py` to train ML models
3. **Complete Workflow**: Use `notebooks/02_complete_workflow.ipynb` for end-to-end execution

Expected files after full preprocessing:
- `cru_1980_2014.nc` (1 file)
- `era5_1980_2014.nc` (1 file)
- `{GCM_name}_hist_1980_2014.nc` (9 files, one per GCM)

**Total: 11 NetCDF files ready for training**