## 1. Setup and Configuration

In [None]:
# Standard library imports
import warnings
import time
import sys
from pathlib import Path

# Core scientific computing
import numpy as np
import pandas as pd
import xarray as xr

# Add the examples directory to path for local imports
# This handles JupyterHub where the working directory may differ
notebook_dir = Path(__file__).parent if '__file__' in dir() else Path.cwd()
if str(notebook_dir) not in sys.path:
    sys.path.insert(0, str(notebook_dir))

# climakitae imports
from climakitae.new_core.user_interface import ClimateData

# Suppress some warnings for cleaner output
warnings.filterwarnings('ignore', category=FutureWarning)

print("✓ Imports successful")

✓ Imports successful


In [8]:
# Configuration parameters
from lat_lons import lat_lons
CONFIG = {
    # Spatial configuration
    "region": lat_lons,  # Pacific Gas & Electric service territory
    
    # Warming levels (degrees C above pre-industrial baseline)
    "warming_levels": [1.5, 2.0],
    
    # 1-in-X analysis configuration
    "return_periods": [10, 50, 100],  # Years
    "distribution": "gev",  # Generalized Extreme Value distribution
    "extremes_type": "max",  # Analyzing extreme maximums
    "event_duration": (1, "day"),  # 1-day events
    "block_size": 1,  # Annual blocks
    
    # Data configuration
    "variable": "tasmax",  # Daily maximum temperature
    "activity_id": "LOCA2",  # LOCA2 downscaled data
    "table_id": "day",  # Daily data
    "grid_label": "d03",  # 3km resolution
}

print("Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

ModuleNotFoundError: No module named 'lat_lons'

## 2. Initialize ClimateData Interface

In [3]:
# Initialize the ClimateData interface
cd = ClimateData()
print("✓ ClimateData interface initialized")

# Show available options for exploration
print("\n--- Available Catalogs ---")
cd.show_catalog_options()

2025-12-23 17:39:08 - climakitae.new_core.user_interface - INFO - Initializing ClimateData interface
2025-12-23 17:39:08 - climakitae.new_core.dataset_factory - INFO - DatasetFactory initialized with 2 validators and 10 processors
2025-12-23 17:39:08 - climakitae.new_core.user_interface - INFO - ClimateData initialization successful
2025-12-23 17:39:08 - climakitae.new_core.user_interface - INFO - ✅ Ready to query!
✓ ClimateData interface initialized

--- Available Catalogs ---
2025-12-23 17:39:08 - climakitae.new_core.user_interface - INFO - catalog options (Cloud data collections):
2025-12-23 17:39:08 - climakitae.new_core.user_interface - INFO - -----------------------------------------
catalog options (Cloud data collections):
-----------------------------------------
2025-12-23 17:39:08 - climakitae.new_core.user_interface - INFO - cadcat
2025-12-23 17:39:08 - climakitae.new_core.user_interface - INFO - renewable energy generation
2025-12-23 17:39:08 - climakitae.new_core.user_int

## 5. Apply 1-in-X Analysis

Now we apply the `metric_calc` processor with `one_in_x` configuration to calculate extreme value return periods.

This is the stress test target - the 1-in-X analysis:
- Fits GEV distributions to annual block maxima
- Calculates return values for specified return periods
- Processes across all warming levels, simulations, and spatial grid cells

In [4]:
# Configure the 1-in-X analysis
one_in_x_config = {
    "one_in_x": {
        "return_periods": CONFIG["return_periods"],
        "distribution": CONFIG["distribution"],
        "extremes_type": CONFIG["extremes_type"],
        "event_duration": CONFIG["event_duration"],
        "block_size": CONFIG["block_size"],
        "goodness_of_fit_test": True,
        "print_goodness_of_fit": True,
    }
}

print("1-in-X Configuration:")
for key, value in one_in_x_config["one_in_x"].items():
    print(f"  {key}: {value}")

1-in-X Configuration:
  return_periods: [10, 50, 100]
  distribution: gev
  extremes_type: max
  event_duration: (1, 'day')
  block_size: 1
  goodness_of_fit_test: True
  print_goodness_of_fit: True


In [5]:
# STRESS TEST: Apply 1-in-X analysis to warming level data
# This is computationally intensive - fitting distributions to each simulation/grid cell combination

print("\n" + "="*60)
print("STRESS TEST: 1-in-X Extreme Value Analysis")
print("="*60)
print(f"Region: {CONFIG['region']}")
print(f"Warming Levels: {CONFIG['warming_levels']}")
print(f"Return Periods: {CONFIG['return_periods']}")
print(f"Distribution: {CONFIG['distribution']}")
print("\nStarting computation...")

start_time = time.time()

# Reset and create a new query that combines all processors
cd_fresh = ClimateData()

# Full pipeline: data retrieval -> warming levels -> clipping -> metric calculation
full_processors = {
    "clip": CONFIG["region"],
    "warming_level": {
        "warming_levels": CONFIG["warming_levels"],
        "warming_level_window": 15,
    },
    "metric_calc": one_in_x_config,
}

%time
result = (
    cd_fresh
    .catalog("cadcat")
    .activity_id(CONFIG["activity_id"])
    .table_id(CONFIG["table_id"])
    .grid_label(CONFIG["grid_label"])
    .variable(CONFIG["variable"])
    .processes(full_processors)
    .get()
)

elapsed_time = time.time() - start_time
print(f"\n✓ 1-in-X analysis completed in {elapsed_time:.1f} seconds")
print(f"  ({elapsed_time/60:.1f} minutes)")


STRESS TEST: 1-in-X Extreme Value Analysis
Region: [(37.2031250002, -119.359375), (37.4531250001, -118.171875), (37.4218750002, -118.140625), (37.0156250001, -118.203125), (37.078125, -118.234375), (37.1718750002, -119.265625), (37.3906250002, -118.359375), (37.3281249999, -118.484375), (37.6093750003, -118.390625), (36.9843750002, -119.484375), (37.4218750002, -118.109375), (37.4531250001, -118.140625), (37.2031250002, -119.328125), (37.046875, -119.484375), (37.1406249998, -118.265625), (37.4531250001, -117.953125), (37.1093749999, -119.515625), (36.9843750002, -119.453125), (37.2031250002, -119.296875), (37.3281249999, -118.453125), (37.3281249999, -118.421875), (37.2031250002, -119.265625), (37.3281249999, -118.390625), (37.046875, -119.390625), (37.1718750002, -118.328125), (37.046875, -119.359375), (37.4218750002, -118.078125), (37.5781249999, -118.390625), (37.3906250002, -118.328125), (37.1093749999, -119.484375), (36.9843750002, -119.421875), (37.4531250001, -117.921875), (37

KeyboardInterrupt: 

In [None]:
# Inspect the results
print("\n--- 1-in-X Analysis Results ---")
if result is not None:
    print(f"Type: {type(result)}")
    print(f"Dimensions: {dict(result.dims)}")
    print(f"Coordinates: {list(result.coords.keys())}")
    print(f"Data variables: {list(result.data_vars.keys())}")
    
    # Check for return_period dimension
    if 'return_period' in result.dims:
        print(f"\nReturn periods in data: {result.coords['return_period'].values}")
    
    # Check for warming_level dimension
    if 'warming_level' in result.dims:
        print(f"Warming levels in data: {result.coords['warming_level'].values}")
    
    print(f"\nFull dataset overview:")
    print(result)
else:
    print("⚠ No results returned - check configuration and data availability")

## 6. Analyze Results

If the stress test completed successfully, let's analyze the results.

In [None]:
# Statistical summary of results
if result is not None:
    print("\n--- Statistical Summary ---")
    
    for var_name in result.data_vars:
        var_data = result[var_name]
        print(f"\n{var_name}:")
        
        # Compute basic statistics (may need to call .compute() for dask arrays)
        try:
            computed = var_data.compute() if hasattr(var_data.data, 'compute') else var_data
            print(f"  Min: {float(computed.min()):.2f}")
            print(f"  Max: {float(computed.max()):.2f}")
            print(f"  Mean: {float(computed.mean()):.2f}")
            print(f"  Std: {float(computed.std()):.2f}")
            print(f"  NaN count: {int(computed.isnull().sum())}")
            print(f"  Total values: {int(computed.size)}")
        except Exception as e:
            print(f"  Could not compute statistics: {e}")
else:
    print("No results to analyze")

In [None]:
# Display results by warming level (if applicable)
if result is not None and 'warming_level' in result.dims:
    print("\n--- Results by Warming Level ---")
    
    for wl in result.coords['warming_level'].values:
        print(f"\nWarming Level: {wl}°C")
        wl_data = result.sel(warming_level=wl)
        
        for var_name in wl_data.data_vars:
            var = wl_data[var_name]
            try:
                computed = var.compute() if hasattr(var.data, 'compute') else var
                print(f"  {var_name}: mean={float(computed.mean()):.2f}, std={float(computed.std()):.2f}")
            except Exception as e:
                print(f"  {var_name}: Could not compute - {e}")

## 7. Test Summary

This notebook stress-tested the 1-in-X extreme value analysis with:
- **Region**: PG&E service territory (large spatial extent)
- **Warming Levels**: 1.5°C, 2.0°C, 3.0°C (3 climate scenarios)
- **Return Periods**: 10, 25, 50, 100 years
- **Distribution**: GEV (Generalized Extreme Value)
- **Variable**: tasmax (daily maximum temperature)
- **Resolution**: d03 (3km)

### Expected Outcomes
- Data should be retrieved and transformed to warming level approach
- GEV distributions should be fit to annual block maxima at each grid cell
- Return values should be calculated for each specified return period
- Results should have dimensions for: warming_level, return_period, lat, lon (and possibly simulation)

### Key Performance Metrics
- Data retrieval time
- 1-in-X computation time
- Memory usage (check system monitoring)
- Number of successful/failed distribution fits (if goodness_of_fit_test=True)

In [None]:
# Final summary
print("\n" + "="*60)
print("STRESS TEST SUMMARY")
print("="*60)

if result is not None:
    print(f"✓ Test PASSED - Results obtained")
    print(f"  Result type: {type(result).__name__}")
    print(f"  Dimensions: {dict(result.dims)}")
    print(f"  Variables: {list(result.data_vars.keys())}")
    
    # Calculate approximate memory footprint
    try:
        nbytes = result.nbytes
        print(f"  Memory footprint: {nbytes / (1024**2):.1f} MB")
    except:
        print(f"  Memory footprint: Unable to calculate")
else:
    print(f"✗ Test FAILED - No results returned")
    print(f"  Check configuration and data availability")

print("\n" + "="*60)

In [None]:
for c in gateway.list_clusters():
    cluster = gateway.connect(c.name, shutdown_on_close=True)
    cluster.shutdown()