In [1]:
import numpy as np
import pandas as pd
import torch
import pyro
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam
import pyro.distributions as dist
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

# For reproducibility
np.random.seed(0)
torch.manual_seed(0)
pyro.set_rng_seed(0)


# M4: Multistage (k=8, inner=4)

In [2]:
import pandas as pd

# Define the path to your CSV file
file_path = "cleaned_hurricane_damage_data.csv"

# Load the CSV file
df = pd.read_csv(file_path)

# Display the first 5 rows
print(df.head())


   B01001_001E  Household Income Distribution  Median Household Income  \
0       1675.0                          860.0                  75444.0   
1       2221.0                         1070.0                 140313.0   
2       1904.0                         1098.0                  83750.0   
3       1147.0                          517.0                  62054.0   
4       4946.0                         3231.0                  29737.0   

   B19001_002E  B19001_003E  B19001_004E  B19001_005E  B19001_006E  \
0         23.0         98.0         21.0         10.0         52.0   
1         45.0         24.0         16.0         17.0          0.0   
2         72.0         76.0         50.0         72.0          0.0   
3          0.0          0.0          0.0         60.0         61.0   
4        373.0        471.0        391.0        176.0        217.0   

   B19001_007E  B19001_008E  ...  Longitude  storm_county    ppt_mean  \
0         10.0          9.0  ...   -80.3117  201007_12086  11

  df = pd.read_csv(file_path)


In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# ─────────────────────────── Column Mapping ───────────────────────────
census_svi_column_mapping = {
    'B01001_001E': 'Total Population',
    'B19001_001E': 'Household Income Distribution',
    'B19013_001E': 'Median Household Income',
    'B19001_002E': 'Income Bracket 1',
    'B19001_003E': 'Income Bracket 2',
    'B19001_004E': 'Income Bracket 3',
    'B19001_005E': 'Income Bracket 4',
    'B19001_006E': 'Income Bracket 5',
    'B19001_007E': 'Income Bracket 6',
    'B19001_008E': 'Income Bracket 7',
    'B19001_009E': 'Income Bracket 8',
    'B19001_010E': 'Income Bracket 9',
    'B19001_011E': 'Income Bracket 10',
    'B19001_012E': 'Income Bracket 11',
    'B19001_013E': 'Income Bracket 12',
    'B19001_014E': 'Income Bracket 13',
    'B19001_015E': 'Income Bracket 14',
    'B19001_016E': 'Income Bracket 15',
    'B19001_017E': 'Income Bracket 16',
    'state': 'State',
    'county': 'County',
    'tract': 'Census Tract',
    'year': 'Year',
    'STATE_COUNTY_FIPS': 'State-County FIPS Code',
    'GEOID': 'Geographic Identifier',
    'FIPS': 'FIPS Code',
    'RPL_THEMES': 'SVI Themes',
    'CZ_FIPS': 'Combined Statistical Area FIPS',
    'ST': 'State Abbreviation',
    'LOCATION': 'Location',
    'E_TOTPOP': 'Estimated Total Population',
    'M_TOTPOP': 'Margin of Error Total Population',
    'E_HU': 'Estimated Housing Units',
    'M_HU': 'Margin of Error Housing Units',
    'E_UNEMP': 'Estimated Unemployed',
    'M_UNEMP': 'Margin of Error Unemployed',
    'E_LIMENG': 'Estimated Limited English Proficiency',
    'M_LIMENG': 'Margin of Error Limited English Proficiency',
    'E_MUNIT': 'Estimated Multi-Unit Housing',
    'M_MUNIT': 'Margin of Error Multi-Unit Housing',
    'E_MOBILE': 'Estimated Mobile Homes',
    'M_MOBILE': 'Margin of Error Mobile Homes',
    'E_CROWD': 'Estimated Crowded Housing',
    'M_CROWD': 'Margin of Error Crowded Housing',
    'E_NOVEH': 'Estimated No Vehicle',
    'M_NOVEH': 'Margin of Error No Vehicle',
    'DAMAGE_PROPERTY': 'Damage to Property',
    'DAMAGE_CROPS': 'Damage to Crops',
    'BEGIN_YEARMONTH': 'Begin Year-Month',
    'LAT': 'Latitude',
    'LON': 'Longitude',
    'storm_county': 'Storm County'
}



# ─────────────────────────── Feature Engineering ───────────────────────────
def add_features(df):
    df = df.copy()
    eps = 1e-6

    df['pct_unemp']  = df['Estimated Unemployed'] / (df['Total Population'] + eps)
    df['pct_limm']   = df['Estimated Limited English Proficiency'] / (df['Total Population'] + eps)
    df['pct_noveh']  = df['Estimated No Vehicle'] / (df['Total Population'] + eps)
    df['pct_mobile'] = df['Estimated Mobile Homes'] / (df['Estimated Housing Units'] + eps)
    df['pct_crowd']  = df['Estimated Crowded Housing'] / (df['Estimated Housing Units'] + eps)

    lows  = ['Income Bracket 1', 'Income Bracket 2', 'Income Bracket 3']
    highs = ['Income Bracket 14', 'Income Bracket 15', 'Income Bracket 16']
    df['low_inc_pct']  = df[lows].sum(axis=1) / (df['Household Income Distribution'] + eps)
    df['high_inc_pct'] = df[highs].sum(axis=1) / (df['Household Income Distribution'] + eps)
    df['inc_ineq']     = df['high_inc_pct'] / (df['low_inc_pct'] + eps)

    df['log_pop']    = np.log1p(df['Total Population'])
    df['log_medinc'] = np.log1p(df['Median Household Income'])
    df['pop_poverty'] = df['Total Population'] * df['low_inc_pct']

    df['Begin Year-Month'] = pd.to_datetime(df['Begin Year-Month'], format='%Y%m')
    df['month'] = df['Begin Year-Month'].dt.month

    return df

features = [
    'log_pop', 'log_medinc',
    'pct_unemp', 'pct_limm', 'pct_noveh',
    'low_inc_pct', 'pop_poverty', 'high_inc_pct',
    'ppt_mean', 'tmean_mean'
]

# ─────────────────────────── Main Pipeline ───────────────────────────
# Load and rename
full = pd.read_csv("cleaned_hurricane_damage_data.csv")
full = full.rename(columns=census_svi_column_mapping)
full = full.loc[:, ~full.columns.duplicated()]

# Remove rows with no damage
full = full[full['Damage to Property'] != 0].dropna(subset=['Damage to Property'])

# Add derived features
full = add_features(full)

# Group target (sum) and features (mean)
y_group = full.groupby(['Year', 'State-County FIPS Code'])[['Damage to Property']].sum().reset_index()
x_group = full.groupby(['Year', 'State-County FIPS Code'])[features + ['Latitude', 'Longitude']].mean().reset_index()

# Merge target and predictors
full_data = y_group.merge(x_group, on=['Year', 'State-County FIPS Code'])

# ─────────────────────────── Train-Test Split ───────────────────────────
train_full = full_data[full_data['Year'] < 2020].reset_index(drop=True)
test_full  = full_data[full_data['Year'] == 2020].reset_index(drop=True)

# ─────────────────────────── Spatial Clustering ───────────────────────────
coords = train_full[['Latitude', 'Longitude']].values
kmeans = KMeans(n_clusters=10, random_state=0).fit(coords)
train_full['spatial_cluster'] = kmeans.labels_

print(f"Train size: {train_full.shape[0]} | Test size: {test_full.shape[0]}")


Train size: 746 | Test size: 199


  full = pd.read_csv("cleaned_hurricane_damage_data.csv")


In [5]:
def prepare_Xy(df):
    df = df.dropna(subset=features + ['Damage to Property'])
    X = df[features].values
    y = np.log1p(df['Damage to Property'].values)
    return X, y
    

# Main Loop

In [6]:
import numpy as np
import pandas as pd
import torch
import pyro
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam
import pyro.distributions as dist
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.ticker as mticker

def safe_metrics(y_true, y_pred):
    mask = (
        (~np.isnan(y_true)) & (~np.isnan(y_pred)) &
        (~np.isinf(y_true)) & (~np.isinf(y_pred))
    )
    y_true_clean = y_true[mask]
    y_pred_clean = y_pred[mask]
    mae = mean_absolute_error(y_true_clean, y_pred_clean)
    rmse = np.sqrt(mean_squared_error(y_true_clean, y_pred_clean))
    return mae, rmse

def nested_spatial_cv_evaluation(train_data, features, outer_k=10, inner_k=4, n_epochs=2000):
    outer_coords = train_data[['Latitude', 'Longitude']].values
    outer_kmeans = KMeans(n_clusters=outer_k, random_state=0).fit(outer_coords)
    train_data['outer_cluster'] = outer_kmeans.labels_

    outer_results = []
    all_error_dfs = []

    for outer_fold in range(outer_k):
        print(f"\n=== OUTER Fold {outer_fold+1}/{outer_k} ===")
        outer_val = train_data[train_data['outer_cluster'] == outer_fold]
        outer_train = train_data[train_data['outer_cluster'] != outer_fold]

        inner_coords = outer_train[['Latitude', 'Longitude']].values
        inner_kmeans = KMeans(n_clusters=inner_k, random_state=0).fit(inner_coords)
        outer_train['inner_cluster'] = inner_kmeans.labels_

        best_model_params = None
        best_val_loss = float('inf')

        for inner_fold in range(inner_k):
            print(f"\n--- INNER Fold {inner_fold+1}/{inner_k} ---")
            inner_val = outer_train[outer_train['inner_cluster'] == inner_fold]
            inner_train = outer_train[outer_train['inner_cluster'] != inner_fold]

            X_train = inner_train[features].values
            y_train = np.log1p(inner_train['Damage to Property'].values)
            X_val = inner_val[features].values
            y_val = np.log1p(inner_val['Damage to Property'].values)

            scaler = StandardScaler().fit(X_train)
            X_train_t = torch.tensor(scaler.transform(X_train), dtype=torch.float32)
            X_val_t = torch.tensor(scaler.transform(X_val), dtype=torch.float32)
            y_train_t = torch.tensor(y_train, dtype=torch.float32)
            y_val_t = torch.tensor(y_val, dtype=torch.float32)

            def model(X, y=None):
                n_features = X.shape[1]
                intercept = pyro.sample("intercept", dist.Normal(0., 10.))
                weights = pyro.sample("weights", dist.Normal(torch.zeros(n_features), 10 * torch.ones(n_features)).to_event(1))
                sigma = pyro.sample("sigma", dist.HalfNormal(10.))
                mu = intercept + (X * weights).sum(-1)
                with pyro.plate("data", X.shape[0]):
                    pyro.sample("obs", dist.Normal(mu, sigma), obs=y)

            def guide(X, y=None):
                n_features = X.shape[1]
                pyro.sample("intercept", dist.Normal(pyro.param("intercept_loc", torch.tensor(0.0)),
                                                     pyro.param("intercept_scale", torch.tensor(1.0), constraint=dist.constraints.positive)))
                pyro.sample("weights", dist.Normal(pyro.param("weights_loc", torch.zeros(n_features)),
                                                   pyro.param("weights_scale", torch.ones(n_features), constraint=dist.constraints.positive)).to_event(1))
                pyro.sample("sigma", dist.HalfNormal(pyro.param("sigma_loc", torch.tensor(1.0), constraint=dist.constraints.positive)))

            pyro.clear_param_store()
            svi = SVI(model, guide, Adam({"lr": 0.01}), loss=Trace_ELBO())
            train_losses = []
            val_losses = []

            for ep in range(1, n_epochs + 1):
                train_loss = svi.step(X_train_t, y_train_t)
                val_loss = svi.evaluate_loss(X_val_t, y_val_t)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                if ep % 500 == 0 or ep == n_epochs:
                    print(f"Epoch {ep}: Train Loss={train_loss:.1f}, Val Loss={val_loss:.1f}")

            final_val_loss = svi.evaluate_loss(X_val_t, y_val_t)
            if final_val_loss < best_val_loss:
                best_val_loss = final_val_loss
                best_model_params = {
                    'scaler': scaler,
                    'weights': pyro.param("weights_loc").detach().numpy(),
                    'intercept': pyro.param("intercept_loc").item()
                }
                best_train_losses = train_losses
                best_val_losses = val_losses


        # Evaluate on outer_val using best model
        X_outer = outer_val[features].values
        y_outer_log = np.log1p(outer_val['Damage to Property'].values)
        scaler = best_model_params['scaler']
        X_outer_scaled = scaler.transform(X_outer)
        y_pred_log = best_model_params['intercept'] + X_outer_scaled @ best_model_params['weights']
        y_pred_orig = np.expm1(y_pred_log)
        y_true_orig = np.expm1(y_outer_log)

        mae_log, rmse_log = safe_metrics(y_outer_log, y_pred_log)
        mae_orig, rmse_orig = safe_metrics(y_true_orig, y_pred_orig)

        error_df = pd.DataFrame({
            'true_log': y_outer_log,
            'pred_log': y_pred_log,
            'true_orig': y_true_orig,
            'pred_orig': y_pred_orig,
            'Latitude': outer_val['Latitude'].values,
            'Longitude': outer_val['Longitude'].values,
            'outer_fold': outer_fold

        })

        error_df['abs_error_log'] = np.abs(error_df['true_log'] - error_df['pred_log'])
        error_df['rmse_error_log'] = (error_df['true_log'] - error_df['pred_log']) ** 2
        error_df['abs_error_orig'] = np.abs(error_df['true_orig'] - error_df['pred_orig'])
        error_df['rmse_error_orig'] = (error_df['true_orig'] - error_df['pred_orig']) ** 2

        all_error_dfs.append(error_df)

        outer_results.append({
        'fold': outer_fold,
        'mae_log': mae_log,
        'rmse_log': rmse_log,
        'mae_orig': mae_orig,
        'rmse_orig': rmse_orig,
        'samples': len(y_true_orig),
        'train_losses': best_train_losses,
        'val_losses': best_val_losses
    })

        # Plot and save map
        def save_cartopy_scatter(df, value_col, title, filename, cmap, vmax):
            fig = plt.figure(figsize=(10, 7))
            ax = plt.axes(projection=ccrs.PlateCarree())
            ax.set_extent([df['Longitude'].min() - 1, df['Longitude'].max() + 1,
                           df['Latitude'].min() - 1, df['Latitude'].max() + 1], crs=ccrs.PlateCarree())
            ax.add_feature(cfeature.STATES.with_scale('50m'), edgecolor='black', linewidth=0.6)
            sc = ax.scatter(df['Longitude'], df['Latitude'],
                            c=df[value_col], cmap=cmap, s=30, alpha=0.85,
                            transform=ccrs.PlateCarree(), vmin=0, vmax=vmax)
            cbar = plt.colorbar(sc, ax=ax, orientation='vertical', shrink=0.8, pad=0.02)
            cbar.set_label(title, fontsize=12)
            cbar.ax.tick_params(labelsize=10)
            ax.set_title(title, fontsize=14)
            ax.set_xlabel("Longitude")
            ax.set_ylabel("Latitude")
            plt.tight_layout()
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            plt.close()

        vmax_log = max(error_df['abs_error_log'].max(), np.sqrt(error_df['rmse_error_log']).max())
        vmax_orig = max(error_df['abs_error_orig'].max(), np.sqrt(error_df['rmse_error_orig']).max())

        save_cartopy_scatter(error_df, 'abs_error_log', f'Fold {outer_fold} - Log MAE', f'M4_fold{outer_fold}_log_mae.png', 'viridis', vmax_log)
        save_cartopy_scatter(error_df, 'rmse_error_log', f'Fold {outer_fold} - Log RMSE', f'M4_fold{outer_fold}_log_rmse.png', 'viridis', vmax_log)
        save_cartopy_scatter(error_df, 'abs_error_orig', f'Fold {outer_fold} - MAE', f'M4_fold{outer_fold}_mae.png', 'plasma', vmax_orig)
        save_cartopy_scatter(error_df, 'rmse_error_orig', f'Fold {outer_fold} - RMSE', f'M4_fold{outer_fold}_rmse.png', 'plasma', vmax_orig)

    results_df = pd.DataFrame(outer_results)
    full_error_df = pd.concat(all_error_dfs, ignore_index=True)
    return results_df, full_error_df


results_df_M4, full_error_df_M4 = nested_spatial_cv_evaluation(
    train_data=train_full,
    features=features,
    outer_k=8,
    inner_k=4,
    n_epochs=2000
)


=== OUTER Fold 1/8 ===

--- INNER Fold 1/4 ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outer_train['inner_cluster'] = inner_kmeans.labels_


Epoch 500: Train Loss=2732071.8, Val Loss=6041.4
Epoch 1000: Train Loss=26464.1, Val Loss=10935.1
Epoch 1500: Train Loss=65671.5, Val Loss=11982.8
Epoch 2000: Train Loss=66312364.8, Val Loss=830.6

--- INNER Fold 2/4 ---
Epoch 500: Train Loss=201232.5, Val Loss=16295.3
Epoch 1000: Train Loss=15496833.2, Val Loss=7889.7
Epoch 1500: Train Loss=244506.8, Val Loss=9297.7
Epoch 2000: Train Loss=496208.2, Val Loss=2395.6

--- INNER Fold 3/4 ---
Epoch 500: Train Loss=1426559.1, Val Loss=123855.5
Epoch 1000: Train Loss=148656.4, Val Loss=7399.2
Epoch 1500: Train Loss=299136.2, Val Loss=8921.9
Epoch 2000: Train Loss=9107.4, Val Loss=2129.7

--- INNER Fold 4/4 ---
Epoch 500: Train Loss=8022.0, Val Loss=24801.2
Epoch 1000: Train Loss=4933.7, Val Loss=32296.9
Epoch 1500: Train Loss=1118756.9, Val Loss=4147.2
Epoch 2000: Train Loss=8752.7, Val Loss=36584.3

=== OUTER Fold 2/8 ===

--- INNER Fold 1/4 ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outer_train['inner_cluster'] = inner_kmeans.labels_


Epoch 500: Train Loss=29116.6, Val Loss=7769.1
Epoch 1000: Train Loss=330229784.7, Val Loss=11751.6
Epoch 1500: Train Loss=2601166.1, Val Loss=113444.7
Epoch 2000: Train Loss=1207359.9, Val Loss=7101.5

--- INNER Fold 2/4 ---
Epoch 500: Train Loss=517622.3, Val Loss=770659.5
Epoch 1000: Train Loss=3589.0, Val Loss=3905.1
Epoch 1500: Train Loss=13670.0, Val Loss=1548.9
Epoch 2000: Train Loss=7802.7, Val Loss=1730.4

--- INNER Fold 3/4 ---
Epoch 500: Train Loss=34572.3, Val Loss=3467.5
Epoch 1000: Train Loss=44966.2, Val Loss=7127.1
Epoch 1500: Train Loss=4716.0, Val Loss=5087.4
Epoch 2000: Train Loss=98197166.7, Val Loss=65580.1

--- INNER Fold 4/4 ---
Epoch 500: Train Loss=120207.2, Val Loss=3463835.5
Epoch 1000: Train Loss=3312.5, Val Loss=90917.0
Epoch 1500: Train Loss=3815.7, Val Loss=378.5
Epoch 2000: Train Loss=24824.8, Val Loss=1041.0

=== OUTER Fold 3/8 ===

--- INNER Fold 1/4 ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outer_train['inner_cluster'] = inner_kmeans.labels_


Epoch 500: Train Loss=4408.6, Val Loss=23639.6
Epoch 1000: Train Loss=235177.4, Val Loss=21984.9
Epoch 1500: Train Loss=2919.4, Val Loss=376517.6
Epoch 2000: Train Loss=6182.0, Val Loss=8279.9

--- INNER Fold 2/4 ---
Epoch 500: Train Loss=4046.1, Val Loss=10192.9
Epoch 1000: Train Loss=37753.7, Val Loss=8767.0
Epoch 1500: Train Loss=5601.7, Val Loss=6976.0
Epoch 2000: Train Loss=2382.9, Val Loss=2591.8

--- INNER Fold 3/4 ---
Epoch 500: Train Loss=1319499.3, Val Loss=10349.4
Epoch 1000: Train Loss=11074.2, Val Loss=2168313.1
Epoch 1500: Train Loss=2832980.5, Val Loss=18890.0
Epoch 2000: Train Loss=11461.5, Val Loss=6508.1

--- INNER Fold 4/4 ---
Epoch 500: Train Loss=116133.9, Val Loss=342598.0
Epoch 1000: Train Loss=3302.5, Val Loss=537.6
Epoch 1500: Train Loss=31632.7, Val Loss=481.9
Epoch 2000: Train Loss=4209.3, Val Loss=399.4

=== OUTER Fold 4/8 ===

--- INNER Fold 1/4 ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outer_train['inner_cluster'] = inner_kmeans.labels_


Epoch 500: Train Loss=4835.0, Val Loss=35331928.8
Epoch 1000: Train Loss=153264.4, Val Loss=52272.1
Epoch 1500: Train Loss=4728.9, Val Loss=692.1
Epoch 2000: Train Loss=3602.8, Val Loss=673.8

--- INNER Fold 2/4 ---
Epoch 500: Train Loss=27886.9, Val Loss=68513.3
Epoch 1000: Train Loss=8708.5, Val Loss=1908.2
Epoch 1500: Train Loss=5874.7, Val Loss=5165.3
Epoch 2000: Train Loss=8554.3, Val Loss=10929.5

--- INNER Fold 3/4 ---
Epoch 500: Train Loss=701488.9, Val Loss=4448.6
Epoch 1000: Train Loss=21033.6, Val Loss=17406.5
Epoch 1500: Train Loss=4548.9, Val Loss=6524.9
Epoch 2000: Train Loss=15796754.4, Val Loss=728.6

--- INNER Fold 4/4 ---
Epoch 500: Train Loss=435688.0, Val Loss=254924.1
Epoch 1000: Train Loss=12207.6, Val Loss=22310.8
Epoch 1500: Train Loss=14735.5, Val Loss=3972206.1
Epoch 2000: Train Loss=8022.7, Val Loss=107639.9

=== OUTER Fold 5/8 ===

--- INNER Fold 1/4 ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outer_train['inner_cluster'] = inner_kmeans.labels_


Epoch 500: Train Loss=5356.0, Val Loss=4615.8
Epoch 1000: Train Loss=11488.0, Val Loss=1908.1
Epoch 1500: Train Loss=2887.9, Val Loss=1317.3
Epoch 2000: Train Loss=11176.5, Val Loss=33393.9

--- INNER Fold 2/4 ---
Epoch 500: Train Loss=16527.1, Val Loss=403793.5
Epoch 1000: Train Loss=27631.1, Val Loss=1244.1
Epoch 1500: Train Loss=11718.7, Val Loss=1073.4
Epoch 2000: Train Loss=9823.6, Val Loss=1652.0

--- INNER Fold 3/4 ---
Epoch 500: Train Loss=7661.6, Val Loss=3297.6
Epoch 1000: Train Loss=6391315.3, Val Loss=4303.4
Epoch 1500: Train Loss=6391.7, Val Loss=1306.1
Epoch 2000: Train Loss=1957.6, Val Loss=1063.5

--- INNER Fold 4/4 ---
Epoch 500: Train Loss=68144.9, Val Loss=17441.7
Epoch 1000: Train Loss=256443.1, Val Loss=3803.8
Epoch 1500: Train Loss=4326733.7, Val Loss=2657.5
Epoch 2000: Train Loss=7962.5, Val Loss=2033.4

=== OUTER Fold 6/8 ===

--- INNER Fold 1/4 ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outer_train['inner_cluster'] = inner_kmeans.labels_


Epoch 500: Train Loss=9415.9, Val Loss=2042.3
Epoch 1000: Train Loss=21420.9, Val Loss=80891.7
Epoch 1500: Train Loss=13679.0, Val Loss=482.2
Epoch 2000: Train Loss=15059.1, Val Loss=39387.8

--- INNER Fold 2/4 ---
Epoch 500: Train Loss=150263.9, Val Loss=45931.0
Epoch 1000: Train Loss=2480.4, Val Loss=105244.7
Epoch 1500: Train Loss=3487.8, Val Loss=4027.9
Epoch 2000: Train Loss=5024.4, Val Loss=8985.0

--- INNER Fold 3/4 ---
Epoch 500: Train Loss=19259.7, Val Loss=78845.7
Epoch 1000: Train Loss=860904.5, Val Loss=637462.5
Epoch 1500: Train Loss=24078.6, Val Loss=2299.4
Epoch 2000: Train Loss=4677.3, Val Loss=711.0

--- INNER Fold 4/4 ---
Epoch 500: Train Loss=387989.4, Val Loss=5771465715.0
Epoch 1000: Train Loss=2043.0, Val Loss=284389.5
Epoch 1500: Train Loss=153976.5, Val Loss=13924.0
Epoch 2000: Train Loss=620830.2, Val Loss=1598.5

=== OUTER Fold 7/8 ===

--- INNER Fold 1/4 ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outer_train['inner_cluster'] = inner_kmeans.labels_


Epoch 500: Train Loss=10455.3, Val Loss=6615.8
Epoch 1000: Train Loss=3849.3, Val Loss=3130226.6
Epoch 1500: Train Loss=9006.3, Val Loss=8936.5
Epoch 2000: Train Loss=1090351.3, Val Loss=980811.5

--- INNER Fold 2/4 ---
Epoch 500: Train Loss=146330.4, Val Loss=3874.8
Epoch 1000: Train Loss=14767.4, Val Loss=16953.4
Epoch 1500: Train Loss=61729.8, Val Loss=534067.3
Epoch 2000: Train Loss=104099.0, Val Loss=1421.8

--- INNER Fold 3/4 ---
Epoch 500: Train Loss=131796.9, Val Loss=10060.6
Epoch 1000: Train Loss=10617.1, Val Loss=16488.1
Epoch 1500: Train Loss=11882.2, Val Loss=86218.7
Epoch 2000: Train Loss=52033.4, Val Loss=2380.2

--- INNER Fold 4/4 ---
Epoch 500: Train Loss=66773.7, Val Loss=2172.6
Epoch 1000: Train Loss=321117.2, Val Loss=4054.4
Epoch 1500: Train Loss=10655.2, Val Loss=5358.0
Epoch 2000: Train Loss=25731.3, Val Loss=5488.7

=== OUTER Fold 8/8 ===

--- INNER Fold 1/4 ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outer_train['inner_cluster'] = inner_kmeans.labels_


Epoch 500: Train Loss=11753.4, Val Loss=306365.5
Epoch 1000: Train Loss=4872.4, Val Loss=1586.3
Epoch 1500: Train Loss=3786.3, Val Loss=12725.1
Epoch 2000: Train Loss=15091.6, Val Loss=38095.2

--- INNER Fold 2/4 ---
Epoch 500: Train Loss=1166972.4, Val Loss=16021.9
Epoch 1000: Train Loss=265482.2, Val Loss=1953.6
Epoch 1500: Train Loss=8764.5, Val Loss=649649.6
Epoch 2000: Train Loss=7716.8, Val Loss=64533.6

--- INNER Fold 3/4 ---
Epoch 500: Train Loss=172581.6, Val Loss=6999.5
Epoch 1000: Train Loss=58504.8, Val Loss=5047.9
Epoch 1500: Train Loss=509330.1, Val Loss=4426.6
Epoch 2000: Train Loss=257922.0, Val Loss=12345.2

--- INNER Fold 4/4 ---
Epoch 500: Train Loss=250162.3, Val Loss=2362.5
Epoch 1000: Train Loss=34012.2, Val Loss=3301.2
Epoch 1500: Train Loss=74873.8, Val Loss=5949.5
Epoch 2000: Train Loss=39605.0, Val Loss=2275.2


In [7]:
import os

# Create the csv directory if it doesn't exist
os.makedirs('csv', exist_ok=True)

# Save the DataFrames
results_df_M4.to_csv('csv/results_df_M4.csv', index=True)
full_error_df_M4.to_csv('csv/full_error_df_M4.csv', index=True)

In [8]:
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature

def plot_combined_log_mae(error_df, output_file="M4_combined_log_mae.png"):
    vmax = error_df['abs_error_log'].max()
    
    fig = plt.figure(figsize=(10, 7))
    ax = plt.axes(projection=ccrs.PlateCarree())
    ax.set_extent([error_df['Longitude'].min() - 1, error_df['Longitude'].max() + 1,
                   error_df['Latitude'].min() - 1, error_df['Latitude'].max() + 1])
    ax.add_feature(cfeature.STATES.with_scale('50m'), edgecolor='black', linewidth=0.6)

    sc = ax.scatter(error_df['Longitude'], error_df['Latitude'],
                    c=error_df['abs_error_log'], cmap='viridis', s=30, alpha=0.8,
                    transform=ccrs.PlateCarree(), vmin=0, vmax=vmax)

    cbar = plt.colorbar(sc, ax=ax, orientation='vertical', shrink=0.8, pad=0.02)
    cbar.set_label("Log MAE", fontsize=12)
    ax.set_title("Log MAE Error (All Outer Folds Combined)", fontsize=14)
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    plt.tight_layout()
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved combined error map: {output_file}")

plot_combined_log_mae(full_error_df_M4)


Saved combined error map: M4_combined_log_mae.png


In [9]:
# Comment out the following to look at the loss curves for outer clusters
# def plot_loss_curves(outer_results, output_prefix="fold"):
#     for result in outer_results:
#         train_losses = result['train_losses']
#         val_losses = result['val_losses']
#         fold = result['fold']

#         plt.figure(figsize=(8, 5))
#         plt.plot(train_losses, label='Train Loss', color='blue')
#         plt.plot(val_losses, label='Validation Loss', color='orange')
#         plt.xlabel("Epoch")
#         plt.ylabel("ELBO Loss")
#         plt.title(f"Train vs Validation Loss - Outer Fold {fold}")
#         plt.legend()
#         plt.grid(True)
#         plt.tight_layout()
#         fname = f"M4_{output_prefix}{fold}_loss_curve.png"
#         plt.savefig(fname, dpi=300)
#         plt.close()
#         print(f"Saved: {fname}")
        
# plot_loss_curves(results_df.to_dict('records'))
