In [24]:
# =============================================================================
# Cell 1: Imports
# =============================================================================
import os
import json
import numpy as np
import pandas as pd
import torch
import optuna
from pprint import pprint # For pretty printing dicts
import copy

# Import project modules
# It's good practice to reload modules if you make changes during development
import importlib
import utils
import data_handling
import core_nn
import hpo
import model_handlers
import metrics

importlib.reload(utils)
importlib.reload(data_handling)
importlib.reload(core_nn)
importlib.reload(hpo)
importlib.reload(model_handlers)
importlib.reload(metrics)

# Import specific classes/functions for convenience
from data_handling import DataHandler
from hpo import HyperparameterConfig, HyperparameterTuner, ObjectiveNN, ObjectiveXGBoost
from model_handlers import RidgeModel, XGBoostModel, NNModel
from metrics import evaluate_predictions

print("Imports successful.")
print(f"Using device: {utils.DEVICE}")

Using device: mps
Imports successful.
Using device: mps


In [17]:
# =============================================================================
# Cell 2: Global Constants and Random Seed
# =============================================================================

# --- Configuration ---
SEED = 42
TEST_YEAR = 2020 # Year held out for final testing
BATCH_SIZE = 128 # Batch size for DataLoaders

# --- Paths ---
# Assumes notebook is run from the project root directory
DATA_DIR = "./data"
MODELS_DIR = "./models"
RESULTS_DIR = "./results"
DATA_CSV_PATH = os.path.join(DATA_DIR, 'final_dataset.csv')
OPTUNA_DB_PATH = os.path.join(RESULTS_DIR, "hpo_studies.db") # SQLite path for Optuna persistence

# Ensure directories exist
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

# --- Base Optuna Study Name ---
BASE_STUDY_NAME = f"election_pred_{TEST_YEAR}"

# --- Set Random Seeds ---
np.random.seed(SEED)
torch.manual_seed(SEED)
if utils.DEVICE.type == 'cuda':
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED) # if using multi-GPU
# Optional: For full reproducibility, disable certain cuDNN algorithms
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False

print("Constants and random seeds set.")
print(f"Data Path: {DATA_CSV_PATH}")
print(f"Models Dir: {MODELS_DIR}")
print(f"Results Dir: {RESULTS_DIR}")
print(f"Optuna DB: {OPTUNA_DB_PATH}")
print(f"Test Year: {TEST_YEAR}")

Constants and random seeds set.
Data Path: ./data/final_dataset.csv
Models Dir: ./models
Results Dir: ./results
Optuna DB: ./results/hpo_studies.db
Test Year: 2020


In [18]:
# =============================================================================
# Cell 3: Instantiate DataHandler and HyperparameterConfig
# =============================================================================

# Instantiate DataHandler (loads data, creates splits, scalers, dataloaders)
dh = DataHandler(
    data_csv_path=DATA_CSV_PATH,
    test_year=TEST_YEAR,
    batch_size=BATCH_SIZE
)

# Instantiate HyperparameterConfig (holds default search spaces)
hp_config = HyperparameterConfig()

print("\nDataHandler and HyperparameterConfig instantiated.")
print(f"Input dimension for models: {dh.input_dim}")

# Display default XGBoost space as an example
hp_config.display_space("XGBoost")

DataHandler initialized:
  Data path: ./data/final_dataset.csv
  Using 115 features.
  Test year: 2020
  Batch size: 128
  Datasets and DataLoaders created for CV and final training.

DataHandler and HyperparameterConfig instantiated.
Input dimension for models: 115

--- Active Search Space for 'XGBoost' ---
{
  "eta": {
    "type": "float",
    "low": 0.01,
    "high": 0.3,
    "log": true
  },
  "max_depth": {
    "type": "int",
    "low": 3,
    "high": 10,
    "log": false
  },
  "subsample": {
    "type": "float",
    "low": 0.5,
    "high": 1.0,
    "log": false
  },
  "colsample_bytree": {
    "type": "float",
    "low": 0.5,
    "high": 1.0,
    "log": false
  },
  "gamma": {
    "type": "float",
    "low": 0.0,
    "high": 5.0,
    "log": false
  },
  "lambda": {
    "type": "float",
    "low": 0.01,
    "high": 10.0,
    "log": true
  },
  "alpha": {
    "type": "float",
    "low": 0.01,
    "high": 10.0,
    "log": true
  }
}
-------------------------------------


In [19]:
# =============================================================================
# Cell 4: Ridge - Cross-Validation (Grid Search)
# =============================================================================
print("\n--- Starting Ridge Cross-Validation ---")

# Instantiate Ridge Model handler
ridge_model = RidgeModel(model_dir=MODELS_DIR, results_dir=RESULTS_DIR)

# Define the parameter grid for Ridge's alpha
# Note: This is passed explicitly now, not a default in the class
ridge_param_grid = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0]
print(f"Ridge alpha grid: {ridge_param_grid}")

# Run cross-validation (uses internal weighted MSE grid search)
ridge_model.cross_validate(dh=dh, param_grid=ridge_param_grid)

print("\n--- Finished Ridge Cross-Validation ---")


--- Starting Ridge Cross-Validation ---
Ridge alpha grid: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0]

--- Starting Cross-Validation for RIDGE ---
  Testing alpha = 0.001:
 Fold 1 - Train: [2008, 2012] - Val: 2016 - Val Loss (W-MSE): 0.001199
 Fold 2 - Train: [2008, 2016] - Val: 2012 - Val Loss (W-MSE): 0.001349
 Fold 3 - Train: [2012, 2016] - Val: 2008 - Val Loss (W-MSE): 0.002165
    Avg Val Loss (W-MSE): 0.001571
  Testing alpha = 0.005:
 Fold 1 - Train: [2008, 2012] - Val: 2016 - Val Loss (W-MSE): 0.001178
 Fold 2 - Train: [2008, 2016] - Val: 2012 - Val Loss (W-MSE): 0.001376
 Fold 3 - Train: [2012, 2016] - Val: 2008 - Val Loss (W-MSE): 0.002118
    Avg Val Loss (W-MSE): 0.001557
  Testing alpha = 0.01:
 Fold 1 - Train: [2008, 2012] - Val: 2016 - Val Loss (W-MSE): 0.001173
 Fold 2 - Train: [2008, 2016] - Val: 2012 - Val Loss (W-MSE): 0.001375
 Fold 3 - Train: [2012, 2016] - Val: 2008 - Val Loss (W-MSE): 0.002075
    Avg Val Loss (W-MSE): 0.001541
  Testing alpha = 0.05:
 Fold 1

In [20]:
# =============================================================================
# Cell 5: Ridge - Final Training
# =============================================================================
print("\n--- Starting Ridge Final Training ---")

# Instantiate handler again (or reuse if previous cell run in sequence)
ridge_model = RidgeModel(model_dir=MODELS_DIR, results_dir=RESULTS_DIR)

# Train the final model using the best alpha found during CV
# The handler loads the best alpha from the saved CSV internally
trained_ridge = ridge_model.train_final_model(dh=dh)

print(f"Trained Ridge Model Alpha: {ridge_model.best_alpha}") # Access learned alpha
print("\n--- Finished Ridge Final Training ---")


--- Starting Ridge Final Training ---

--- Starting Final Model Training for RIDGE ---
Using best alpha: 0.05
Loaded final training data: 9270 samples.
Ridge model fitting complete.
Saved final trained Ridge model to: ./models/ridge_final_model.joblib
--- Finished Final Model Training for RIDGE ---
Trained Ridge Model Alpha: 0.05

--- Finished Ridge Final Training ---


In [22]:
# =============================================================================
# Cell 6: XGBoost - Cross-Validation (Optuna)
# =============================================================================
print("\n--- Starting XGBoost Cross-Validation (Optuna) ---")

# --- HPO Configuration ---
N_TRIALS_XGB = 30 # Number of Optuna trials (adjust as needed)
XGB_STUDY_NAME = f"{BASE_STUDY_NAME}_XGBoost"
# Use persistent storage (SQLite)
OPTUNA_STORAGE = f"sqlite:///{OPTUNA_DB_PATH}"

# Instantiate XGBoost Model handler
xgb_model = XGBoostModel(model_dir=MODELS_DIR, results_dir=RESULTS_DIR)

# Get default search space
xgb_search_space = hp_config.get_space("XGBoost")
print("XGBoost Search Space:")
pprint(xgb_search_space)

# Define fixed parameters for the Objective function
# These are not tuned by Optuna but needed for training within a trial
fixed_params_xgb = {
    'n_estimators_max_pruning': 150, # Max estimators during HPO (early stopping determines actual)
    'early_stopping_rounds': 25     # Patience for early stopping during HPO
}
print("\nXGBoost Fixed HPO Params:")
pprint(fixed_params_xgb)

# Instantiate XGBoost Objective
objective_xgb = ObjectiveXGBoost(
    data_handler=dh,
    search_space=xgb_search_space,
    fixed_params=fixed_params_xgb
)

# Instantiate Optuna Tuner
# Consider using a pruner like MedianPruner or SuccessiveHalvingPruner
tuner_xgb = HyperparameterTuner(
    study_name=XGB_STUDY_NAME,
    storage_path=OPTUNA_STORAGE,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5) # Example pruner
)

# Run Optuna cross-validation via the model handler
xgb_best_params = xgb_model.cross_validate(
    dh=dh,
    tuner=tuner_xgb,
    objective_xgb=objective_xgb,
    n_trials=N_TRIALS_XGB
)

print("\nBest XGBoost HPO Params Found:")
pprint(xgb_best_params)
# Optionally save Optuna plots
tuner_xgb.save_plots(results_dir=os.path.join(RESULTS_DIR, "optuna_plots_xgb"))

print("\n--- Finished XGBoost Cross-Validation (Optuna) ---")

[I 2025-04-28 10:18:37,007] A new study created in RDB with name: election_pred_2020_XGBoost
[I 2025-04-28 10:18:37,058] Trial 0 finished with value: inf and parameters: {'eta': 0.02249318799978266, 'max_depth': 9, 'subsample': 0.8576984159558247, 'colsample_bytree': 0.9728885904547244, 'gamma': 1.0006443209069638, 'lambda': 0.012703997493923093, 'alpha': 0.04731262377511004}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,083] Trial 1 finished with value: inf and parameters: {'eta': 0.08565084790293678, 'max_depth': 8, 'subsample': 0.6153183499828112, 'colsample_bytree': 0.6518853012774333, 'gamma': 1.3050969659968175, 'lambda': 0.0130566087654922, 'alpha': 0.1545768802817256}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,111] Trial 2 finished with value: inf and parameters: {'eta': 0.047741144264486904, 'max_depth': 4, 'subsample': 0.8205057484142027, 'colsample_bytree': 0.5724266338306842, 'gamma': 4.951262805631169, 'lambda': 2.824386878746777, 'alpha': 0.013926


--- Starting XGBoost Cross-Validation (Optuna) ---
XGBoost Search Space:
{'alpha': {'high': 10.0, 'log': True, 'low': 0.01, 'type': 'float'},
 'colsample_bytree': {'high': 1.0, 'log': False, 'low': 0.5, 'type': 'float'},
 'eta': {'high': 0.3, 'log': True, 'low': 0.01, 'type': 'float'},
 'gamma': {'high': 5.0, 'log': False, 'low': 0.0, 'type': 'float'},
 'lambda': {'high': 10.0, 'log': True, 'low': 0.01, 'type': 'float'},
 'max_depth': {'high': 10, 'log': False, 'low': 3, 'type': 'int'},
 'subsample': {'high': 1.0, 'log': False, 'low': 0.5, 'type': 'float'}}

XGBoost Fixed HPO Params:
{'early_stopping_rounds': 25, 'n_estimators_max_pruning': 150}

--- Starting Cross-Validation (Optuna) for XGBOOST ---

--- Starting Optuna HPO ---
Study Name: election_pred_2020_XGBoost, N Trials: 30, Pruner: MedianPruner
Direction: minimize, Storage: sqlite:///./results/hpo_studies.db


[I 2025-04-28 10:18:37,165] Trial 4 finished with value: inf and parameters: {'eta': 0.1741186618612467, 'max_depth': 9, 'subsample': 0.5782986897533768, 'colsample_bytree': 0.6875102682334095, 'gamma': 2.7996485160363687, 'lambda': 0.015968822179734297, 'alpha': 1.7888424561995238}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,194] Trial 5 finished with value: inf and parameters: {'eta': 0.026231885985259933, 'max_depth': 10, 'subsample': 0.669521171298662, 'colsample_bytree': 0.8074807591142663, 'gamma': 4.2817577525107575, 'lambda': 0.012873937985957906, 'alpha': 0.30233349711403756}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,219] Trial 6 finished with value: inf and parameters: {'eta': 0.03699890244669174, 'max_depth': 4, 'subsample': 0.8665765961325884, 'colsample_bytree': 0.8153650738183034, 'gamma': 2.248701949227787, 'lambda': 0.06073132108172165, 'alpha': 0.016388359659486557}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,244] Trial 7 finish



[I 2025-04-28 10:18:37,375] Trial 11 finished with value: inf and parameters: {'eta': 0.09885343236619061, 'max_depth': 8, 'subsample': 0.8265438250205207, 'colsample_bytree': 0.6020653581195021, 'gamma': 1.4075846209693343, 'lambda': 0.04298505336468727, 'alpha': 0.08558763370345739}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,413] Trial 12 finished with value: inf and parameters: {'eta': 0.08053523399655324, 'max_depth': 8, 'subsample': 0.9356582630710055, 'colsample_bytree': 0.5023821213472339, 'gamma': 1.3344441797279611, 'lambda': 0.012680850815048215, 'alpha': 0.12184965642561894}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,448] Trial 13 finished with value: inf and parameters: {'eta': 0.11898116320355681, 'max_depth': 7, 'subsample': 0.7477903504661084, 'colsample_bytree': 0.6609097072681406, 'gamma': 0.7820710188119517, 'lambda': 0.14404612310022144, 'alpha': 0.02906196929587958}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,482] Trial 14 fi



[I 2025-04-28 10:18:37,593] Trial 17 finished with value: inf and parameters: {'eta': 0.03372004670560201, 'max_depth': 10, 'subsample': 0.5204525490459304, 'colsample_bytree': 0.871791291769709, 'gamma': 0.8672221413492733, 'lambda': 0.026270487557524803, 'alpha': 9.607390909241627}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,633] Trial 18 finished with value: inf and parameters: {'eta': 0.01591576884203744, 'max_depth': 3, 'subsample': 0.6990764719567624, 'colsample_bytree': 0.5103385995103238, 'gamma': 1.9774555744987299, 'lambda': 0.10210997677349146, 'alpha': 0.5430677731198936}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,668] Trial 19 finished with value: inf and parameters: {'eta': 0.24792711019383715, 'max_depth': 8, 'subsample': 0.6289621568380674, 'colsample_bytree': 0.7570430763346889, 'gamma': 1.6118728662352992, 'lambda': 0.028712742565711748, 'alpha': 0.03328536574463058}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,703] Trial 20 fini



[I 2025-04-28 10:18:37,848] Trial 21 finished with value: inf and parameters: {'eta': 0.053500985952765354, 'max_depth': 3, 'subsample': 0.7957334149534441, 'colsample_bytree': 0.578728542112022, 'gamma': 4.9525864713463905, 'lambda': 4.524742761260774, 'alpha': 0.010170699424689324}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,888] Trial 22 finished with value: inf and parameters: {'eta': 0.0440554238879644, 'max_depth': 5, 'subsample': 0.8468695346367127, 'colsample_bytree': 0.5826731999238747, 'gamma': 4.141057891064908, 'lambda': 1.351674220258137, 'alpha': 0.014587209186012811}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,923] Trial 23 finished with value: inf and parameters: {'eta': 0.08472833657469558, 'max_depth': 5, 'subsample': 0.9083759859145203, 'colsample_bytree': 0.5592122342516892, 'gamma': 0.9721348245663339, 'lambda': 1.0202330407597289, 'alpha': 0.02180307092081524}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:37,959] Trial 24 finished



[I 2025-04-28 10:18:38,042] Trial 26 finished with value: inf and parameters: {'eta': 0.01633190690052563, 'max_depth': 6, 'subsample': 0.8210744169903788, 'colsample_bytree': 0.6232643778759052, 'gamma': 2.159815320529793, 'lambda': 0.21223365121514104, 'alpha': 0.010123178030444817}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:38,089] Trial 27 finished with value: inf and parameters: {'eta': 0.010289676504867253, 'max_depth': 4, 'subsample': 0.7553684486397759, 'colsample_bytree': 0.7176859276797609, 'gamma': 4.903984418543287, 'lambda': 3.425342010586073, 'alpha': 0.09219370616864524}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:38,125] Trial 28 finished with value: inf and parameters: {'eta': 0.19413589944664883, 'max_depth': 8, 'subsample': 0.8981537314364815, 'colsample_bytree': 0.5607216470651115, 'gamma': 1.074311095577773, 'lambda': 2.291494687124606, 'alpha': 0.04631319034093134}. Best is trial 0 with value: inf.
[I 2025-04-28 10:18:38,162] Trial 29 finished


--- Optuna HPO Finished ---
Best Trial: 0, Best Value: inf
Best Parameters: {
  "eta": 0.02249318799978266,
  "max_depth": 9,
  "subsample": 0.8576984159558247,
  "colsample_bytree": 0.9728885904547244,
  "gamma": 1.0006443209069638,
  "lambda": 0.012703997493923093,
  "alpha": 0.04731262377511004
}
------------------------------

Best XGBOOST CV params found.
Best XGBOOST CV score (weighted_mse): inf
Saved best HPO parameters to: ./results/xgboost_best_params.json
--- Finished Cross-Validation for XGBOOST ---

Best XGBoost HPO Params Found:
{'alpha': 0.04731262377511004,
 'colsample_bytree': 0.9728885904547244,
 'eta': 0.02249318799978266,
 'gamma': 1.0006443209069638,
 'lambda': 0.012703997493923093,
 'max_depth': 9,
 'subsample': 0.8576984159558247}


[W 2025-04-28 10:18:41,330] Trial 0 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:18:41,330] Trial 1 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:18:41,331] Trial 2 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:18:41,331] Trial 3 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:18:41,331] Trial 4 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:18:41,331] Trial 5 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:18:41,331] Trial 6 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:18:41,331] Trial 7 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:18:41,332] Trial 8 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:18:41,332] Trial 9 i

Saving Optuna plots to: ./results/optuna_plots_xgb
  Could not save plot 'optimization_history': 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

  Could not save plot 'param_importances': 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

  Could not save plot 'slice_plot': 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

Finished saving plots.

--- Finished XGBoost Cross-Validation (Optuna) ---


In [None]:
# =============================================================================
# Cell 7: XGBoost - Final Training
# =============================================================================
print("\n--- Starting XGBoost Final Training ---")

# Instantiate handler again
xgb_model = XGBoostModel(model_dir=MODELS_DIR, results_dir=RESULTS_DIR)

# Define parameters for the final fit (e.g., potentially more patience)
# n_estimators will be set high, early stopping controls the effective number
final_fit_params_xgb = {
    'n_estimators': 1500,          # High value, overridden by early stopping
    'early_stopping_rounds': 50    # Patience for the final fit
}
print("XGBoost Final Fit Params:")
pprint(final_fit_params_xgb)

# Train the final model using best HPO parameters (loaded internally)
trained_xgb = xgb_model.train_final_model(
    dh=dh,
    final_fit_params=final_fit_params_xgb
)

print(f"\nTrained XGBoost Model Best Iteration: {trained_xgb.best_iteration}")
print("\n--- Finished XGBoost Final Training ---")

In [25]:
# =============================================================================
# Cell 8: NN (0 Hidden Layers) - Cross-Validation (Optuna)
# =============================================================================
print("\n--- Starting NN (0 Hidden Layers) Cross-Validation (Optuna) ---")

# --- Config ---
NUM_HIDDEN_LAYERS_NN0 = 0
N_TRIALS_NN0 = 30 # Adjust as needed
NN0_STUDY_NAME = f"{BASE_STUDY_NAME}_NN_0Layer"
OPTUNA_STORAGE = f"sqlite:///{OPTUNA_DB_PATH}" # Use same DB, different study name

# Instantiate NN Model handler
nn0_model = NNModel(input_dim=dh.input_dim, model_dir=MODELS_DIR, results_dir=RESULTS_DIR)
nn0_model.MODEL_NAME = "NN0" # Customize name for saving

# Get base NN search space and customize for 0 layers
nn_search_space_base = hp_config.get_space("NN")
nn0_search_space = copy.deepcopy(nn_search_space_base)
# Remove layer unit suggestions as they are not needed
nn0_search_space.pop("n_units_l0", None)
nn0_search_space.pop("n_units_l1", None)
# Add other layer units here if they exist in the base config
print("NN (0 Layers) Search Space:")
pprint(nn0_search_space)

# Define fixed parameters for the Objective
fixed_params_nn0 = {
    'num_hidden_layers': NUM_HIDDEN_LAYERS_NN0, # Crucial: Specify architecture
    'pruning_epochs': [10, 25, 50],     # Epochs to check for pruning
    'patience': 15                      # Early stopping patience within HPO fold
}
print("\nNN (0 Layers) Fixed HPO Params:")
pprint(fixed_params_nn0)

# Instantiate NN Objective
objective_nn0 = ObjectiveNN(
    data_handler=dh,
    search_space=nn0_search_space,
    fixed_params=fixed_params_nn0
)

# Instantiate Optuna Tuner
tuner_nn0 = HyperparameterTuner(
    study_name=NN0_STUDY_NAME,
    storage_path=OPTUNA_STORAGE,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5)
)

# Run Optuna cross-validation
nn0_best_params = nn0_model.cross_validate(
    dh=dh,
    tuner=tuner_nn0,
    objective_nn=objective_nn0,
    n_trials=N_TRIALS_NN0
)

print("\nBest NN (0 Layers) HPO Params Found:")
pprint(nn0_best_params)
# Optionally save Optuna plots
tuner_nn0.save_plots(results_dir=os.path.join(RESULTS_DIR, "optuna_plots_nn0"))

print("\n--- Finished NN (0 Hidden Layers) Cross-Validation (Optuna) ---")

[I 2025-04-28 10:27:58,855] A new study created in RDB with name: election_pred_2020_NN_0Layer



--- Starting NN (0 Hidden Layers) Cross-Validation (Optuna) ---
NN (0 Layers) Search Space:
{'activation': {'choices': ['ReLU', 'Tanh'], 'type': 'categorical'},
 'dropout_rate': {'high': 0.7, 'log': False, 'low': 0.0, 'type': 'float'},
 'learning_rate': {'high': 0.1, 'log': True, 'low': 1e-05, 'type': 'float'},
 'optimizer': {'choices': ['AdamW', 'Adam', 'SGD'], 'type': 'categorical'},
 'weight_decay': {'high': 0.1, 'log': False, 'low': 0.0, 'type': 'float'}}

NN (0 Layers) Fixed HPO Params:
{'num_hidden_layers': 0, 'patience': 15, 'pruning_epochs': [10, 25, 50]}

--- Starting Cross-Validation (Optuna) for NN0 ---

--- Starting Optuna HPO ---
Study Name: election_pred_2020_NN_0Layer, N Trials: 30, Pruner: MedianPruner
Direction: minimize, Storage: sqlite:///./results/hpo_studies.db



The reported value is ignored because this `step` 10 is already reported.


The reported value is ignored because this `step` 25 is already reported.


The reported value is ignored because this `step` 50 is already reported.

[I 2025-04-28 10:28:29,660] Trial 0 finished with value: 0.8453814832369485 and parameters: {'learning_rate': 0.0014398544281227442, 'weight_decay': 0.08324919213729007, 'dropout_rate': 0.5061931414196831, 'optimizer': 'AdamW', 'activation': 'ReLU'}. Best is trial 0 with value: 0.8453814832369485.
[I 2025-04-28 10:28:51,678] Trial 1 finished with value: 0.8514492964744568 and parameters: {'learning_rate': 0.0035303724522368963, 'weight_decay': 0.09375257828202677, 'dropout_rate': 0.6874196584023756, 'optimizer': 'Adam', 'activation': 'Tanh'}. Best is trial 0 with value: 0.8453814832369485.
[I 2025-04-28 10:29:08,558] Trial 2 finished with value: 0.8493451563517253 and parameters: {'learning_rate': 0.053656510186576724, 'weight_decay': 0.010424950007951551, 'drop


--- Optuna HPO Finished ---
Best Trial: 15, Best Value: 0.843291
Best Parameters: {
  "learning_rate": 0.007639990397085731,
  "weight_decay": 0.045359170224969145,
  "dropout_rate": 0.17829951430973262,
  "optimizer": "AdamW",
  "activation": "ReLU"
}
------------------------------

Best NN0 CV params found.
Best NN0 CV score (Loss): 0.843291
Saved best parameters and fixed HPO params to: ./results/nn_best_params.json
--- Finished Cross-Validation for NN0 ---

Best NN (0 Layers) HPO Params Found:
{'activation': 'ReLU',
 'dropout_rate': 0.17829951430973262,
 'learning_rate': 0.007639990397085731,
 'optimizer': 'AdamW',
 'weight_decay': 0.045359170224969145}


[W 2025-04-28 10:33:21,683] Trial 5 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:33:21,684] Trial 6 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:33:21,684] Trial 7 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:33:21,684] Trial 9 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:33:21,684] Trial 10 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:33:21,684] Trial 11 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:33:21,685] Trial 12 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:33:21,685] Trial 13 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:33:21,685] Trial 14 is omitted in visualization because its objective value is inf or nan.
[W 2025-04-28 10:33:21,685] Tria

Saving Optuna plots to: ./results/optuna_plots_nn0
  Could not save plot 'optimization_history': 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

  Could not save plot 'param_importances': 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

  Could not save plot 'slice_plot': 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

Finished saving plots.

--- Finished NN (0 Hidden Layers) Cross-Validation (Optuna) ---


In [26]:
# =============================================================================
# Cell 9: NN (0 Hidden Layers) - Final Training
# =============================================================================
print("\n--- Starting NN (0 Hidden Layers) Final Training ---")

# Instantiate handler again
nn0_model = NNModel(input_dim=dh.input_dim, model_dir=MODELS_DIR, results_dir=RESULTS_DIR)
nn0_model.MODEL_NAME = "NN0" # Match name used in CV for loading params

# Define final training parameters
final_epochs_nn0 = 200 # Adjust as needed
final_patience_nn0 = 40 # Adjust as needed

# Train the final model using best HPO parameters (loaded internally)
trained_nn0 = nn0_model.train_final_model(
    dh=dh,
    final_epochs=final_epochs_nn0,
    final_patience=final_patience_nn0
)

print("\n--- Finished NN (0 Hidden Layers) Final Training ---")


--- Starting NN (0 Hidden Layers) Final Training ---

--- Starting Final Model Training for NN0 ---
Attempting to load best parameters from: ./results/nn_best_params.json
Loaded best_params and fixed_params from file.
Using best hyperparameters: {'learning_rate': 0.007639990397085731, 'weight_decay': 0.045359170224969145, 'dropout_rate': 0.17829951430973262, 'optimizer': 'AdamW', 'activation': 'ReLU'}
Starting final training for max 200 epochs (patience=40)...
  Epoch 10/200 - Train Loss: 0.840892 - Val Loss: 0.865626
  Epoch 20/200 - Train Loss: 0.840547 - Val Loss: 0.874310
  Epoch 30/200 - Train Loss: 0.839897 - Val Loss: 0.877461
  Epoch 40/200 - Train Loss: 0.840297 - Val Loss: 0.869872
  Early stopping triggered at epoch 45. Best Val Loss: 0.862305
Loaded best model state from epoch 5 (Val Loss: 0.862305).
Saving final NN0 model state_dict to: ./models/nn_final_model.pth
Model state_dict saved successfully.
Saving final training loss history to: ./results/nn_final_training_loss.

In [None]:
# =============================================================================
# Cell 10: NN (1 Hidden Layer) - Cross-Validation (Optuna)
# =============================================================================
print("\n--- Starting NN (1 Hidden Layer) Cross-Validation (Optuna) ---")
import copy # Ensure copy is imported if running cells independently

# --- Config ---
NUM_HIDDEN_LAYERS_NN1 = 1
N_TRIALS_NN1 = 50 # Adjust as needed
NN1_STUDY_NAME = f"{BASE_STUDY_NAME}_NN_1Layer"
OPTUNA_STORAGE = f"sqlite:///{OPTUNA_DB_PATH}"

# Instantiate NN Model handler
nn1_model = NNModel(input_dim=dh.input_dim, model_dir=MODELS_DIR, results_dir=RESULTS_DIR)
nn1_model.MODEL_NAME = "NN1" # Customize name

# Get base NN search space and customize for 1 layer
nn_search_space_base = hp_config.get_space("NN")
nn1_search_space = copy.deepcopy(nn_search_space_base)
# Keep l0, remove l1
nn1_search_space.pop("n_units_l1", None)
# Add checks for other layer units if they exist
print("NN (1 Layer) Search Space:")
pprint(nn1_search_space)

# Define fixed parameters for the Objective
fixed_params_nn1 = {
    'num_hidden_layers': NUM_HIDDEN_LAYERS_NN1, # Specify architecture
    'pruning_epochs': [10, 25, 50],
    'patience': 15
}
print("\nNN (1 Layer) Fixed HPO Params:")
pprint(fixed_params_nn1)

# Instantiate NN Objective
objective_nn1 = ObjectiveNN(
    data_handler=dh,
    search_space=nn1_search_space,
    fixed_params=fixed_params_nn1
)

# Instantiate Optuna Tuner
tuner_nn1 = HyperparameterTuner(
    study_name=NN1_STUDY_NAME,
    storage_path=OPTUNA_STORAGE,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5)
)

# Run Optuna cross-validation
nn1_best_params = nn1_model.cross_validate(
    dh=dh,
    tuner=tuner_nn1,
    objective_nn=objective_nn1,
    n_trials=N_TRIALS_NN1
)

print("\nBest NN (1 Layer) HPO Params Found:")
pprint(nn1_best_params)
tuner_nn1.save_plots(results_dir=os.path.join(RESULTS_DIR, "optuna_plots_nn1"))

print("\n--- Finished NN (1 Hidden Layer) Cross-Validation (Optuna) ---")

In [None]:
# =============================================================================
# Cell 11: NN (1 Hidden Layer) - Final Training
# =============================================================================
print("\n--- Starting NN (1 Hidden Layer) Final Training ---")

# Instantiate handler again
nn1_model = NNModel(input_dim=dh.input_dim, model_dir=MODELS_DIR, results_dir=RESULTS_DIR)
nn1_model.MODEL_NAME = "NN1" # Match name

# Define final training parameters
final_epochs_nn1 = 200
final_patience_nn1 = 40

# Train the final model
trained_nn1 = nn1_model.train_final_model(
    dh=dh,
    final_epochs=final_epochs_nn1,
    final_patience=final_patience_nn1
)

print("\n--- Finished NN (1 Hidden Layer) Final Training ---")

In [None]:
# =============================================================================
# Cell 12: NN (2 Hidden Layers) - Cross-Validation (Optuna)
# =============================================================================
print("\n--- Starting NN (2 Hidden Layers) Cross-Validation (Optuna) ---")
import copy # Ensure copy is imported

# --- Config ---
NUM_HIDDEN_LAYERS_NN2 = 2
N_TRIALS_NN2 = 50 # Adjust
NN2_STUDY_NAME = f"{BASE_STUDY_NAME}_NN_2Layer"
OPTUNA_STORAGE = f"sqlite:///{OPTUNA_DB_PATH}"

# Instantiate NN Model handler
nn2_model = NNModel(input_dim=dh.input_dim, model_dir=MODELS_DIR, results_dir=RESULTS_DIR)
nn2_model.MODEL_NAME = "NN2"

# Get base NN search space (assume it already includes l0, l1)
nn2_search_space = copy.deepcopy(hp_config.get_space("NN"))
# Remove suggestions for layers > 1 if they exist in base config
# nn2_search_space.pop("n_units_l2", None) ... etc
print("NN (2 Layers) Search Space:")
pprint(nn2_search_space)

# Define fixed parameters
fixed_params_nn2 = {
    'num_hidden_layers': NUM_HIDDEN_LAYERS_NN2, # Specify architecture
    'pruning_epochs': [10, 25, 50],
    'patience': 15
}
print("\nNN (2 Layers) Fixed HPO Params:")
pprint(fixed_params_nn2)

# Instantiate NN Objective
objective_nn2 = ObjectiveNN(
    data_handler=dh,
    search_space=nn2_search_space,
    fixed_params=fixed_params_nn2
)

# Instantiate Optuna Tuner
tuner_nn2 = HyperparameterTuner(
    study_name=NN2_STUDY_NAME,
    storage_path=OPTUNA_STORAGE,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5)
)

# Run Optuna cross-validation
nn2_best_params = nn2_model.cross_validate(
    dh=dh,
    tuner=tuner_nn2,
    objective_nn=objective_nn2,
    n_trials=N_TRIALS_NN2
)

print("\nBest NN (2 Layers) HPO Params Found:")
pprint(nn2_best_params)
tuner_nn2.save_plots(results_dir=os.path.join(RESULTS_DIR, "optuna_plots_nn2"))

print("\n--- Finished NN (2 Hidden Layers) Cross-Validation (Optuna) ---")

In [None]:
# =============================================================================
# Cell 13: NN (2 Hidden Layers) - Final Training
# =============================================================================
print("\n--- Starting NN (2 Hidden Layers) Final Training ---")

# Instantiate handler again
nn2_model = NNModel(input_dim=dh.input_dim, model_dir=MODELS_DIR, results_dir=RESULTS_DIR)
nn2_model.MODEL_NAME = "NN2" # Match name

# Define final training parameters
final_epochs_nn2 = 200
final_patience_nn2 = 40

# Train the final model
trained_nn2 = nn2_model.train_final_model(
    dh=dh,
    final_epochs=final_epochs_nn2,
    final_patience=final_patience_nn2
)

print("\n--- Finished NN (2 Hidden Layers) Final Training ---")

In [None]:
# =============================================================================
# Cell 14: NN (3 Hidden Layers) - Cross-Validation (Optuna)
# =============================================================================
print("\n--- Starting NN (3 Hidden Layers) Cross-Validation (Optuna) ---")
import copy # Ensure copy is imported

# --- Config ---
NUM_HIDDEN_LAYERS_NN3 = 3
N_TRIALS_NN3 = 50 # Adjust
NN3_STUDY_NAME = f"{BASE_STUDY_NAME}_NN_3Layer"
OPTUNA_STORAGE = f"sqlite:///{OPTUNA_DB_PATH}"

# Instantiate NN Model handler
nn3_model = NNModel(input_dim=dh.input_dim, model_dir=MODELS_DIR, results_dir=RESULTS_DIR)
nn3_model.MODEL_NAME = "NN3"

# --- IMPORTANT: Update Search Space ---
# The default space only defines l0, l1. Add l2.
nn_search_space_base = hp_config.get_space("NN")
nn3_search_space = copy.deepcopy(nn_search_space_base)
# Add definition for the 3rd layer (index 2)
nn3_search_space["n_units_l2"] = {"type": "int", "low": 8, "high": 128, "log": True}
# Remove suggestions for layers > 2 if they exist in base config
# nn3_search_space.pop("n_units_l3", None) ... etc
print("NN (3 Layers) Search Space (Added l2):")
pprint(nn3_search_space)

# Define fixed parameters
fixed_params_nn3 = {
    'num_hidden_layers': NUM_HIDDEN_LAYERS_NN3, # Specify architecture
    'pruning_epochs': [10, 25, 50],
    'patience': 15
}
print("\nNN (3 Layers) Fixed HPO Params:")
pprint(fixed_params_nn3)

# Instantiate NN Objective
objective_nn3 = ObjectiveNN(
    data_handler=dh,
    search_space=nn3_search_space,
    fixed_params=fixed_params_nn3
)

# Instantiate Optuna Tuner
tuner_nn3 = HyperparameterTuner(
    study_name=NN3_STUDY_NAME,
    storage_path=OPTUNA_STORAGE,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5)
)

# Run Optuna cross-validation
nn3_best_params = nn3_model.cross_validate(
    dh=dh,
    tuner=tuner_nn3,
    objective_nn=objective_nn3,
    n_trials=N_TRIALS_NN3
)

print("\nBest NN (3 Layers) HPO Params Found:")
pprint(nn3_best_params)
tuner_nn3.save_plots(results_dir=os.path.join(RESULTS_DIR, "optuna_plots_nn3"))

print("\n--- Finished NN (3 Hidden Layers) Cross-Validation (Optuna) ---")

In [None]:
# =============================================================================
# Cell 15: NN (3 Hidden Layers) - Final Training
# =============================================================================
print("\n--- Starting NN (3 Hidden Layers) Final Training ---")

# Instantiate handler again
nn3_model = NNModel(input_dim=dh.input_dim, model_dir=MODELS_DIR, results_dir=RESULTS_DIR)
nn3_model.MODEL_NAME = "NN3" # Match name

# Define final training parameters
final_epochs_nn3 = 200
final_patience_nn3 = 40

# Train the final model
trained_nn3 = nn3_model.train_final_model(
    dh=dh,
    final_epochs=final_epochs_nn3,
    final_patience=final_patience_nn3
)

print("\n--- Finished NN (3 Hidden Layers) Final Training ---")

In [None]:
# =============================================================================
# Cell 16: Load Trained Models
# =============================================================================
print("\n--- Loading Trained Models ---")

# Store loaded model handlers in a dictionary for easier access
loaded_models = {}

# --- Load Ridge ---
try:
    ridge_loader = RidgeModel(model_dir=MODELS_DIR, results_dir=RESULTS_DIR)
    ridge_loader.load_model()
    loaded_models['Ridge'] = ridge_loader
    print("Ridge model loaded.")
except Exception as e:
    print(f"Error loading Ridge model: {e}")

# --- Load XGBoost ---
try:
    xgb_loader = XGBoostModel(model_dir=MODELS_DIR, results_dir=RESULTS_DIR)
    # Best params are loaded automatically by train_final_model if needed,
    # but load_model just loads the saved XGB artifact.
    # Ensure best params file exists if predict needs scaling factors based on HPO settings later.
    # Currently predict for XGBoost does not need HPO params.
    xgb_loader.load_model()
    loaded_models['XGBoost'] = xgb_loader
    print("XGBoost model loaded.")
except Exception as e:
    print(f"Error loading XGBoost model: {e}")

# --- Load NN Models ---
nn_model_names = ["NN0", "NN1", "NN2", "NN3"]
for name in nn_model_names:
    try:
        print(f"Loading {name} model...")
        # Need input_dim from DataHandler
        nn_loader = NNModel(input_dim=dh.input_dim, model_dir=MODELS_DIR, results_dir=RESULTS_DIR)
        # IMPORTANT: Set the correct MODEL_NAME before loading so it finds the right files
        nn_loader.MODEL_NAME = name
        # This will trigger internal loading of best params to reconstruct architecture
        nn_loader.load_model()
        loaded_models[name] = nn_loader
        print(f"{name} model loaded.")
    except Exception as e:
        print(f"Error loading {name} model: {e}")

print("\n--- Finished Loading Models ---")
# Display loaded model keys
print(f"Models loaded: {list(loaded_models.keys())}")

In [None]:
# =============================================================================
# Cell 17: Generate Predictions
# =============================================================================
print("\n--- Generating Predictions for All Models ---")

# Dictionary to store prediction DataFrames
predictions = {}
SAVE_INDIVIDUAL_PREDS = True # Set to True to save county-level predictions

# Iterate through the loaded model handlers
for model_name, model_handler in loaded_models.items():
    print(f"Generating predictions for {model_name}...")
    try:
        # Pass dh and optionally results_dir if saving
        preds_df = model_handler.predict(dh, save=SAVE_INDIVIDUAL_PREDS, results_dir=RESULTS_DIR)
        predictions[model_name] = preds_df
        print(f"  Predictions generated for {model_name} ({preds_df.shape[0]} counties).")
    except Exception as e:
        print(f"  Error generating predictions for {model_name}: {e}")

print("\n--- Finished Generating Predictions ---")
# Display keys of generated predictions
print(f"Predictions available for: {list(predictions.keys())}")
# Optional: Display head of one prediction DataFrame
# if 'Ridge' in predictions:
#     print("\nRidge Predictions Head:")
#     print(predictions['Ridge'].head())

In [None]:
# =============================================================================
# Cell 18: Collect Predictions into Dictionary
# =============================================================================
# The `predictions` dictionary created in the previous cell already serves
# as the `pred_dict` needed for the evaluation function.

# We just assign it to the expected variable name for clarity,
# or directly use the `predictions` dictionary.
pred_dict = predictions

print("\n--- Prediction Dictionary Prepared ---")
print(f"Models in prediction dictionary: {list(pred_dict.keys())}")

# Verify structure (optional)
# if pred_dict:
#     first_key = list(pred_dict.keys())[0]
#     print(f"\nExample DataFrame shape for '{first_key}': {pred_dict[first_key].shape}")
#     print(f"Columns: {pred_dict[first_key].columns.tolist()}")

In [None]:
# =============================================================================
# Cell 19: Evaluate All Models
# =============================================================================
print("\n--- Evaluating All Model Predictions ---")

# Define path for the final aggregate evaluation CSV
EVALUATION_SAVE_PATH = os.path.join(RESULTS_DIR, f"aggregate_evaluation_{TEST_YEAR}.csv")

# Check if pred_dict is populated
if not pred_dict:
    print("Prediction dictionary is empty. Cannot evaluate. Please run prediction cell first.")
else:
    try:
        # Call the evaluation function from the metrics module
        evaluation_df = metrics.evaluate_predictions(
            pred_dict=pred_dict,
            dh=dh,
            save_path=EVALUATION_SAVE_PATH # Pass the full path
        )

        print("\nAggregate Evaluation Results:")
        # Display the full dataframe nicely
        with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
            print(evaluation_df)

    except Exception as e:
        print(f"An error occurred during evaluation: {e}")

print("\n--- Finished Evaluation ---")