# The Upgrade: Automated Model Discovery 🤖
Here's what we're changing:

Hypermodel Creation: We'll replace the static build_model function with a build_hypermodel function. This new function defines a search space for the model's properties (like the number of neurons or layers).

KerasTuner Integration: In the main training logic, we'll set up a KerasTuner RandomSearch object. This "tuner" will build and test multiple models based on the search space we defined.

Automated Search: The tuner will run a search over the defined space, trying a fixed number of randomly sampled hyperparameter combinations and keeping the one that gives the lowest validation loss.

Feature Engineering Suggestion: I'll also add a new, optional section for feature engineering. This is often the single most effective way to boost performance, as it provides the model with more meaningful information.

In [6]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
# --- RESTORED: Added back all metric functions ---
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
import matplotlib.pyplot as plt
import openpyxl
import os

# --- Configuration ---
# Path of the input CSV. It is a relative path (one directory up), so it is
# resolved against the working directory the script/notebook is run from.
DATA_FILE = '../regression_data.csv'

# --- Feature Engineering (same as before) ---
def engineer_features(df):
    """Return a copy of ``df`` with derived year-difference features added.

    Both features need ``incident_date_year``; when it is absent the copy is
    returned unchanged.

    * ``dam_age_at_incident`` = incident year - ``year_completed``
      (added only when ``year_completed`` is present).
    * ``time_since_modification`` = incident year - ``year_modified``
      (added only when ``year_modified`` is present). Missing modification
      years fall back to ``year_completed`` when that column exists.

    Values that cannot be parsed as numbers become NaN, and so do the
    features derived from them. The input frame is never modified.
    """
    df_copy = df.copy()
    if 'incident_date_year' not in df_copy.columns:
        return df_copy

    # Convert each source column once and reuse it for both features.
    incident_year = pd.to_numeric(df_copy['incident_date_year'], errors='coerce')

    year_completed = None
    if 'year_completed' in df_copy.columns:
        year_completed = pd.to_numeric(df_copy['year_completed'], errors='coerce')
        df_copy['dam_age_at_incident'] = incident_year - year_completed

    if 'year_modified' in df_copy.columns:
        year_modified = pd.to_numeric(df_copy['year_modified'], errors='coerce')
        # The fallback is optional: without 'year_completed' the gaps simply
        # stay NaN instead of raising KeyError.
        if year_completed is not None:
            year_modified = year_modified.fillna(year_completed)
        df_copy['time_since_modification'] = incident_year - year_modified

    return df_copy

# Raw predictor columns. The main script adds the engineered features to this
# list and keeps only the names that actually exist in the loaded data.
INPUT_COLUMNS = [
    'state',
    'downstream_hazard_potential',
    'owner_type',
    'dam_type',
    'primary_purpose_s',
    'eap',
    'year_completed',
    'latitude',
    'longitude',
    'year_modified',
    'incident_date_year',
]

# Columns modelled as regression targets, one model per column.
NEW_TARGET_COLUMNS = [
    'dam_height',
    'max_storage_ac_ft',
    'surface_area_acres',
    'incident_date_month',
    'incident_date_day',
    'incident_time_hour',
    'number_of_people_evacuated',
    'number_of_habitable_structures_evacuated',
    'number_of_habitable_structures_flooded',
    'volume_released_at_failure_ac_ft',
    'incident_duration',
]

# --- Main Processing Function ---
def _rebuild_feature_frame(values, columns, source, numeric_cols, categorical_cols):
    """Wrap a transformed array in a DataFrame whose dtypes LightGBM accepts.

    When numeric and non-numeric columns are stacked together the transformed
    array has ``object`` dtype, so every column must be cast back explicitly.
    """
    frame = pd.DataFrame(values, columns=columns, index=source.index)
    numeric = set(numeric_cols)
    categorical = set(categorical_cols)
    for col in columns:
        if col in numeric:
            # Scaled values are floats even if the source column was integer.
            frame[col] = frame[col].astype(float)
        elif col in categorical:
            frame[col] = frame[col].astype('category')
        else:
            # Any other passthrough column (e.g. bool) gets its original dtype back.
            frame[col] = frame[col].astype(source[col].dtype)
    return frame


def train_and_evaluate_lgbm(X, y, target_name, metrics_list):
    """Train a LightGBM regressor for one target and record its test performance.

    Parameters:
        X: DataFrame of predictor columns. It is copied, never modified.
        y: Series holding the target values, aligned with ``X``.
        target_name: Target label used in log messages, metric rows and
            output file names.
        metrics_list: List that receives one dict of metrics (appended in place).

    Side effects:
        Writes ``plot_<target_name>.svg`` (actual vs. predicted scatter) and
        ``report_<target_name>.xlsx`` (test inputs with actual and predicted
        values) to the current working directory.

    Returns:
        None.
    """
    print(f"--- Processing target: {target_name} with LightGBM ---")

    # Work on a private copy so the caller's frame (often a slice of a larger
    # frame) is not altered and pandas does not warn about chained assignment.
    X = X.copy()

    # --- Preprocessing for LightGBM ---
    categorical_features_names = list(X.select_dtypes(include=['object', 'category']).columns)
    numerical_features_names = list(X.select_dtypes(include=np.number).columns)

    # Convert object columns to category dtype for LightGBM
    for col in categorical_features_names:
        X[col] = X[col].astype('category')

    # Create a preprocessor to scale only numerical data
    preprocessor = ColumnTransformer(
        [('num', StandardScaler(), numerical_features_names)],
        remainder='passthrough'
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # ColumnTransformer emits the scaled columns first, then every remaining
    # column in its original order - not only the categorical ones.
    numeric_lookup = set(numerical_features_names)
    passthrough_cols = [col for col in X.columns if col not in numeric_lookup]
    processed_cols = numerical_features_names + passthrough_cols

    X_train_df = _rebuild_feature_frame(
        X_train_processed, processed_cols, X_train,
        numerical_features_names, categorical_features_names,
    )
    X_test_df = _rebuild_feature_frame(
        X_test_processed, processed_cols, X_test,
        numerical_features_names, categorical_features_names,
    )

    # --- Model Training ---
    print("Training LightGBM model...")
    model = lgb.LGBMRegressor(random_state=42, n_estimators=200, learning_rate=0.05, num_leaves=31)
    model.fit(X_train_df, y_train, categorical_feature='auto')
    print("Training complete.")

    # --- Evaluation & Reporting ---
    y_pred = model.predict(X_test_df)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    evs = explained_variance_score(y_test, y_pred)

    n = X_test_df.shape[0]  # Number of samples
    p = X_test_df.shape[1]  # Number of predictors
    # Adjusted R2 is undefined unless there are more samples than predictors + 1.
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1) if n - p - 1 > 0 else np.nan

    # MAPE divides by the actual value, so rows with a zero target are excluded.
    actual = np.asarray(y_test)
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(np.abs((actual[nonzero] - y_pred[nonzero]) / actual[nonzero])) * 100
    else:
        mape = np.nan

    metrics_list.append({
        'Model Output': target_name, 'Task Type': 'Regression (LGBM)', 'MAE': mae, 'MSE': mse,
        'R2_Score': r2, 'Adjusted_R2_Score': adj_r2, 'Explained_Variance_Score': evs, 'MAPE (%)': mape
    })
    print(f"✅ Performance metrics for '{target_name}' collected.")

    # --- Plotting and Detailed Report ---
    plt.figure(figsize=(8, 8))
    plt.scatter(y_test, y_pred, alpha=0.5)
    # Dashed diagonal marks perfect predictions (predicted == actual).
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--r', linewidth=2)
    plt.title(f'Actual vs. Predicted for {target_name} (LightGBM)')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')

    plot_filename = f'plot_{target_name}.svg'
    plt.tight_layout()
    plt.savefig(plot_filename, format='svg')
    plt.close()
    print(f"Plot saved as {plot_filename}")

    # The report keeps the unscaled test inputs so the rows stay readable.
    report_filename = f'report_{target_name}.xlsx'
    results_df = X_test.copy()
    results_df['actual_outcome'] = y_test
    results_df['predicted_outcome'] = y_pred
    results_df.to_excel(report_filename, sheet_name='Test_Inputs_and_Predictions', index=False)
    print(f"Detailed report saved as {report_filename}\n" + "-"*40 + "\n")


# --- Main Execution ---
# Script entry point: load the data, add engineered features, then train and
# evaluate one LightGBM model per target column and save the collected metrics.
if __name__ == "__main__":
    try:
        df_raw = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"Error: The data file '{DATA_FILE}' was not found.")
        # exit() is an interactive helper injected by the site module and
        # reports success; SystemExit is always available and signals failure.
        raise SystemExit(1)

    print("Applying feature engineering...")
    df = engineer_features(df_raw)
    engineered_features = [col for col in df.columns if col not in df_raw.columns]
    print(f"New features created: {engineered_features}")
    # dict.fromkeys removes duplicates while keeping a stable order; a set
    # would reorder the features differently from one run to the next.
    current_inputs = list(dict.fromkeys(INPUT_COLUMNS + engineered_features))

    model_metrics_data = []

    for target in NEW_TARGET_COLUMNS:
        if target not in df.columns:
            print(f"Warning: Target '{target}' is not in the data. Skipping.")
            continue
        if not pd.api.types.is_numeric_dtype(df[target]):
            print(f"Warning: Target '{target}' is not numeric. Skipping.")
            continue

        final_inputs = [col for col in current_inputs if col in df.columns and col != target]
        # Only rows with every input and the target present are usable.
        temp_df = df[final_inputs + [target]].dropna()

        if len(temp_df) < 50:
            print(f"Warning: Too little data for '{target}' after dropping NaNs ({len(temp_df)} rows). Skipping.")
            continue

        # Pass an independent copy so downstream dtype changes cannot touch temp_df.
        X_filtered = temp_df[final_inputs].copy()
        y = temp_df[target]
        train_and_evaluate_lgbm(X_filtered, y, target, model_metrics_data)

    if model_metrics_data:
        metrics_df = pd.DataFrame(model_metrics_data)
        metrics_filename = 'model_performance_metrics_lgbm.xlsx'
        metrics_df.to_excel(metrics_filename, index=False)
        print(f"\n✅ All LightGBM performance metrics saved to '{metrics_filename}'.")

    print("\nAll tasks complete.")

Applying feature engineering...
New features created: ['dam_age_at_incident', 'time_since_modification']
--- Processing target: dam_height with LightGBM ---
Training LightGBM model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000318 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 942
[LightGBM] [Info] Number of data points in the train set: 1075, number of used features: 12
[LightGBM] [Info] Start training from score 31.770233
Training complete.
✅ Performance metrics for 'dam_height' collected.
Plot saved as plot_dam_height.svg
Detailed report saved as report_dam_height.xlsx
----------------------------------------

--- Processing target: max_storage_ac_ft with LightGBM ---
Training LightGBM model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000354 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c

In [4]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.5 MB ? eta -:--:--
   -------------- ------------------------- 0.5/1.5 MB 578.7 kB/s eta 0:00:02
   -------------- ------------------------- 0.5/1.5 MB 578.7 kB/s eta 0:00:02
   --------------------- ------------------ 0.8/1.5 MB 578.7 kB/s eta 0:00:02
   --------------------- ------------------ 0.8/1.5 MB 578.7 kB/s eta 0:00:02
   ---------------------------- ----------- 1.0/1.5 MB 606.3 kB/s eta 0:00:01
   ---------------------------- ----------- 1.0/1.5 MB 606.3 kB/s eta 0:00:01
   ------------------------------------ --- 1.3/1.5 MB 610.0 kB/s eta 0:00:01
   ---------------------------