# AutoML-Fire: Two-Stage Baseline (Notebook-first)

This notebook implements a two-stage AutoML approach for fire prediction:
1. **Stage 1**: Binary classification to predict fire occurrence (y > 0)
2. **Stage 2**: Regression to predict fire intensity on fire days only

The approach uses time-aware cross-validation to prevent data leakage and includes comprehensive feature engineering with rolling statistics and lag features.


In [None]:
from dataclasses import dataclass
from typing import List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

@dataclass
class Config:
    """Configuration for the AutoML two-stage baseline.

    Sequence-valued options default to tuples (immutable, so they are safe as
    dataclass defaults) and are annotated accordingly.
    """
    # Data paths
    data_csv_path: str = "data/combined_top30_dataset.csv"  # Placeholder - update with your path

    # Column specifications
    date_col: str = "date"
    target_col: str = "fire_count"
    # Spatial grouping keys; the second entry is used as the grid identifier.
    group_cols: Tuple[str, str] = ("cluster", "grid_id")
    # Static covariates; any that are absent from the data only trigger a warning.
    static_cols: Tuple[str, ...] = ("elevation", "slope", "aspect", "landcover")
    # Optional columns to drop from the feature matrix.
    drop_cols: Tuple[str, ...] = ("lat", "lon")

    # Cross-validation
    n_splits: int = 5
    seed: int = 42

    # Hyperparameter optimization
    use_optuna: bool = True
    n_trials: int = 40

# Initialize configuration
CFG = Config()
print(f"Configuration loaded: {CFG.n_splits} CV splits, Optuna: {CFG.use_optuna}")


In [None]:
# Imports with safe fallbacks
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score, 
    mean_absolute_error, mean_squared_error, r2_score
)

# Try LightGBM first, fall back to scikit-learn's HistGradientBoosting.
# LIGHTGBM_AVAILABLE tells the later cells which estimator family to build.
try:
    import lightgbm as lgb
    from lightgbm import LGBMClassifier, LGBMRegressor
except ImportError:
    LIGHTGBM_AVAILABLE = False
    print("⚠ LightGBM not available, using HistGradientBoosting")
else:
    LIGHTGBM_AVAILABLE = True
    print("✓ LightGBM available")

if not LIGHTGBM_AVAILABLE:
    from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor

# Try Optuna for hyperparameter optimization; without it the tuning cell is
# skipped and the default parameters are used.
try:
    import optuna
except ImportError:
    OPTUNA_AVAILABLE = False
    print("⚠ Optuna not available, using default parameters")
else:
    OPTUNA_AVAILABLE = True
    print("✓ Optuna available")

print(f"Available models: LightGBM={LIGHTGBM_AVAILABLE}, Optuna={OPTUNA_AVAILABLE}")


In [None]:
# Load and prepare data
print("Loading data...")
df = pd.read_csv(CFG.data_csv_path)

# Validate the schema before touching any column, so that a missing
# date/target/group column surfaces as the descriptive ValueError below
# rather than as a bare KeyError raised while parsing the dates.
required_cols = [CFG.date_col, CFG.target_col] + list(CFG.group_cols)
missing_required = [col for col in required_cols if col not in df.columns]
if missing_required:
    raise ValueError(f"Missing required columns: {missing_required}")

# Parse the date column and order rows chronologically; the lag/rolling
# features and the time-series CV in the later cells rely on this ordering.
df[CFG.date_col] = pd.to_datetime(df[CFG.date_col])
df = df.sort_values(CFG.date_col).reset_index(drop=True)

print(f"Data shape: {df.shape}")
print(f"Date range: {df[CFG.date_col].min()} to {df[CFG.date_col].max()}")

# Report only the columns that actually contain missing values
print("\nMissing values per column:")
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0])

# Static columns are optional: warn, do not fail
missing_static = [col for col in CFG.static_cols if col not in df.columns]
if missing_static:
    print(f"⚠ Warning: Missing static columns: {missing_static}")

print(f"\nTarget distribution:")
print(df[CFG.target_col].describe())
is_fire_day = df[CFG.target_col] > 0
print(f"Fire days (y > 0): {is_fire_day.sum()} / {len(df)} ({100*is_fire_day.mean():.1f}%)")


In [None]:
# Feature Engineering
print("Creating features...")

# Stage 1 target: 1 when at least one fire was recorded, else 0
df['y_cls'] = (df[CFG.target_col] > 0).astype(int)
print(f"Classification target: {df['y_cls'].mean():.3f} positive rate")

# Stage 2 target: log1p of the count (inverted with expm1 at prediction time)
df['y_reg'] = np.log1p(df[CFG.target_col])
print(f"Regression target (log1p): mean={df['y_reg'].mean():.3f}, std={df['y_reg'].std():.3f}")

# Calendar features
dates = df[CFG.date_col]
df['month'] = dates.dt.month
df['day_of_year'] = dates.dt.dayofyear

# Cyclical encoding for seasonality.
# The day-of-year angle is normalised by the real length of each year so that
# day 366 of a leap year stays inside one full period instead of wrapping past it.
days_in_year = np.where(dates.dt.is_leap_year, 366, 365)
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
df['day_sin'] = np.sin(2 * np.pi * df['day_of_year'] / days_in_year)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_year'] / days_in_year)

print("✓ Calendar features created")


In [None]:
# Create rolling statistics and lag features (prevent leakage)
print("Creating temporal features...")

# Key meteorological variables for rolling stats
meteo_vars = ['humidity', 'windspeed', 'rain', 'soil_moisture', 'tmin', 'tmax', 'ndvi', 'cloudcover']
available_meteo = [var for var in meteo_vars if var in df.columns]
print(f"Available meteorological variables: {available_meteo}")

# Rolling windows (past-only to prevent leakage)
rolling_windows = [3, 7, 14, 30]
lag_days = [1, 2, 3, 7]
rolling_stats = ('mean', 'std', 'max', 'min')

# Group by spatial units to prevent cross-contamination
group_cols = [col for col in CFG.group_cols if col in df.columns]
print(f"Grouping by: {group_cols}")

# groupby(...).rolling(...) prepends one index level per group key; these are
# dropped again so the result aligns with df's own row index on assignment.
group_levels = list(range(len(group_cols)))

for var in available_meteo:
    by_group = df.groupby(group_cols)[var]

    # Previous observation within each spatial unit (rows are date-sorted at
    # load time), so the current day never enters its own window.
    past = df[group_cols].copy()
    past[var] = by_group.shift(1)

    # Re-group the shifted values before rolling. A plain `.rolling()` on the
    # shifted Series would slide over the whole date-sorted frame and blend
    # values coming from different grids into one window.
    past_by_group = past.groupby(group_cols)[var]

    # Rolling statistics (past-only, per spatial unit)
    for window in rolling_windows:
        rolled = past_by_group.rolling(window, min_periods=1)
        for stat in rolling_stats:
            values = getattr(rolled, stat)()
            df[f'{var}_roll{window}_{stat}'] = values.reset_index(level=group_levels, drop=True)

    # Lag features
    for lag in lag_days:
        df[f'{var}_lag{lag}'] = by_group.shift(lag)

# Target lag features (strictly past values of the target within each unit)
target_by_group = df.groupby(group_cols)[CFG.target_col]
for lag in lag_days:
    df[f'{CFG.target_col}_lag{lag}'] = target_by_group.shift(lag)

print("✓ Temporal features created (rolling stats and lags)")
print(f"Total features created: {len([col for col in df.columns if any(x in col for x in ['_roll', '_lag'])])}")


In [None]:
# Optional fire danger indices (lightweight approximations)
print("Creating fire danger indices...")

# Simple FWI-like components, built only when all three inputs are present
if 'tmax' in df.columns and 'humidity' in df.columns and 'windspeed' in df.columns:
    # Temperature component: excess of tmax over a base of 20, floored at 0
    df['temp_component'] = np.maximum(0, df['tmax'] - 20)  # Base temperature

    # Humidity component (inverse relationship), floored at 0
    # NOTE(review): assumes humidity is expressed in percent - confirm
    df['humidity_component'] = np.maximum(0, 100 - df['humidity'])

    # Wind component
    df['wind_component'] = df['windspeed'] ** 2

    # Simple fire danger index: product of the three components, rescaled
    df['fire_danger_index'] = (df['temp_component'] * df['humidity_component'] * df['wind_component']) / 1000
    print("✓ Fire danger index created")
else:
    print("⚠ Insufficient variables for fire danger index")

# KBDI-like soil moisture proxy
if 'soil_moisture' in df.columns and 'rain' in df.columns:
    # Simple drought index (inverse of soil moisture, reduced by rain), floored at 0
    df['drought_index'] = np.maximum(0, 100 - df['soil_moisture'] - df['rain'])
    print("✓ Drought index created")
else:
    print("⚠ Insufficient variables for drought index")


In [None]:
# Prepare feature matrix and targets
print("Preparing feature matrix...")

# Define feature columns (exclude targets, dates, and group IDs)
exclude_cols = [CFG.date_col, CFG.target_col, 'y_cls', 'y_reg'] + list(CFG.group_cols)
if CFG.drop_cols:
    exclude_cols.extend([col for col in CFG.drop_cols if col in df.columns])

feature_cols = [col for col in df.columns if col not in exclude_cols]
print(f"Feature columns: {len(feature_cols)}")

# Create feature matrix
X = df[feature_cols].copy()
y_cls = df['y_cls'].copy()
y_reg = df['y_reg'].copy()

# Handle missing values in features.
# numeric_only=True keeps the median computation from raising when a
# non-numeric column is among the features; such columns are left untouched.
# NOTE: the medians are taken over the full dataset, so validation folds
# contribute to the fill values used for their own training folds.
print(f"Missing values in features: {X.isnull().sum().sum()}")
X = X.fillna(X.median(numeric_only=True))  # Simple imputation for now

print(f"Final feature matrix shape: {X.shape}")
print(f"Classification target distribution: {y_cls.value_counts().to_dict()}")
print(f"Regression target stats: mean={y_reg.mean():.3f}, std={y_reg.std():.3f}")


In [None]:
# Time-aware cross-validation setup
print("Setting up time-aware cross-validation...")

# Create time series split: each fold trains on earlier rows and validates on
# the rows that follow, with no gap between the two.
tscv = TimeSeriesSplit(n_splits=CFG.n_splits, gap=0)
print(f"Time series CV: {CFG.n_splits} splits")

# Store CV results
cv_results_cls = []
cv_results_reg = []
# Out-of-fold predictions, one slot per row. Rows that never fall into a
# validation fold keep the initial 0.
oof_probs_cls = np.zeros(len(X))
oof_preds_reg = np.zeros(len(X))

print("✓ Cross-validation setup complete")


In [None]:
# Stage 1: Binary Classification
print("Training Stage 1: Binary Classification...")

# Define classifier with safe fallbacks
if LIGHTGBM_AVAILABLE:
    clf = LGBMClassifier(
        objective='binary',
        class_weight='balanced',
        random_state=CFG.seed,
        verbose=-1,
        n_estimators=100
    )
    print("Using LightGBM Classifier")
else:
    clf = HistGradientBoostingClassifier(
        random_state=CFG.seed,
        max_iter=100
    )
    print("Using HistGradientBoosting Classifier")

# Marks the rows that actually received an out-of-fold prediction. The
# earliest rows are only ever used for training, so their slot in
# oof_probs_cls is still the initial placeholder and must not be scored.
oof_scored_cls = np.zeros(len(X), dtype=bool)

# Cross-validation for classification
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    print(f"Fold {fold + 1}/{CFG.n_splits}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_cls.iloc[train_idx], y_cls.iloc[val_idx]

    # Train classifier
    clf.fit(X_train, y_train)

    # Predict probabilities
    y_pred_proba = clf.predict_proba(X_val)[:, 1]
    oof_probs_cls[val_idx] = y_pred_proba
    oof_scored_cls[val_idx] = True

    # Calculate metrics
    y_pred_binary = (y_pred_proba > 0.5).astype(int)

    # ROC AUC is undefined (and roc_auc_score raises) when the validation fold
    # holds a single class; report NaN for that fold instead of aborting.
    if y_val.nunique() > 1:
        auc = roc_auc_score(y_val, y_pred_proba)
    else:
        auc = float('nan')
    ap = average_precision_score(y_val, y_pred_proba)
    f1 = f1_score(y_val, y_pred_binary)

    cv_results_cls.append({
        'fold': fold + 1,
        'auc': auc,
        'ap': ap,
        'f1': f1
    })

    print(f"  AUC: {auc:.3f}, AP: {ap:.3f}, F1: {f1:.3f}")

# Overall classification metrics, restricted to rows with a real
# out-of-fold prediction
y_cls_scored = y_cls.to_numpy()[oof_scored_cls]
probs_scored = oof_probs_cls[oof_scored_cls]

overall_auc = roc_auc_score(y_cls_scored, probs_scored)
overall_ap = average_precision_score(y_cls_scored, probs_scored)
overall_f1 = f1_score(y_cls_scored, (probs_scored > 0.5).astype(int))

print(f"\nOverall Classification Results:")
print(f"AUC: {overall_auc:.3f}")
print(f"Average Precision: {overall_ap:.3f}")
print(f"F1 Score: {overall_f1:.3f}")


In [None]:
# Stage 2: Regression (on fire days only)
print("Training Stage 2: Regression...")

# Define regressor with safe fallbacks
if LIGHTGBM_AVAILABLE:
    reg = LGBMRegressor(
        objective='tweedie',  # Good for count data
        random_state=CFG.seed,
        verbose=-1,
        n_estimators=100
    )
    print("Using LightGBM Regressor (Tweedie)")
else:
    reg = HistGradientBoostingRegressor(
        random_state=CFG.seed,
        max_iter=100
    )
    print("Using HistGradientBoosting Regressor")

# Marks the rows that received an out-of-fold regression prediction. Rows that
# were never validated (or whose fold had no fire days to train on) still hold
# the initial placeholder in oof_preds_reg and must not be scored.
oof_scored_reg = np.zeros(len(X), dtype=bool)

# Cross-validation for regression (only on positive samples)
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    print(f"Fold {fold + 1}/{CFG.n_splits}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_reg.iloc[train_idx], y_reg.iloc[val_idx]

    # Only train on positive samples (fire days)
    fire_mask_train = (y_cls.iloc[train_idx] == 1).to_numpy()
    fire_mask_val = (y_cls.iloc[val_idx] == 1).to_numpy()

    if fire_mask_train.sum() == 0:
        print(f"  No fire days in training fold")
        continue

    X_train_fire = X_train[fire_mask_train]
    y_train_fire = y_train[fire_mask_train]

    # Train regressor
    reg.fit(X_train_fire, y_train_fire)

    # Predict on all validation samples (the combined two-stage estimate
    # needs an intensity for every row, not just for fire days)
    y_pred_reg = reg.predict(X_val)
    oof_preds_reg[val_idx] = y_pred_reg
    oof_scored_reg[val_idx] = True

    # Calculate metrics on fire days only
    if fire_mask_val.sum() == 0:
        print(f"  No fire days in validation fold")
        continue

    y_val_fire = y_val[fire_mask_val]
    y_pred_fire = y_pred_reg[fire_mask_val]

    mae = mean_absolute_error(y_val_fire, y_pred_fire)
    rmse = np.sqrt(mean_squared_error(y_val_fire, y_pred_fire))
    r2 = r2_score(y_val_fire, y_pred_fire)

    cv_results_reg.append({
        'fold': fold + 1,
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'n_fire_days': fire_mask_val.sum()
    })

    print(f"  MAE: {mae:.3f}, RMSE: {rmse:.3f}, R²: {r2:.3f} (n_fire={fire_mask_val.sum()})")

# Overall regression metrics (on fire days only), restricted to rows with a
# real out-of-fold prediction
fire_mask = y_cls == 1
eval_mask = fire_mask.to_numpy() & oof_scored_reg
if eval_mask.sum() > 0:
    y_reg_scored = y_reg.to_numpy()[eval_mask]
    preds_scored = oof_preds_reg[eval_mask]

    overall_mae = mean_absolute_error(y_reg_scored, preds_scored)
    overall_rmse = np.sqrt(mean_squared_error(y_reg_scored, preds_scored))
    overall_r2 = r2_score(y_reg_scored, preds_scored)

    print(f"\nOverall Regression Results (fire days only):")
    print(f"MAE: {overall_mae:.3f}")
    print(f"RMSE: {overall_rmse:.3f}")
    print(f"R²: {overall_r2:.3f}")
    print(f"Fire days evaluated: {eval_mask.sum()}")
else:
    print("No fire days found for regression evaluation")


In [None]:
# Optuna Hyperparameter Optimization (Optional)
if CFG.use_optuna and OPTUNA_AVAILABLE:
    print("Running Optuna hyperparameter optimization...")

    # Settings that are not tuned. They are shared by every trial model and by
    # the final tuned classifier so the two cannot drift apart.
    if LIGHTGBM_AVAILABLE:
        fixed_params_cls = {
            # Same objective / class weighting as the baseline Stage 1 classifier
            'objective': 'binary',
            'class_weight': 'balanced',
            # LightGBM only applies `subsample` when bagging is enabled through
            # a non-zero bagging frequency; without it the tuned value is ignored.
            'subsample_freq': 1,
            'random_state': CFG.seed,
            'verbose': -1
        }
    else:
        fixed_params_cls = {'random_state': CFG.seed}

    def build_classifier(tuned_params):
        """Return an unfitted classifier from tuned plus fixed parameters."""
        if LIGHTGBM_AVAILABLE:
            return LGBMClassifier(**tuned_params, **fixed_params_cls)
        return HistGradientBoostingClassifier(**tuned_params, **fixed_params_cls)

    # Simple validation on one fold (the first split), computed once instead
    # of on every trial
    tune_train_idx, tune_val_idx = next(iter(tscv.split(X)))
    X_tune_train, X_tune_val = X.iloc[tune_train_idx], X.iloc[tune_val_idx]
    y_tune_train, y_tune_val = y_cls.iloc[tune_train_idx], y_cls.iloc[tune_val_idx]

    def objective_cls(trial):
        """Return the validation ROC AUC of one sampled configuration."""
        if LIGHTGBM_AVAILABLE:
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 200),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'num_leaves': trial.suggest_int('num_leaves', 10, 100),
                'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
            }
        else:
            params = {
                'max_iter': trial.suggest_int('max_iter', 50, 200),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 50),
            }

        model = build_classifier(params)
        model.fit(X_tune_train, y_tune_train)
        y_pred_proba = model.predict_proba(X_tune_val)[:, 1]
        return roc_auc_score(y_tune_val, y_pred_proba)

    # Run optimization with a seeded sampler so the search is reproducible
    study_cls = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=CFG.seed)
    )
    study_cls.optimize(objective_cls, n_trials=CFG.n_trials, show_progress_bar=True)

    print(f"Best classification params: {study_cls.best_params}")
    print(f"Best classification AUC: {study_cls.best_value:.3f}")

    # Update classifier with best params (plus the fixed ones, including the seed)
    clf = build_classifier(study_cls.best_params)

else:
    print("Skipping Optuna optimization (disabled or not available)")


In [None]:
# Final Training on Full Data
print("Training final models on full dataset...")

# Train final classifier on all data
clf.fit(X, y_cls)
print("✓ Final classifier trained")

# Train final regressor on fire days only, matching how it was cross-validated
fire_mask = y_cls == 1
if fire_mask.sum() > 0:
    X_fire = X[fire_mask]
    y_fire = y_reg[fire_mask]
    reg.fit(X_fire, y_fire)
    print(f"✓ Final regressor trained on {fire_mask.sum()} fire days")
else:
    # The regressor is not refit here; `reg` keeps whatever state the
    # cross-validation loop left it in.
    print("⚠ No fire days found for regressor training")

# Store trained models in memory (the inference helper reads these globals)
trained_classifier = clf
trained_regressor = reg
print("Models saved to memory variables: trained_classifier, trained_regressor")


In [None]:
# Evaluation Summary
print("=== EVALUATION SUMMARY ===")

# Classification results
print("\n📊 CLASSIFICATION RESULTS:")
print(f"Overall AUC: {overall_auc:.3f}")
print(f"Overall Average Precision: {overall_ap:.3f}")
print(f"Overall F1 Score: {overall_f1:.3f}")

print("\nPer-fold Classification Results:")
for result in cv_results_cls:
    print(f"  Fold {result['fold']}: AUC={result['auc']:.3f}, AP={result['ap']:.3f}, F1={result['f1']:.3f}")

# Regression results
if cv_results_reg:
    print(f"\n📊 REGRESSION RESULTS (fire days only):")
    print(f"Overall MAE: {overall_mae:.3f}")
    print(f"Overall RMSE: {overall_rmse:.3f}")
    print(f"Overall R²: {overall_r2:.3f}")

    print("\nPer-fold Regression Results:")
    for result in cv_results_reg:
        print(f"  Fold {result['fold']}: MAE={result['mae']:.3f}, RMSE={result['rmse']:.3f}, R²={result['r2']:.3f} (n_fire={result['n_fire_days']})")

# Two-stage combined predictions
print(f"\n🔗 TWO-STAGE COMBINED PREDICTIONS:")

# Only rows that fell into some validation fold carry a real out-of-fold
# prediction; the earliest rows are train-only and still hold the initial 0.
# Including them would score placeholder zeros as if they were forecasts.
oof_rows = np.zeros(len(X), dtype=bool)
for _, fold_val_idx in tscv.split(X):
    oof_rows[fold_val_idx] = True
print(f"Rows with out-of-fold predictions: {oof_rows.sum()} / {len(oof_rows)}")

# p_fire * expm1(y_reg) = expected count (expm1 inverts the log1p target transform)
expected_counts = (oof_probs_cls * np.expm1(oof_preds_reg))[oof_rows]
print(f"Expected count range: {expected_counts.min():.3f} to {expected_counts.max():.3f}")
print(f"Mean expected count: {expected_counts.mean():.3f}")

# Compare with actual (same rows)
actual_counts = df[CFG.target_col].values[oof_rows]
print(f"Actual count range: {actual_counts.min():.3f} to {actual_counts.max():.3f}")
print(f"Mean actual count: {actual_counts.mean():.3f}")

# Overall MAE on expected counts
overall_mae_combined = mean_absolute_error(actual_counts, expected_counts)
print(f"Combined MAE (expected vs actual): {overall_mae_combined:.3f}")


In [None]:
# Inference Helper Function
def predict_next_day(df_recent_window_per_grid, trained_clf=None, trained_reg=None):
    """
    Predict the expected fire count for each row of an engineered feature frame.

    The frame must already contain the engineered feature columns used at
    training time; this helper does not build lag/rolling features itself.

    Args:
        df_recent_window_per_grid: DataFrame with recent data for each grid
        trained_clf: Trained classifier (uses global if None)
        trained_reg: Trained regressor (uses global if None)

    Returns:
        dict: {grid_id: expected_fire_count} for each grid. When a grid appears
        in several rows, the value of its last row wins. If the grid id column
        is absent, keys are row positions.

    Raises:
        ValueError: if the classifier exposes its training feature names and
        some of them are missing from the input frame.
    """
    if trained_clf is None:
        trained_clf = trained_classifier
    if trained_reg is None:
        trained_reg = trained_regressor

    frame = df_recent_window_per_grid

    # Ensure we have the same feature columns as training. Prefer the names
    # recorded on the fitted model, which also fixes the column order.
    trained_features = getattr(trained_clf, "feature_names_in_", None)
    if trained_features is not None:
        missing = [col for col in trained_features if col not in frame.columns]
        if missing:
            raise ValueError(f"Input is missing features used in training: {missing}")
        feature_cols = list(trained_features)
    else:
        # Fallback: apply the same exclusions as the training feature matrix,
        # including the optional drop columns.
        excluded = {CFG.date_col, CFG.target_col, 'y_cls', 'y_reg',
                    *CFG.group_cols, *(CFG.drop_cols or ())}
        feature_cols = [col for col in frame.columns if col not in excluded]

    # Handle missing values.
    # NOTE: medians come from the supplied window, not from the training data.
    X_pred = frame[feature_cols]
    X_pred = X_pred.fillna(X_pred.median(numeric_only=True))

    # Stage 1: Predict fire probability
    p_fire = trained_clf.predict_proba(X_pred)[:, 1]

    # Stage 2: Predict fire intensity (log1p scale)
    y_reg_pred = trained_reg.predict(X_pred)

    # Combine: expected count = p_fire * expm1(y_reg_pred)
    expected_counts = p_fire * np.expm1(y_reg_pred)

    # Return as dictionary keyed by the grid identifier (second group column)
    grid_col = CFG.group_cols[1]
    grid_ids = frame[grid_col] if grid_col in frame.columns else range(len(expected_counts))
    return dict(zip(grid_ids, expected_counts))

# Example usage (commented out)
# recent_data = df.tail(100)  # Last 100 days
# predictions = predict_next_day(recent_data)
# print(f"Example predictions: {list(predictions.items())[:5]}")

print("✓ Inference helper function created: predict_next_day()")
print("Usage: predictions = predict_next_day(your_recent_data_df)")
