In [None]:
def _regression_metrics(y_true, y_pred):
    """Compute the four regression metrics reported by ``run_model_cv``.

    Returns a dict with keys "RMSPE", "RMSE", "MAE" and "R2" (in that order).
    ``rmspe`` is a helper defined elsewhere in this notebook.
    """
    return {
        "RMSPE": rmspe(y_true, y_pred),
        # sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6. For the single-output targets
        # used here the two forms give the same value.
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAE": mean_absolute_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred),
    }


def run_model_cv(model, X, y, X_test, model_name="model", n_splits=5):
    """Run shuffled K-fold cross-validation and average test-set predictions.

    For every fold the model is fitted on the training part, then used to
    predict both the held-out part (stored as out-of-fold predictions) and
    ``X_test`` (averaged over all folds). Per-fold and overall out-of-fold
    metrics are printed.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        The same object is refitted on every fold, so after the call it holds
        the fit from the last fold.
    X : object supporting ``.iloc`` and ``.shape`` (e.g. a pandas DataFrame)
        Training features.
    y : object supporting ``.iloc`` and ``.values`` (e.g. a pandas Series)
        Training target, aligned positionally with ``X``.
    X_test : object with ``.shape`` accepted by ``model.predict``
        Features to generate averaged predictions for.
    model_name : str, default "model"
        Label used only in the printed progress messages.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.

    Returns
    -------
    oof_preds : numpy.ndarray of shape (X.shape[0],)
        Out-of-fold prediction for every training row.
    test_preds : numpy.ndarray of shape (X_test.shape[0],)
        Mean of the per-fold predictions on ``X_test``.
    fold_metrics : pandas.DataFrame
        One row per fold with columns fold, RMSPE, RMSE, MAE, R2.
    """
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    oof_preds = np.zeros(X.shape[0])
    test_preds = np.zeros(X_test.shape[0])
    fold_metrics = []

    for fold, (trn_idx, val_idx) in enumerate(kfold.split(X), start=1):
        print(f"\nTraining fold {fold} - {model_name}")
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        val_pred = model.predict(X_val)

        oof_preds[val_idx] = val_pred
        # Each fold contributes an equal share to the final test prediction.
        test_preds += model.predict(X_test) / n_splits

        scores = _regression_metrics(y_val.values, val_pred)
        # "fold" goes first so the resulting DataFrame keeps its column order.
        fold_metrics.append({"fold": fold, **scores})

        print(f"Fold {fold} - RMSPE: {scores['RMSPE']:.6f}, "
              f"RMSE: {scores['RMSE']:.6f}, "
              f"MAE: {scores['MAE']:.6f}, "
              f"R2: {scores['R2']:.4f}")

    # OOF evaluation over all training rows at once.
    overall = _regression_metrics(y.values, oof_preds)
    print(f"\nFinal Evaluation - {model_name}")
    print(f"RMSPE: {overall['RMSPE']:.6f}")
    print(f"RMSE : {overall['RMSE']:.6f}")
    print(f"MAE  : {overall['MAE']:.6f}")
    print(f"R2   : {overall['R2']:.4f}")

    return oof_preds, test_preds, pd.DataFrame(fold_metrics)

In [15]:
# Baseline 1: Random Forest
# Fixed hyperparameters; the seed makes the forest reproducible across runs.
rf_model = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    max_depth=10,
    n_estimators=100,
)

# Cross-validate with the shared helper; keeps OOF predictions, averaged
# test predictions and the per-fold metrics table.
oof_rf, test_rf, rf_metrics = run_model_cv(
    model=rf_model,
    X=X,
    y=y,
    X_test=X_test,
    model_name="RandomForest",
)



Training fold 1 - RandomForest
Fold 1 - RMSPE: 0.265749, RMSE: 0.001122, MAE: 0.000658, R2: 0.8555

Training fold 2 - RandomForest
Fold 2 - RMSPE: 0.268869, RMSE: 0.001105, MAE: 0.000663, R2: 0.8555

Training fold 3 - RandomForest
Fold 3 - RMSPE: 0.272862, RMSE: 0.001106, MAE: 0.000659, R2: 0.8600

Training fold 4 - RandomForest
Fold 4 - RMSPE: 0.261642, RMSE: 0.001104, MAE: 0.000665, R2: 0.8589

Training fold 5 - RandomForest
Fold 5 - RMSPE: 0.263645, RMSE: 0.001130, MAE: 0.000662, R2: 0.8510

Final Evaluation - RandomForest
RMSPE: 0.266583
RMSE : 0.001113
MAE  : 0.000662
R2   : 0.8562


In [16]:
# Baseline 2: XGBoost
# Fixed hyperparameters; row and column subsampling are both set to 0.8 and
# the seed is pinned so results are repeatable.
xgb_model = XGBRegressor(
    tree_method='hist',
    n_jobs=-1,
    random_state=42,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
)

# Same cross-validation routine as the Random Forest baseline, so the
# reported metrics are directly comparable.
oof_xgb, test_xgb, xgb_metrics = run_model_cv(
    model=xgb_model,
    X=X,
    y=y,
    X_test=X_test,
    model_name="XGBoost",
)


Training fold 1 - XGBoost
Fold 1 - RMSPE: 0.258864, RMSE: 0.001064, MAE: 0.000630, R2: 0.8701

Training fold 2 - XGBoost
Fold 2 - RMSPE: 0.259471, RMSE: 0.001037, MAE: 0.000630, R2: 0.8728

Training fold 3 - XGBoost
Fold 3 - RMSPE: 0.264711, RMSE: 0.001046, MAE: 0.000630, R2: 0.8748

Training fold 4 - XGBoost
Fold 4 - RMSPE: 0.252117, RMSE: 0.001042, MAE: 0.000634, R2: 0.8742

Training fold 5 - XGBoost
Fold 5 - RMSPE: 0.254336, RMSE: 0.001075, MAE: 0.000634, R2: 0.8653

Final Evaluation - XGBoost
RMSPE: 0.257937
RMSE : 0.001053
MAE  : 0.000632
R2   : 0.8715
