# Imports - RF Final Tuned Model

In [None]:
# Why tune the Random Forest's hyperparameters?
# - Reduces overfitting
# - Improves generalization
# - Scientifically justifies performance
# - Makes your hybrid model stronger
# Optimized using 3-fold cross validation over the 324 parameter combinations
# defined in param_grid (3 x 4 x 3 x 3 x 3),
# so we have a good chance of finding a better model than the baseline

# scoring='neg_mean_squared_error'

# Lower MSE = better

# But sklearn's search always maximizes the score, so the MSE is negated

# So a value closer to 0 (less negative) is better

# Example:

# -60 → worse

# -40 → better

# That number is what selected your best parameters


# 3-fold CV does this:

# Split data into 3 equal parts:

# Fold 1 → Train on Fold 2+3, Validate on Fold 1
# Fold 2 → Train on Fold 1+3, Validate on Fold 2
# Fold 3 → Train on Fold 1+2, Validate on Fold 3

# So the model trains and validates 3 times.

# Then:

# Average validation error is computed.

# That average = CV score.


# How did you avoid overfitting?

# You say:

# We used 3-fold cross validation during hyperparameter tuning, so every
# candidate was scored on a fold it was not trained on.

In [1]:
# Standard Library
from pathlib import Path

# Core Libraries
import numpy as np
import pandas as pd
import joblib

# Model
from sklearn.ensemble import RandomForestRegressor

# Evaluation Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Load Preprocessed Data

In [2]:
# Directory that holds the preprocessed train / validation / test splits.
# Defined once so the notebook can be pointed at another location by editing
# a single line instead of six.
DATA_DIR = Path(r"D:\Python\SmartSignalAI\data")

# NOTE: joblib.load unpickles its input, which can execute arbitrary code.
# Only load files that come from a trusted source.
X_train = joblib.load(DATA_DIR / "X_train.pkl")
y_train = joblib.load(DATA_DIR / "y_train.pkl")

X_val   = joblib.load(DATA_DIR / "X_val.pkl")
y_val   = joblib.load(DATA_DIR / "y_val.pkl")

X_test  = joblib.load(DATA_DIR / "X_test.pkl")
y_test  = joblib.load(DATA_DIR / "y_test.pkl")

print("Preprocessed data loaded successfully!")

Preprocessed data loaded successfully!


# Hyperparameter Grid Definition

In [None]:
# Search space for the Random Forest tuning run.
# Every combination of the values below is one candidate for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],           # how many trees the forest grows
    max_depth=[None, 10, 20, 30],          # depth limit of each tree
    min_samples_split=[2, 5, 10],          # fewest samples a node needs before it may split
    min_samples_leaf=[1, 2, 4],            # fewest samples allowed in a leaf node
    max_features=['sqrt', 'log2', None],   # features considered at each split
)

# Grid Search with Cross-Validation

In [None]:
# Base estimator handed to the search; random_state is fixed so repeated runs
# build the same forests.
rf = RandomForestRegressor(random_state=42)

# Combine train + validation for GridSearch: the search cross-validates
# internally, so the separate validation split is used as extra training data.
# NOTE(review): np.hstack only appends samples when the targets are 1-D -
# confirm y_train / y_val are 1-D.
X_train_val = np.vstack((X_train, X_val))
y_train_val = np.hstack((y_train, y_val))

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # negated MSE: the search maximizes it, so closer to 0 is better
    cv=3,  # 3-fold cross-validation
    n_jobs=-1,
    verbose=2
)

# Fit the grid search
grid_search.fit(X_train_val, y_train_val)

# A candidate whose fit fails is scored NaN and silently drops out of the
# ranking. Report how many candidates were lost so an invalid value in
# param_grid does not go unnoticed.
cv_mean_scores = np.asarray(grid_search.cv_results_["mean_test_score"], dtype=float)
n_candidates = cv_mean_scores.size
n_failed = int(np.isnan(cv_mean_scores).sum())
if n_failed:
    print(f"WARNING: {n_failed} of {n_candidates} candidates had failed fits (NaN CV score); check param_grid.")

# Best parameters & best estimator
print("Best Hyperparameters:", grid_search.best_params_)
print("Best CV Score (neg MSE):", grid_search.best_score_)
# Same score expressed as an RMSE, i.e. in the units of the target.
print("Best CV RMSE:", np.sqrt(-grid_search.best_score_))

# Final tuned RF: the best candidate, as returned by the search.
rf_model = grid_search.best_estimator_

# Report the real size of the search instead of hard-coding a count that goes
# stale whenever param_grid changes.
print(f"Selected the best of {n_candidates} candidates using {grid_search.n_splits_}-fold cross validation")

Fitting 3 folds for each of 216 candidates, totalling 648 fits


324 fits failed out of a total of 648.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
263 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 833, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1329, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "C:\Users\user\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 492, in _validate_params
    validate_parameter_constraints(
    ~~~~~~~~~~~~~

Best Hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


# Test Set Evaluation

In [5]:
# Predict once on the test split, then score those predictions.
y_test_pred = rf_model.predict(X_test)

mae_test = mean_absolute_error(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(test_mse)  # square root puts the error back in the target's units
r2_test = r2_score(y_test, y_test_pred)

# Print every metric rounded to three decimals, one per line.
print("Random Forest - Test Metrics")
for metric_label, metric_value in (
    ("MAE :", mae_test),
    ("RMSE:", rmse_test),
    ("R²  :", r2_test),
):
    print(metric_label, round(metric_value, 3))

Random Forest - Test Metrics
MAE : 4.05
RMSE: 6.33
R²  : 0.684


# Save the Best Random Forest Model

In [None]:
# Write the tuned Random Forest to disk with joblib.
model_path = r"D:\Python\SmartSignalAI\data\rf_final_model.pkl"
joblib.dump(rf_model, model_path)

print("Random Forest model saved successfully!")

Random Forest model saved successfully!
