In [2]:
import os
import pandas as pd
import numpy as np
import time
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error, explained_variance_score


In [3]:

# Load dataset
# The workbook location can be overridden with the NPV_DATA_PATH environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original local path is used unchanged.
DATA_PATH = os.environ.get("NPV_DATA_PATH", r'C:\Users\Asus\Downloads\voting\NPV.xlsx')
df = pd.read_excel(DATA_PATH)

In [4]:
# Features are every column except the last; the target is the last column.
# Only the first 100 rows of the sheet are used.
X, y = df.iloc[:100, :-1], df.iloc[:100, -1]

# Hold out 20% of those rows for testing, with a fixed seed for the shuffle
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

In [5]:

# Output directory for everything this notebook writes; the seed is part of
# the name, so runs with different seeds land in different folders.
save_folder = f"voting_regressor_results_tree_boost{random_state}"
os.makedirs(name=save_folder, exist_ok=True)  # no error if it already exists


In [6]:
# Define models
# Each estimator receives the same seed as the train/test split. Without it the
# tree-based learners draw fresh random numbers on every run, so the selected
# hyper-parameters, the saved models and the reported metrics would not be
# reproducible after a kernel restart.
models = {
    "DecisionTree": DecisionTreeRegressor(random_state=random_state),
    "RandomForest": RandomForestRegressor(random_state=random_state),
    "AdaBoost": AdaBoostRegressor(random_state=random_state),
    "GradientBoosting": GradientBoostingRegressor(random_state=random_state),
    "XGBoost": XGBRegressor(random_state=random_state)
}

# Hyper-parameter grids, keyed by the same names as `models`.
# Candidate counts: DecisionTree 27, RandomForest 81, AdaBoost 9,
# GradientBoosting 27, XGBoost 27.
param_grids = {
    "DecisionTree": {"max_depth": [3, 5, 10], "min_samples_split": [2, 5, 7], "min_samples_leaf": [1, 2, 4]},
    "RandomForest": {"n_estimators": [10, 50, 100], "max_depth": [3, 5, 10], "min_samples_split": [2, 5, 7], "min_samples_leaf": [1, 2, 4]},
    "AdaBoost": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 1]},
    "GradientBoosting": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.2], "max_depth": [3, 5, 10]},
    "XGBoost": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.2], "max_depth": [3, 5, 10]}
}

In [7]:
# Result containers, each keyed by model name
best_models, all_cv_results, tuning_times = {}, {}, {}

# Exhaustive grid search per model: 5-fold cross-validation ranked by R^2
for name, model in models.items():
    print(f"Tuning {name}...")

    # The timer brackets the search only; the CSV writes below are excluded
    start_time = time.time()
    gcv = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        cv=5,
        scoring='r2',
        verbose=2,
    )
    gcv.fit(X_train, y_train)
    end_time = time.time()

    # Keep the refitted winner and the full table of per-candidate CV scores
    best_models[name] = gcv.best_estimator_
    all_cv_results[name] = pd.DataFrame(gcv.cv_results_)

    # Write the winning parameters and the CV table next to each other on disk
    best_params_path = os.path.join(save_folder, f"best_params_{name}.csv")
    cv_results_path = os.path.join(save_folder, f"cv_results_{name}.csv")
    pd.DataFrame([gcv.best_params_]).to_csv(best_params_path, index=False)
    all_cv_results[name].to_csv(cv_results_path, index=False)

    # Elapsed seconds for this model's search
    tuning_times[name] = end_time - start_time

Tuning DecisionTree...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=7; total time=   0.0s
[CV] END max_

In [8]:


# Create a voting regressor from the tuned base models.
# This cell stores its result back into `best_models`, so on a second run the
# dict already holds a "VotingRegressor" entry. That entry is skipped here;
# otherwise the previous ensemble would be nested inside the new one.
base_estimators = [
    (name, estimator)
    for name, estimator in best_models.items()
    if name != "VotingRegressor"
]
voting_regressor = VotingRegressor(estimators=base_estimators)
voting_regressor.fit(X_train, y_train)
best_models["VotingRegressor"] = voting_regressor


In [9]:


# Write the grid-search durations, one "<model>: <seconds> seconds" line each
timing_path = os.path.join(save_folder, "tuning_times.txt")
with open(timing_path, "w") as file:
    file.writelines(
        f"{model}: {time_taken:.2f} seconds\n"
        for model, time_taken in tuning_times.items()
    )

# Serialize every entry of best_models to its own .pkl file
for name, model in best_models.items():
    model_path = os.path.join(save_folder, f"best_model_{name}.pkl")
    joblib.dump(model, model_path)

In [14]:
def adjusted_r2(r2, n, p):
    """Return R^2 adjusted for the number of predictors.

    Computes ``1 - (1 - r2) * (n - 1) / (n - p - 1)``.

    Parameters:
        r2: plain coefficient of determination.
        n: number of samples the score was computed on.
        p: number of predictors (features).

    Raises:
        ZeroDivisionError: when ``n - p - 1`` is zero.
    """
    unexplained = 1 - r2
    total_dof = n - 1
    residual_dof = n - p - 1
    return 1 - unexplained * total_dof / residual_dof

def _split_metrics(prefix, y_true, y_pred, n_features):
    """Return the six regression metrics for one data split.

    Keys are "<prefix> R2", "<prefix> Adj R2", "<prefix> RMSE", "<prefix> MSE",
    "<prefix> MAE" and "<prefix> MAPE", in that order. The adjusted R^2 uses
    len(y_true) as the sample count and n_features as the predictor count.
    """
    r2 = r2_score(y_true, y_pred)
    return {
        f"{prefix} R2": r2,
        f"{prefix} Adj R2": adjusted_r2(r2, len(y_true), n_features),
        f"{prefix} RMSE": root_mean_squared_error(y_true, y_pred),
        f"{prefix} MSE": mean_squared_error(y_true, y_pred),
        f"{prefix} MAE": mean_absolute_error(y_true, y_pred),
        f"{prefix} MAPE": mean_absolute_percentage_error(y_true, y_pred),
    }


# p (number of feature columns) feeds the adjusted R^2; n is not used below
n, p = X.shape


# Evaluate models
# Predictions are stored per model. The export loop further down must write each
# model's own predictions; reusing the loop variables of this loop would leave
# every workbook with the predictions of whichever model was evaluated last.
metrics = {}
y_train_preds = {}
y_test_preds = {}
y_whole_pred = {}
for name, model in best_models.items():
    y_train_preds[name] = model.predict(X_train)
    y_test_preds[name] = model.predict(X_test)
    # "Whole" means all rows of X, i.e. the training rows and the test rows together
    y_whole_pred[name] = model.predict(X)

    metrics[name] = {
        **_split_metrics("Train", y_train, y_train_preds[name], p),
        **_split_metrics("Test", y_test, y_test_preds[name], p),
        **_split_metrics("Whole", y, y_whole_pred[name], p),
    }

    # Save individual model results
    pd.DataFrame([metrics[name]]).to_csv(os.path.join(save_folder, f"metrics_{name}.csv"), index=False)


for name in best_models:
    # Define the Excel file path for each model
    excel_path = os.path.join(save_folder, f"{name}_predictions.xlsx")

    with pd.ExcelWriter(excel_path) as writer:
        # Save y_train and this model's train predictions in one sheet
        pd.DataFrame({"y_train": y_train, "y_train_pred": y_train_preds[name]}).to_excel(writer, sheet_name="Train Predictions", index=False)

        # Save y_test and this model's test predictions in another sheet
        pd.DataFrame({"y_test": y_test, "y_test_pred": y_test_preds[name]}).to_excel(writer, sheet_name="Test Predictions", index=False)

        # Save y_whole and this model's whole-data predictions in another sheet
        pd.DataFrame({"y_whole": y, "y_whole_pred": y_whole_pred[name]}).to_excel(writer, sheet_name="Whole Predictions", index=False)

print("Predictions saved successfully for all models!")
# Save all metrics to a single CSV (rows are metrics, columns are models)
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv(os.path.join(save_folder, "all_models_metrics.csv"), index=True)

print("All models evaluated and results saved successfully.")

Predictions saved successfully for all models!
All models evaluated and results saved successfully.


In [15]:
# Show the combined metrics table: one row per metric, one column per model
metrics_df

Unnamed: 0,DecisionTree,RandomForest,AdaBoost,GradientBoosting,XGBoost,VotingRegressor
Train R2,0.898839,0.974407,0.95963,0.998997,0.999773,0.984311
Train Adj R2,0.890525,0.972304,0.956312,0.998915,0.999754,0.983021
Train RMSE,1.113409,0.560028,0.703358,0.110849,0.052762,0.438478
Train MSE,1.23968,0.313631,0.494712,0.012287,0.002784,0.192263
Train MAE,0.850809,0.407655,0.581422,0.088493,0.038489,0.343682
Train MAPE,0.569118,0.256517,0.385879,0.069642,0.03004,0.22071
Test R2,0.783972,0.881827,0.871454,0.884963,0.886472,0.880636
Test Adj R2,0.684267,0.827285,0.812125,0.831868,0.834074,0.825545
Test RMSE,1.679279,1.242016,1.295379,1.225427,1.217362,1.248257
Test MSE,2.819979,1.542604,1.678007,1.501671,1.481969,1.558146
