In [877]:
import time

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor
from sklearn.linear_model import ElasticNet, HuberRegressor, Lasso, LinearRegression, OrthogonalMatchingPursuit, Ridge, RANSACRegressor, TheilSenRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cross_decomposition import PLSRegression

from warnings import filterwarnings
filterwarnings('ignore')

In [878]:
df_train_copy = pd.read_csv('../Datasets/preprocessed_datasets/final_selected_features_data_copy.csv')
df_test = pd.read_csv('../Datasets/preprocessed_datasets/final_selected_features_df_test.csv')

In [879]:
features = [col for col in df_train_copy.columns if col != "SalePrice"]
print(features)

['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', '1stFlrSF', 'GarageFinish', 'Fireplaces', 'OpenPorchSF', 'LotArea', 'CentralAir_Y', 'YearRemodAdd_Age', 'YearBuilt_Age']


In [880]:
# Define hyperparameter grids for each model
param_grids = {
    # "GradientBoostingRegressor": {
    #     'n_estimators': [50, 1000, 3000, 5000],
    #     'learning_rate': [0.05, 0.1, 0.15],
    #     'max_depth': [3, 5, 8],
    #     'loss': ['huber', 'quantile'],
    #     'min_samples_split': [5, 10, 15],
    #     'min_samples_leaf': [5, 10, 15],
    #     'max_features': [None, 'sqrt', 'log2']
    # },
    # "GradientBoostingRegressor": {
    #     'learning_rate': [0.05], 
    #     'loss': ['huber'], 
    #     'max_depth': [4], 
    #     'max_features': ['sqrt'], 
    #     'min_samples_leaf': [15], 
    #     'min_samples_split': [10], 
    #     'n_estimators': [280]
    # },
    # "AdaBoostRegressor": {
    #     'n_estimators': [50, 150, 1000, 3000, 5000],
    #     'learning_rate': [0.05, 0.1, 0.15],
    #     'loss': ['linear', 'square', 'exponential'],
    # },
    # "AdaBoostRegressor": {
    #     'n_estimators': [150],
    #     'learning_rate': [0.05],
    #     'loss': ['exponential'],
    # },
    # "RandomForestRegressor": {
    #     'n_estimators': [50, 1000, 3000, 5000],
    #     'max_depth': [3, 5, 8],
    #     'min_samples_split': [5, 10, 15],
    #     'min_samples_leaf': [5, 10, 15],
    #     'bootstrap': [True, False]
    # },
    # "RandomForestRegressor": {
    #     'n_estimators': [450],
    #     'max_depth': [8],
    #     'min_samples_split': [5],
    #     'min_samples_leaf': [5],
    #     'bootstrap': [True]
    # },
    # "BaggingRegressor": {
    #     'n_estimators': [50, 100, 150, 200],
    #     'max_samples': [0.5, 0.7, 0.9],
    #     'max_features': [0.5, 0.7, 0.9],
    #     'bootstrap': [True, False],
    #     'bootstrap_features': [True, False]
    # },
    # "BaggingRegressor": {
    #     'n_estimators': [300],
    #     'max_samples': [0.7],
    #     'max_features': [0.7],
    #     'bootstrap': [True],
    #     'bootstrap_features': [False]
    # },
    # "XGBRegressor": {
    #     'n_estimators': [150, 200, 250, 300],
    #     'learning_rate': [0.03, 0.05],
    #     'max_depth': [3, 5],
    #     'colsample_bytree': [0.4, 0.45, 0.5],
    #     'gamma': [0.05, 0.1, 0.15],
    #     'min_child_weight': [1, 2, 3],
    #     'reg_alpha': [0.5, 1, 1.5],
    #     'reg_lambda': [0.5, 1, 1.5],
    #     'subsample': [0.5, 0.75, 0.9],
    # },
    # "XGBRegressor": {
    #     'colsample_bytree': 0.5,
    #     'gamma': 0.05,
    #     'learning_rate': 0.03,
    #     'max_depth': 5,
    #     'min_child_weight': 2,
    #     'n_estimators': 300,
    #     'reg_alpha': 0.5,
    #     'reg_lambda': 1,
    #     'subsample': 0.5
    # },
    # "LGBMRegressor": {
    #     'n_estimators': [300, 500, 800, 1000],
    #     'learning_rate': [0.03, 0.05],
    #     'max_depth': [3, 5, -1],  # -1 means no limit
    #     'num_leaves': [3, 5, 8],  # Number of leaves in full trees
    #     'min_child_samples': [5, 11, 20, 50],  # Minimum number of samples per leaf
    #     'subsample': [0.6, 0.8, 1.0],  # Fraction of samples for each tree
    #     'colsample_bytree': [0.6, 0.8, 1.0],  # Fraction of features for each tree
    #     'reg_alpha': [0, 0.1, 1, 10],  # L1 regularization term
    #     'reg_lambda': [0, 0.1, 1, 10],  # L2 regularization term
    #     'min_split_gain': [0, 0.1, 0.5],  # Minimum gain to make a split
    #     'verbose': [0]  # Suppress output
    # }
    # "LGBMRegressor": {
    #     'n_estimators': [730],
    #     'learning_rate': [0.03],
    #     'max_depth': [3],
    #     'num_leaves': [5],
    #     'min_child_samples': [11],
    #     'colsample_bytree': [0.6],
    #     'reg_alpha': [1],
    #     'verbose': [0]
    # },
    # "CatBoostRegressor": {
    #     'iterations': [50, 100, 150, 200, 300, 400],
    #     'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.15, 0.2],
    #     'depth': [3, 5, 8],
    #     'verbose': [False]
    # },
    # "CatBoostRegressor": {
    #     'iterations': [500],
    #     'learning_rate': [0.03],
    #     'depth': [5],
    #     'verbose': [False]
    # },
    # "ElasticNet": {
    #     'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1],
    #     'l1_ratio': [0.1, 0.5, 0.9]
    # },
    # "ElasticNet": {
    #     'alpha': [0.002],
    #     'l1_ratio': [0.9]
    # },
    "HuberRegressor": {
        'alpha': [0.01, 0.1, 1, 10, 100]
    },
    # "Lasso": {
    #     'alpha': [0.01, 0.1, 1, 10, 100]
    # },
    # "LinearRegression": {
    #     # No hyperparameters to tune for linear regression
    # },
    # "OrthogonalMatchingPursuit": {
    #     'n_nonzero_coefs': [None, 5, 10, 20, 30]
    # },
    # "Ridge": {
    #     'alpha': [0.01, 0.1, 1, 10, 100]
    # },
    # "RANSACRegressor": {
    #     'min_samples': [0.1, 0.5, 0.9],
    #     'residual_threshold': [1.0, 2.0, 3.0]
    # },
    # "TheilSenRegressor": {
    #     'max_subpopulation': [1e3, 1e4, 1e5]
    # },
    # "GaussianProcessRegressor": {
    #     'alpha': [1e-2, 1e-3, 1e-10]
    # },
    # "KernelRidge": {
    #     'alpha': [0.01, 0.1, 1, 10, 100],
    #     'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    # },
    # "KNeighborsRegressor": {
    #     'n_neighbors': [3, 5, 7, 9, 11],
    #     'weights': ['uniform', 'distance']
    # },
    # "SVR": {
    #     'C': [0.1, 1, 10, 100],
    #     'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    # },
    # "DecisionTreeRegressor": {
    #     'max_depth': [3, 5, 8, None],
    #     'min_samples_split': [2, 5, 10],
    #     'min_samples_leaf': [1, 2, 4]
    # },
    # "ExtraTreeRegressor": {
    #     'max_depth': [3, 5, 8, None],
    #     'min_samples_split': [2, 5, 10],
    #     'min_samples_leaf': [1, 2, 4]
    # },
    # "MLPRegressor": {
    #     'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    #     'activation': ['relu', 'tanh'],
    #     'solver': ['adam', 'sgd'],
    #     'learning_rate': ['constant', 'invscaling', 'adaptive']
    # },
    # "make_pipeline(PolynomialFeatures(degree=2), LinearRegression)": {
    #     # No hyperparameters to tune for this pipeline
    # },
    # "PLSRegression": {
    #     'n_components': [2, 4, 6, 8, 10]
    # }
}

In [881]:
# Splitting data for cross-validation using KFold splitting
kf = KFold(n_splits=10, shuffle=True, random_state=619)

In [882]:
# Create a table to store results of GridSearchCV
results_columns = ['Model', 'Best Params', 'Best Score', 'Best Model', 'Refit Time']
results_df = pd.DataFrame(columns=results_columns)

In [883]:
# Perform hyperparameter tuning for each model and store results row-wise
total_start_time = time.time()

index = 0

for model_name, param_grid in param_grids.items():

    print(f"Fitting model: {model_name}...")

    # Instantiate model from string name
    if model_name == "make_pipeline(PolynomialFeatures(degree=2), LinearRegression)":
        model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
    else:
        model = eval(model_name)(random_state=619)

    start_time = time.time()
    search = GridSearchCV(model, param_grid, scoring=make_scorer(mean_squared_error, squared=False), cv=kf, verbose=1, n_jobs=-1)
    search.fit(df_train_copy[features], df_train_copy["SalePrice"])
    end_time = time.time()
    
    # Store results
    results_df.loc[index, 'Model'] = model.__class__.__name__
    results_df.loc[index, 'Best Params'] = str(search.best_params_)
    results_df.loc[index, 'Best Model'] = str(search.best_estimator_)
    results_df.loc[index, 'Best Score'] = search.best_score_
    results_df.loc[index, 'Refit Time'] = search.refit_time_
    results_df.loc[index, 'Total Model Grid Search Tuning Time'] = end_time - start_time

    index += 1

total_end_time = time.time()

total_run_time = total_end_time - total_start_time
print('Total time taken by Hyperparameter tuning is {:.2f} minutes.'.format(total_run_time / 60))

# Print the model results in ascending order to get the best performing model
results_df.sort_values('Best Score', ascending=True, inplace=True)
display(results_df)

Fitting model: ElasticNet...
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Total time taken by Hyperparameter tuning is 0.00 minutes.


Unnamed: 0,Model,Best Params,Best Score,Best Model,Refit Time,Total Model Grid Search Tuning Time
0,ElasticNet,"{'alpha': 0.002, 'l1_ratio': 0.9}",0.35462,"ElasticNet(alpha=0.002, l1_ratio=0.9, random_s...",0.003995,0.045829


In [884]:
pd.set_option('display.max_colwidth', None)

results = search.cv_results_
results_df = pd.DataFrame({
    'params': results['params'],
    'mean_test_score': results['mean_test_score'],
    'std_test_score': results['std_test_score']
})
display(results_df.sort_values("mean_test_score"))

pd.reset_option('display.max_colwidth')

Unnamed: 0,params,mean_test_score,std_test_score
0,"{'alpha': 0.002, 'l1_ratio': 0.9}",0.35462,0.035176


In [885]:
# Plotting the results
plt.figure(figsize=(12, 8))
sns.barplot(x='Best Score', y='Model', data=results_df, color="#097969")

plt.title('MACHINE LEARNING ALGORITHM BEST RMSE SCORE After Hyperparameter Tuning: \n')
plt.xlabel('Best RMSE Score')
plt.ylabel('Algorithm')
plt.show()

ValueError: Could not interpret value `Best Score` for `x`. An entry with this name does not appear in `data`.

<Figure size 1200x800 with 0 Axes>