## Hyperparameter tuning comparison using GridSearch vs Optuna

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import time
import optuna
from IPython.display import display, HTML

In [2]:
# Load Boston Housing dataset from external source
df = pd.read_csv(
    filepath_or_buffer="http://lib.stat.cmu.edu/datasets/boston",
    delim_whitespace=True,
    skiprows=21,
    header=None,
)

columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

# Flatten all the values into a single long list and remove the nulls
values_w_nulls = df.values.flatten()
all_values = values_w_nulls[~np.isnan(values_w_nulls)]

# Reshape the values to have 14 columns and make a new df out of them
df = pd.DataFrame(
    data=all_values.reshape(-1, len(columns)),
    columns=columns,
)
df1 = df
boston = df

  df = pd.read_csv(


In [3]:
# Split data into features and target
X = boston.drop(columns=['MEDV'])
y = boston['MEDV']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Define models and parameter grids
models = {
    'RandomForest': (RandomForestRegressor(), {
        'n_estimators': [50, 100],
        'max_depth': [3, 5, 10]
    }),
    'GradientBoosting': (GradientBoostingRegressor(), {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5]
    }),
    'XGBoost': (XGBRegressor(objective='reg:squarederror'), {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5]
    }),
    'SVR': (SVR(), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    })
}

In [5]:
# Perform GridSearch with cross-validation for each model and store only the best result
results = []
total_start_time = time.time()

best_model_overall = None
best_mse_overall = float('inf')
best_model_name = None
best_params_overall = None

for model_name, (model, param_grid) in models.items():
    # print(f"Running GridSearchCV for {model_name}...")
    start_time = time.time()
    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    end_time = time.time()

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Calculate MSE on the test set
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    # Store the best model if it has the lowest MSE
    if mse < best_mse_overall:
        best_mse_overall = mse
        best_model_overall = best_model
        best_model_name = model_name
        best_params_overall = best_params

# Store the total runtime for GridSearch
total_end_time = time.time()
gridsearch_total_time = total_end_time - total_start_time

# Store the best GridSearch result
result = {
    'Dataset': 'Original',
    'Method': 'GridSearch',
    'Model': best_model_name,
    'Best Parameters': best_params_overall,
    'Total Runtime (s)': gridsearch_total_time
}

results.append(result)

print(result)

  _data = np.array(data, dtype=dtype, copy=copy,


{'Dataset': 'Original', 'Method': 'GridSearch', 'Model': 'GradientBoosting', 'Best Parameters': {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}, 'Total Runtime (s)': 84.89184999465942}


## Optuna Trial

In [6]:
# Define the objective function for Optuna
def objective(trial):
    model_name = trial.suggest_categorical('model', ['RandomForest', 'GradientBoosting', 'XGBoost', 'SVR'])
    
    if model_name == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
    elif model_name == 'GradientBoosting':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.2)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    elif model_name == 'XGBoost':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.2)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        model = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, objective='reg:squarederror')
    elif model_name == 'SVR':
        C = trial.suggest_float('C', 0.1, 10)
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
        model = SVR(C=C, kernel=kernel)
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse

In [7]:
# Run Optuna optimization
study_start_time = time.time()
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
study_end_time = time.time()

# Adding summary of the Optuna results
optuna_best_params = study.best_trial.params
optuna_best_value = study.best_trial.value
optuna_total_time = study_end_time - study_start_time

result = {
    'Dataset': 'Original',
    'Method': 'Optuna',
    'Model': study.best_trial.params.get('model'),
    'Best Parameters': optuna_best_params,
    'Total Runtime (s)': optuna_total_time
}
results.append(result)

print(result)

[I 2024-11-18 07:10:55,202] A new study created in memory with name: no-name-f9291360-86ef-476e-bfdb-9795f3a1048a
[I 2024-11-18 07:11:14,276] Trial 0 finished with value: 29.270383556564468 and parameters: {'model': 'SVR', 'C': 9.186658585371141, 'kernel': 'linear'}. Best is trial 0 with value: 29.270383556564468.
[I 2024-11-18 07:11:14,499] Trial 1 finished with value: 8.120490299958972 and parameters: {'model': 'XGBoost', 'n_estimators': 116, 'learning_rate': 0.1462678481400589, 'max_depth': 9}. Best is trial 1 with value: 8.120490299958972.
[I 2024-11-18 07:11:15,046] Trial 2 finished with value: 7.125223118012655 and parameters: {'model': 'GradientBoosting', 'n_estimators': 78, 'learning_rate': 0.19522132308761395, 'max_depth': 4}. Best is trial 2 with value: 7.125223118012655.
[I 2024-11-18 07:11:15,209] Trial 3 finished with value: 8.978960835071891 and parameters: {'model': 'RandomForest', 'n_estimators': 52, 'max_depth': 9}. Best is trial 2 with value: 7.125223118012655.
[I 202

{'Dataset': 'Original', 'Method': 'Optuna', 'Model': 'GradientBoosting', 'Best Parameters': {'model': 'GradientBoosting', 'n_estimators': 83, 'learning_rate': 0.1737313090963122, 'max_depth': 6}, 'Total Runtime (s)': 36.54901647567749}


### Using derived features

#### GridSearch

In [8]:
# Create new columns by multiplying each column with every other column

for i in range(len(X.columns)):
    for j in range(i, len(X.columns)):
        col1 = X.columns[i]
        col2 = X.columns[j]
        new_col_name = f"{col1}_{col2}"
        df1[new_col_name] = X[col1] * X[col2]

# Display the updated DataFrame
#print(df_extended)

X_der=df1.drop('MEDV', axis=1) #Remove the target column X
y_der=df1['MEDV']
len(X_der)
X_der.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,...,TAX_TAX,TAX_PTRATIO,TAX_B,TAX_LSTAT,PTRATIO_PTRATIO,PTRATIO_B,PTRATIO_LSTAT,B_B,B_LSTAT,LSTAT_LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,...,87616.0,4528.8,117482.4,1474.08,234.09,6072.57,76.194,157529.61,1976.562,24.8004
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,...,58564.0,4307.6,96049.8,2211.88,316.84,7064.82,162.692,157529.61,3627.666,83.5396
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,...,58564.0,4307.6,95064.86,975.26,316.84,6992.374,71.734,154315.4089,1583.1049,16.2409
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,...,49284.0,4151.4,87607.86,652.68,349.69,7379.581,54.978,155732.8369,1160.2122,8.6436
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,...,49284.0,4151.4,88111.8,1183.26,349.69,7422.03,99.671,157529.61,2115.477,28.4089


In [9]:
X_train_der, X_test_der, y_train_der, y_test_der = train_test_split(X_der, y_der, test_size=0.2, random_state=42)

In [10]:
# Define models and parameter grids
models = {
    'RandomForest': (RandomForestRegressor(), {
        'n_estimators': [50, 100],
        'max_depth': [3, 5, 10]
    }),
    'GradientBoosting': (GradientBoostingRegressor(), {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5]
    }),
    'XGBoost': (XGBRegressor(objective='reg:squarederror'), {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5]
    }),
    'SVR': (SVR(), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    })
}

In [11]:
# Perform GridSearch with cross-validation for each model and store only the best result
results_mod = []
total_start_time = time.time()

best_model_overall = None
best_mse_overall = float('inf')
best_model_name = None
best_params_overall = None

for model_name, (model, param_grid) in models.items():
    # print(f"Running GridSearchCV for {model_name}...")
    start_time = time.time()
    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_der, y_train_der)
    end_time = time.time()

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Calculate MSE on the test set
    y_pred_der = best_model.predict(X_test_der)
    mse = mean_squared_error(y_test_der, y_pred_der)

    # Store the best model if it has the lowest MSE
    if mse < best_mse_overall:
        best_mse_overall = mse
        best_model_overall = best_model
        best_model_name = model_name
        best_params_overall = best_params

# Store the total runtime for GridSearch
total_end_time = time.time()
gridsearch_total_time = total_end_time - total_start_time

# Store the best GridSearch result
result = {
    'Dataset': 'Modified',
    'Method': 'GridSearch',
    'Model': best_model_name,
    'Best Parameters': best_params_overall,
    'Total Runtime (s)': gridsearch_total_time
}
results_mod.append(result)

print(result)

{'Dataset': 'Modified', 'Method': 'GridSearch', 'Model': 'XGBoost', 'Best Parameters': {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}, 'Total Runtime (s)': 3195.9918394088745}


#### Optuna

In [12]:
# Define the objective function for Optuna
def objective(trial):
    model_name = trial.suggest_categorical('model', ['RandomForest', 'GradientBoosting', 'XGBoost', 'SVR'])
    
    if model_name == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
    elif model_name == 'GradientBoosting':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.2)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    elif model_name == 'XGBoost':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.2)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        model = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, objective='reg:squarederror')
    elif model_name == 'SVR':
        C = trial.suggest_float('C', 0.1, 10)
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
        model = SVR(C=C, kernel=kernel)
    
    model.fit(X_train_der, y_train_der)
    y_pred_der = model.predict(X_test_der)
    mse = mean_squared_error(y_test_der, y_pred_der)
    return mse


In [13]:
# Run Optuna optimization
study_start_time = time.time()
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
study_end_time = time.time()

# Adding summary of the Optuna results
optuna_best_params = study.best_trial.params
optuna_best_value = study.best_trial.value
optuna_total_time = study_end_time - study_start_time

result = {
    'Dataset': 'Modified',
    'Method': 'Optuna',
    'Model': study.best_trial.params.get('model'),
    'Best Parameters': optuna_best_params,
    'Total Runtime (s)': optuna_total_time
}
results_mod.append(result)

print(result)

[I 2024-11-18 08:04:47,862] A new study created in memory with name: no-name-6bd2bc17-2a71-4327-896e-9c19f3f754a9
[I 2024-11-18 08:04:50,209] Trial 0 finished with value: 8.840921382022634 and parameters: {'model': 'GradientBoosting', 'n_estimators': 85, 'learning_rate': 0.12397779867883464, 'max_depth': 8}. Best is trial 0 with value: 8.840921382022634.
[I 2024-11-18 08:04:53,339] Trial 1 finished with value: 8.160631144037334 and parameters: {'model': 'GradientBoosting', 'n_estimators': 101, 'learning_rate': 0.050527551034137835, 'max_depth': 10}. Best is trial 1 with value: 8.160631144037334.
[I 2024-11-18 08:04:54,376] Trial 2 finished with value: 7.444770467778865 and parameters: {'model': 'XGBoost', 'n_estimators': 193, 'learning_rate': 0.11558562059171962, 'max_depth': 5}. Best is trial 2 with value: 7.444770467778865.
[I 2024-11-18 08:04:56,083] Trial 3 finished with value: 6.345706866929741 and parameters: {'model': 'XGBoost', 'n_estimators': 160, 'learning_rate': 0.1490061185

{'Dataset': 'Modified', 'Method': 'Optuna', 'Model': 'XGBoost', 'Best Parameters': {'model': 'XGBoost', 'n_estimators': 176, 'learning_rate': 0.18228264119027715, 'max_depth': 10}, 'Total Runtime (s)': 1156.8010449409485}


In [14]:
# # Create a summary dataframe and display
# summary = pd.concat(results_orig, results_mod)

# summary_df = pd.DataFrame(results)
# html_table = summary_df.to_html()

# display(HTML(html_table))

In [17]:
# Create a summary dataframe by concatenating the original and modified results
results_df = pd.DataFrame(results)
results_mod_df = pd.DataFrame(results_mod)

summary = pd.concat([results_df, results_mod_df], ignore_index=True)

# Convert to an HTML table and display
html_table = summary.to_html()
display(HTML(html_table))

Unnamed: 0,Dataset,Method,Model,Best Parameters,Total Runtime (s)
0,Original,GridSearch,GradientBoosting,"{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}",84.89185
1,Original,Optuna,GradientBoosting,"{'model': 'GradientBoosting', 'n_estimators': 83, 'learning_rate': 0.1737313090963122, 'max_depth': 6}",36.549016
2,Modified,GridSearch,XGBoost,"{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}",3195.991839
3,Modified,Optuna,XGBoost,"{'model': 'XGBoost', 'n_estimators': 176, 'learning_rate': 0.18228264119027715, 'max_depth': 10}",1156.801045
