In [6]:

import optuna
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Load the Boston House Prices dataset
boston = load_boston()

# Create feature and target arrays
X = boston.data
y = boston.target

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

# KFold cross-validation
kf = KFold(n_splits=3)

def objective_xgboost(trial):
    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1)
    max_depth = trial.suggest_int("max_depth", 5, 20)
    n_estimators = trial.suggest_int("n_estimators", 100, 200)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 6)
    
    model = XGBRegressor(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators, 
                          min_child_weight=min_child_weight, random_state=42)
    
    model.fit(X_train, y_train)
    
    # Compute the R2 score on the training set
    r2_train = r2_score(y_train, model.predict(X_train))
    
    # Compute the R2 score on the test set
    r2_test = r2_score(y_test, model.predict(X_test))
    
    # Compute cross-validated R2
    r2_cross_val = cross_val_score(model, X_train, y_train, cv=kf, scoring="r2").mean()
    
    # Set additional attributes to trial
    trial.set_user_attr("r2_train", r2_train)
    trial.set_user_attr("r2_test", r2_test)
    trial.set_user_attr("r2_cross_val", r2_cross_val)
    
    return r2_cross_val


def objective_lasso(trial):
    # Suggest hyperparameters
    alpha = trial.suggest_float("alpha", 0.0005, 1.0)
    
    model = Pipeline(steps=[('Scaler', StandardScaler()), ('Lasso', Lasso(alpha=alpha))])
    
    model.fit(X_train, y_train)
    
    # Compute the R2 score on the training set
    r2_train = r2_score(y_train, model.predict(X_train))
    
    # Compute the R2 score on the test set
    r2_test = r2_score(y_test, model.predict(X_test))
    
    # Compute cross-validated R2
    r2_cross_val = cross_val_score(model, X_train, y_train, cv=kf, scoring="r2").mean()
    
    # Set additional attributes to trial
    trial.set_user_attr("r2_train", r2_train)
    trial.set_user_attr("r2_test", r2_test)
    trial.set_user_attr("r2_cross_val", r2_cross_val)
    
    return r2_cross_val


save_dic= {}

modellabels= ['xgboost', 'lasso']


for modellabel in modellabels:
    if modellabel =='xgboost':
        objective= objective_xgboost
    elif modellabel=='lasso':
        objective= objective_lasso

    
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=10)

    # Print the result
    best_trials = study.get_trials(deepcopy=False, states=[optuna.trial.TrialState.COMPLETE])[:5]

    # Collecting the results to a dataframe
    results_df = pd.DataFrame()
    for i, trial in enumerate(best_trials):
        results_df.loc[i, 'Trial'] = trial.number
        results_df.loc[i, 'R2_Train'] = trial.user_attrs['r2_train']
        results_df.loc[i, 'R2_Test'] = trial.user_attrs['r2_test']
        results_df.loc[i, 'R2_Cross_Val'] = trial.user_attrs['r2_cross_val']
        results_df.loc[i, 'Hyperparameters'] = str(trial.params)
        
        
    savedic[modellabel]= results
    
    

    

    


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

NameError: name 'results' is not defined

In [4]:
results_df

Unnamed: 0,Trial,R2_Train,R2_Test,R2_Cross_Val,Hyperparameters
0,0.0,0.999584,0.910244,0.830351,"{'learning_rate': 0.07936530887273445, 'max_de..."
1,1.0,0.935351,0.814825,0.797208,"{'learning_rate': 0.01899459170010953, 'max_de..."
2,2.0,0.999387,0.9075,0.828603,"{'learning_rate': 0.05070068778199874, 'max_de..."
3,3.0,0.999968,0.912586,0.835432,"{'learning_rate': 0.0900094357902949, 'max_dep..."
4,4.0,0.988134,0.876857,0.826545,"{'learning_rate': 0.025679418442013872, 'max_d..."
