In [3]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import xgboost as xgb
import optuna
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer

#### Objective
House price prediction

In [38]:
dataset = fetch_california_housing()

In [5]:
df = pd.DataFrame(dataset.data)
df.columns = dataset.feature_names
df[dataset.target_names[0]] = dataset.target


In [39]:

df = df.dropna()

In [7]:
X, y = df.iloc[:,:-1], df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [10]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
def objective_linear(trial):
    params = {
        'degree': trial.suggest_int('degree', 1, 5)
    }
    model = Pipeline([
        ('poly', PolynomialFeatures(**params)),
        ('linear', LinearRegression(fit_intercept=False))
    ])
    score = cross_val_score(model, X_train, y_train, cv=kf, scoring=make_scorer(mean_squared_error)).mean()
    return score

def objective_xg(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'lambda': trial.suggest_float('lambda', 1e-5, 1),
        'alpha': trial.suggest_float('alpha', 1e-5, 1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
    }
    model = xgb.XGBRegressor(**params)
    score = cross_val_score(model, X_train, y_train, cv=kf, scoring=make_scorer(mean_squared_error)).mean()
    return score

def objective_svr(trial):
    params = {
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']),
        'C': trial.suggest_float('C', 0.1, 10),
        'epsilon': trial.suggest_float('epsilon', 0.01, 0.1),
    }
    model = SVR(**params)
    score = cross_val_score(model, X_train, y_train, cv=kf, scoring=make_scorer(mean_squared_error)).mean()
    return score

def objective_decision_reg(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_samples_split': trial.suggest_float('min_samples_split', 0.1, 1),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.1, 0.5),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2', None])
    }
    model = DecisionTreeRegressor(**params)
    score = cross_val_score(model, X_train, y_train, cv=kf, scoring=make_scorer(mean_squared_error)).mean()
    return score

def objective_rf_reg(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 100),
        'max_depth': trial.suggest_int('max_depth', 2, 32, log=True),
        'min_samples_split': trial.suggest_float('min_samples_split', 0.1, 1),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.1, 0.5),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2', None])
    }
    model = RandomForestRegressor(**params)
    score = cross_val_score(model, X_train, y_train, cv=kf, scoring=make_scorer(mean_squared_error)).mean()
    return score

In [11]:
df_err = pd.DataFrame({'Model': [], 'Best error': [], 'Best params': []})

objective_funcs = [objective_linear, objective_xg, objective_decision_reg, objective_rf_reg]
# SVR removed due to slow training - would run on GPU if possible
# objective_funcs = [objective_linear, objective_xg, objective_svr, objective_decision_reg, objective_rf_reg]
for func in objective_funcs:
    study = optuna.create_study(direction='minimize')
    study.optimize(func, n_trials=20)
    df_err = df_err.append({'Model': func.__name__, 'Best error': study.best_value, 'Best params': study.best_params}, ignore_index=True)

[I 2023-12-27 16:39:08,378] A new study created in memory with name: no-name-54ac31d2-3cb5-4c37-ae94-5f076cbd3ada
[I 2023-12-27 16:39:21,516] Trial 0 finished with value: 80447209847.1964 and parameters: {'degree': 5}. Best is trial 0 with value: 80447209847.1964.
[I 2023-12-27 16:39:23,411] Trial 1 finished with value: 7697552590.333864 and parameters: {'degree': 4}. Best is trial 1 with value: 7697552590.333864.
[I 2023-12-27 16:39:26,000] Trial 2 finished with value: 7697552590.333864 and parameters: {'degree': 4}. Best is trial 1 with value: 7697552590.333864.
[I 2023-12-27 16:39:28,142] Trial 3 finished with value: 7697552590.333864 and parameters: {'degree': 4}. Best is trial 1 with value: 7697552590.333864.
[I 2023-12-27 16:39:31,083] Trial 4 finished with value: 7697552590.333864 and parameters: {'degree': 4}. Best is trial 1 with value: 7697552590.333864.
[I 2023-12-27 16:39:31,208] Trial 5 finished with value: 3.8365757187337417 and parameters: {'degree': 2}. Best is trial 5 

In [12]:
df_err

Unnamed: 0,Model,Best error,Best params
0,objective_linear,0.519327,{'degree': 1}
1,objective_xg,0.212584,"{'booster': 'dart', 'learning_rate': 0.0897756..."
2,objective_decision_reg,0.733025,"{'max_depth': 10, 'min_samples_split': 0.27473..."
3,objective_rf_reg,0.749203,"{'n_estimators': 97, 'max_depth': 14, 'min_sam..."


In [20]:
chosen_model = df_err.iloc[df_err['Best error'].idxmin()]
chosen_model

Model                                               objective_xg
Best error                                              0.212584
Best params    {'booster': 'dart', 'learning_rate': 0.0897756...
Name: 1, dtype: object

In [23]:
model = xgb.XGBRegressor(**chosen_model['Best params'])
model.fit(X_train, y_train)
model.save_model('0001.json')
y_pred_test = model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred_test)
rmse_test

0.2080348240466048

In [24]:
df_results = X_test.copy()
df_results['MedVal'] = y_test
df_results['PredVal'] = y_pred_test
df_results

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedVal,PredVal
20046,1.6812,25.0,4.192201,1.022284,1392.0,3.877437,36.06,-119.01,0.47700,0.486708
3024,2.5313,30.0,5.039384,1.193493,1565.0,2.679795,35.14,-119.46,0.45800,0.963284
15663,3.4801,52.0,3.977155,1.185877,1310.0,1.360332,37.80,-122.44,5.00001,5.078151
20484,5.7376,17.0,6.163636,1.020202,1705.0,3.444444,34.28,-118.72,2.18600,2.393430
9814,3.7250,34.0,5.492991,1.028037,1063.0,2.483645,36.62,-121.93,2.78000,2.168953
...,...,...,...,...,...,...,...,...,...,...
15362,4.6050,16.0,7.002212,1.066372,1351.0,2.988938,33.36,-117.22,2.63300,2.482697
16623,2.7266,28.0,6.131915,1.256738,1650.0,2.340426,35.36,-120.83,2.66800,1.767710
18086,9.2298,25.0,7.237676,0.947183,1585.0,2.790493,37.31,-122.05,5.00001,4.763497
2144,2.7850,36.0,5.289030,0.983122,1227.0,2.588608,36.77,-119.76,0.72300,0.699229


In [36]:
new_data = pd.DataFrame({
    'MedInc': [8.7038],
    'HouseAge': [10.0],
    'AveRooms': [5.037594],
    'AveBedrms': [5.982048],
    'Population': [1000.0],
    'AveOccup': [4.04739],
    'Latitude': [36.0],
    'Longitude': [-120.04]
})

prediction = model.predict(new_data)
prediction

array([2.7877772], dtype=float32)