In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import xgboost as xgb
import optuna
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer

#### Objective
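Predict median house values for the California housing dataset by tuning several regression models with Optuna and evaluating the best one on a held-out test set.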

In [None]:
# Load the California housing data; later cells read its `data`, `target`,
# `feature_names` and `target_names` attributes.
dataset = fetch_california_housing()

#### Preprocessing

In [None]:
# Build the working frame: one column per feature plus the target as the last
# column. The target is attached BEFORE dropping incomplete rows so features
# and target always stay aligned (dropping first would make the target array
# longer than the frame as soon as a single row is removed).
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df[dataset.target_names[0]] = dataset.target
df = df.dropna()

# Feature columns that get outlier filtering and min-max scaling below
# (Latitude and Longitude are left untouched).
cols_to_clean = ['MedInc','HouseAge','AveRooms','AveBedrms','Population','AveOccup']

# would look to cross validate threshold in practice
def detect_outliers(data, threshold=5):
    """Return the positions of rows that contain at least one extreme value.

    Every column is standardised with its own mean and population standard
    deviation (ddof=0, NaNs ignored). A row is flagged when the absolute
    z-score of any of its values is strictly greater than ``threshold``.

    Parameters
    ----------
    data : array-like
        Numeric 1-D or 2-D data (ndarray, Series, DataFrame, ...).
    threshold : float, default 5
        Z-score magnitude above which a value counts as an outlier.

    Returns
    -------
    numpy.ndarray
        Sorted, unique 0-based row positions (positions, not index labels).
        Empty when nothing is flagged or when ``data`` is empty.
    """
    values = np.atleast_1d(np.asarray(data, dtype=float))
    if values.size == 0:
        return np.array([], dtype=np.intp)

    # Reduce along axis 0 explicitly. np.mean(DataFrame) collapses to a single
    # scalar on pandas >= 2.0, which would compare every column against one
    # global mean and mix the scales of unrelated features.
    mean = np.nanmean(values, axis=0)
    std = np.nanstd(values, axis=0)

    # A constant column has std == 0; its z-scores become NaN, and NaN never
    # compares greater than the threshold, so such columns flag nothing.
    with np.errstate(divide='ignore', invalid='ignore'):
        z_scores = np.abs((values - mean) / std)

    flagged = z_scores > threshold
    # Collapse all non-row axes so each row is reported once, however many of
    # its columns are extreme.
    row_flagged = flagged.reshape(flagged.shape[0], -1).any(axis=1)
    return np.flatnonzero(row_flagged)

# detect_outliers returns row POSITIONS, while DataFrame.drop expects index
# LABELS. Translate through df.index so the right rows are removed even when
# the index is no longer a gap-free 0..n-1 range (e.g. after dropna).
outliers = detect_outliers(df[cols_to_clean])
df = df.drop(index=df.index[outliers])

# Keep the fitted scaler in a variable so the same transformation can be
# applied to any data that is scored later.
# NOTE(review): the scaler is fit on every remaining row, i.e. before the
# train/test split in the next cell, so test-set min/max leak into training.
scaler = MinMaxScaler()
df[cols_to_clean] = scaler.fit_transform(df[cols_to_clean])

In [None]:
# Features are every column except the last one; the target is the last column.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 20% of the rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
class Objective:
    """Wrap an estimator and score it with 5-fold cross-validated MSE.

    Scoring reads the notebook-level ``X_train`` / ``y_train`` variables.
    """

    def __init__(self, model):
        # Any estimator accepted by sklearn's cross_val_score.
        self.model = model

    def get_score(self):
        """Return the mean MSE over 5 shuffled folds (lower is better).

        The value is also stored on the instance as ``self.score``.
        """
        mse_scorer = make_scorer(mean_squared_error)
        folds = KFold(n_splits=5, shuffle=True, random_state=42)
        fold_errors = cross_val_score(
            self.model,
            X_train,
            y_train,
            cv=folds,
            scoring=mse_scorer,
        )
        self.score = fold_errors.mean()
        return self.score



def objective_linear(trial):
    """Optuna objective: polynomial-feature linear regression.

    Tunes the polynomial degree (1 to 5) and returns the cross-validated
    MSE computed by ``Objective``.
    """
    degree = trial.suggest_int('degree', 1, 5)
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        # No separate intercept is fitted by the linear step.
        ('linear', LinearRegression(fit_intercept=False)),
    ]
    pipeline = Pipeline(steps)
    return Objective(pipeline).get_score()


def objective_xg(trial):
    """Optuna objective: XGBoost regressor.

    Tunes the booster type, learning rate and L1/L2 regularisation for every
    booster. Tree-specific hyperparameters (depth, row/column subsampling,
    minimum child weight) are only suggested when the sampled booster is
    tree-based; the linear booster does not use them, which is why they could
    not simply be listed unconditionally.

    Returns the cross-validated MSE computed by ``Objective``.
    """
    booster = trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart'])
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': booster,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'lambda': trial.suggest_float('lambda', 1e-5, 1),
        'alpha': trial.suggest_float('alpha', 1e-5, 1),
    }

    # Conditional search space: only tree boosters understand these settings.
    if booster != 'gblinear':
        params.update({
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        })

    model = xgb.XGBRegressor(**params)
    return Objective(model).get_score()

def objective_svr(trial):
    """Optuna objective: support vector regression.

    Tunes the kernel, the regularisation strength ``C`` and ``epsilon``, and
    returns the cross-validated MSE computed by ``Objective``.
    """
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly'])
    c_value = trial.suggest_float('C', 0.1, 10)
    epsilon = trial.suggest_float('epsilon', 0.01, 0.1)

    regressor = SVR(kernel=kernel, C=c_value, epsilon=epsilon)
    return Objective(regressor).get_score()

def objective_decision_reg(trial):
    """Optuna objective: decision tree regressor.

    Tunes depth, the split/leaf size fractions and the number of features
    considered per split, and returns the cross-validated MSE computed by
    ``Objective``.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_samples_split': trial.suggest_float('min_samples_split', 0.1, 1),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.1, 0.5),
        # 'auto' is no longer accepted by scikit-learn (removed in 1.3) and
        # makes the trial raise; for regressors it meant "all features",
        # which the None choice already covers.
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    }
    # Fixed seed: with a feature subset per split the tree is randomised, and
    # an unseeded model would make trial scores irreproducible.
    model = DecisionTreeRegressor(random_state=42, **params)
    return Objective(model).get_score()

def objective_rf_reg(trial):
    """Optuna objective: random forest regressor.

    Tunes the number of trees, depth (sampled on a log scale), the split/leaf
    size fractions and the number of features considered per split, and
    returns the cross-validated MSE computed by ``Objective``.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 100),
        'max_depth': trial.suggest_int('max_depth', 2, 32, log=True),
        'min_samples_split': trial.suggest_float('min_samples_split', 0.1, 1),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.1, 0.5),
        # 'auto' is no longer accepted by scikit-learn (removed in 1.3) and
        # makes the trial raise; for regressors it meant "all features",
        # which the None choice already covers.
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    }
    # Fixed seed so that two trials with the same parameters get the same
    # score (bootstrap and feature sampling are otherwise random).
    model = RandomForestRegressor(random_state=42, **params)
    return Objective(model).get_score()

In [None]:
err_rates = []

objective_funcs = [objective_linear, objective_xg, objective_decision_reg, objective_rf_reg]
# SVR removed due to slow training - would run on GPU if possible
# objective_funcs = [objective_linear, objective_xg, objective_svr, objective_decision_reg, objective_rf_reg]

# Silence Optuna once, before any study exists, so that the creation message
# of the first study is suppressed as well.
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

for func in objective_funcs:
    # Seeded sampler: the same hyperparameter suggestions on every run.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(func, n_trials=20)
    # One summary record per model: lowest CV error and the parameters that
    # produced it.
    err_rates.append({'Model': func.__name__, 'Best error': study.best_value, 'Best params': study.best_params})

In [None]:
# Tabulate the tuning summaries: one row per objective function, with the
# columns 'Model', 'Best error' and 'Best params'.
df_err = pd.DataFrame(err_rates)

In [None]:
# Row of the model with the lowest cross-validated error. idxmin() returns an
# index LABEL, so it has to be looked up with .loc rather than positional .iloc.
chosen_model = df_err.loc[df_err['Best error'].idxmin()]

In [None]:
def _build_chosen_model(name, params):
    """Instantiate the estimator tuned by the objective function called ``name``.

    ``params`` are the best parameters found for that objective. Raises
    ValueError for an objective name that has no matching estimator.
    """
    if name == 'objective_linear':
        return Pipeline([
            ('poly', PolynomialFeatures(**params)),
            ('linear', LinearRegression(fit_intercept=False)),
        ])
    if name == 'objective_xg':
        return xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse', **params)
    if name == 'objective_svr':
        return SVR(**params)
    if name == 'objective_decision_reg':
        # Fixed seed so the refit is reproducible.
        return DecisionTreeRegressor(random_state=42, **params)
    if name == 'objective_rf_reg':
        # Fixed seed so the refit is reproducible.
        return RandomForestRegressor(random_state=42, **params)
    raise ValueError(f"No estimator is registered for objective {name!r}")


# Refit the winning model type (not unconditionally XGBoost) on the full
# training split; passing e.g. 'degree' or 'n_estimators' to the wrong
# estimator would fail or silently train something that was never tuned.
model = _build_chosen_model(chosen_model['Model'], chosen_model['Best params'])
model.fit(X_train, y_train)

# save_model is an XGBoost-specific API; other estimators are not persisted here.
if isinstance(model, xgb.XGBRegressor):
    model.save_model('0001.json')

y_pred_test = model.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
# mean_squared_error returns the MSE; take the square root for the RMSE.
rmse_test = np.sqrt(mse_test)

In [None]:
# Test features side by side with the true target ('MedVal') and the model's
# prediction ('PredVal'); assign() works on a copy, so X_test is untouched.
df_results = X_test.assign(MedVal=y_test, PredVal=y_pred_test)

In [None]:
# A single unseen district, expressed in the dataset's ORIGINAL units.
new_data = pd.DataFrame({
    'MedInc': [8.7038],
    'HouseAge': [10.0],
    'AveRooms': [5.037594],
    'AveBedrms': [5.982048],
    'Population': [1000.0],
    'AveOccup': [4.04739],
    'Latitude': [36.0],
    'Longitude': [-120.04]
})

# The model was trained on min-max-scaled versions of `cols_to_clean`, so the
# raw values above must go through the same transformation before scoring.
# The scaling is rebuilt from the raw data of exactly the rows that survived
# preprocessing (df keeps the original row labels of dataset.data).
raw_features = pd.DataFrame(dataset.data, columns=dataset.feature_names)
new_data_scaler = MinMaxScaler().fit(raw_features.loc[df.index, cols_to_clean])

new_data_scaled = new_data.copy()
new_data_scaled[cols_to_clean] = new_data_scaler.transform(new_data[cols_to_clean])

prediction = model.predict(new_data_scaled)