In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.experimental import enable_halving_search_cv  # must run before importing HalvingGridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import KFold

from src.utils import *

In [2]:
# Train / validation CSV paths, one pair per dataset variant.
# Every pair follows the same "<name>_train.csv" / "<name>_val.csv" naming.
DATASET_1_TRAIN = "data/train/baseline_train.csv"
DATASET_1_VAL = "data/train/baseline_val.csv"

DATASET_2_TRAIN = "data/train/baseline-w-feature-eng_train.csv"
DATASET_2_VAL = "data/train/baseline-w-feature-eng_val.csv"

DATASET_3_TRAIN = "data/train/baseline-truncated_train.csv"
DATASET_3_VAL = "data/train/baseline-truncated_val.csv"

DATASET_4_TRAIN = "data/train/truncated-feat-eng_train.csv"
# Previously pointed at the *_train.csv file, which made the "validation"
# frame a copy of the training data for this variant.
DATASET_4_VAL = "data/train/truncated-feat-eng_val.csv"

In [3]:
# Select which dataset variant this run trains and validates on.
TRAIN_DATA, VAL_DATA = DATASET_4_TRAIN, DATASET_4_VAL

In [4]:
# Hyper-parameter grid handed to the search as `param_grid`:
# 4 * 5 * 3 * 3 = 180 candidate combinations.
parameters = {
    'n_estimators': [10, 50, 100, 500],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'subsample': [0.5, 0.7, 1.0],
    'max_depth': [3, 7, 9],
}

In [5]:
# Read the selected train / validation CSVs (train first, then validation).
train_set, val_set = (pd.read_csv(csv_path) for csv_path in (TRAIN_DATA, VAL_DATA))

# Separate each frame into features and label with the project helper.
# Note that the validation split is stored under the X_test / y_test names.
(X_train, y_train), (X_test, y_test) = (
    split_features_and_monthly_rent_label(frame) for frame in (train_set, val_set)
)

In [6]:
# Fixed seed so repeated runs give the same search result. Gradient boosting
# draws random row subsamples whenever subsample < 1.0 (two of the three grid
# values), and the halving search is given the same seed for its own
# resampling of the training data.
RANDOM_STATE = 42

model = GradientBoostingRegressor(random_state=RANDOM_STATE)

# Plain 10-fold split without shuffling, so the folds themselves need no seed.
cv = KFold(n_splits=10)

# Successive-halving grid search scored by negated RMSE (higher is better),
# using all available cores and printing per-fold progress.
grid_search = HalvingGridSearchCV(
    estimator=model,
    param_grid=parameters,
    n_jobs=-1,
    cv=cv,
    scoring='neg_root_mean_squared_error',
    random_state=RANDOM_STATE,
    verbose=3,
)

# Later cells read cv_results_, best_params_ and best_score_ from `result`.
result = grid_search.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 666
max_resources_: 54000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 180
n_resources: 666
Fitting 10 folds for each of 180 candidates, totalling 1800 fits
[CV 2/10] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.5;, score=(train=-703.997, test=-638.283) total time=   0.0s
[CV 3/10] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.5;, score=(train=-700.154, test=-795.189) total time=   0.0s
[CV 1/10] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.5;, score=(train=-708.825, test=-753.294) total time=   0.0s
[CV 9/10] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.5;, score=(train=-735.330, test=-654.102) total time=   0.0s
[CV 4/10] END learning_rate=0.0001, max_depth=3, n_estimators=10, subsample=0.5;, score=(train=-726.652, test=-728.638) total time=   0.0s
[CV 1/10] END learning_rate=0.0001, ma

In [7]:
# Tabulate the per-candidate CV results and show summary statistics
# for the numeric columns.
grid_search_results = pd.DataFrame(data=result.cv_results_)
grid_search_results.describe()

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,...,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,0.492593,2397.6,0.731792,0.028337,0.002552,0.001115,-632.12764,-619.600758,-664.184368,-634.800956,...,-405.2329,-411.3735,-410.1787,-395.4666,-392.0169,-406.3142,-415.3103,-406.786634,-405.085412,11.286092
std,0.830383,6241.666236,1.381653,0.062958,0.002676,0.001147,191.002404,224.069754,189.065329,200.923474,...,236.694,245.0712,237.0083,231.8711,228.8984,237.2214,248.5167,230.277625,237.076076,8.467109
min,0.0,666.0,0.009054,0.00087,0.000812,9e-06,-2446.352411,-2290.260493,-1871.464166,-1752.268109,...,-700.1543,-726.6541,-703.6231,-681.2726,-675.091,-703.1115,-735.3303,-690.440993,-702.848984,0.608087
25%,0.0,666.0,0.064467,0.007088,0.001192,0.000204,-731.532209,-635.85177,-777.286778,-709.349439,...,-650.2882,-672.712,-655.42,-632.8051,-627.5716,-653.5992,-681.3248,-644.927523,-652.933828,5.944188
50%,0.0,666.0,0.184377,0.013166,0.001565,0.000831,-566.45226,-557.189897,-615.487781,-584.904752,...,-463.6085,-456.4179,-460.2356,-459.1083,-447.5859,-447.2835,-451.7355,-461.476438,-453.754579,9.981323
75%,1.0,1998.0,0.720203,0.030707,0.002642,0.00155,-519.00407,-515.952639,-530.185971,-514.750292,...,-197.1773,-195.1888,-199.4137,-184.6694,-193.0259,-195.6786,-190.7855,-198.688559,-192.69758,15.830913
max,4.0,53946.0,9.291524,0.695867,0.019956,0.006855,-436.648641,-474.624652,-468.173663,-468.835227,...,-1.330529e-08,-1.383728e-08,-1.384624e-08,-1.356252e-08,-1.35301e-08,-1.355654e-08,-1.373227e-08,-1.756163,-1.589038,100.095012


In [8]:
# Report the winning parameter combination and its score, one per line.
# The score is negated RMSE, so values closer to zero are better.
print(result.best_params_, result.best_score_, sep="\n")

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
-490.81988592003444


In [52]:
# Persist the fitted search object so it can be reloaded without re-running
# the search. The target directory is created first because joblib.dump does
# not create missing parent directories.
MODEL_PATH = Path("models/approach4_gb.pkl")
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(result, MODEL_PATH)

['approach4_gb.pkl']

In [53]:
# Reload the saved search object and display its best cross-validated score.
# NOTE: joblib files are pickle-based; only load files this project produced.
grid_search_model = joblib.load(filename='models/approach4_gb.pkl')
grid_search_model.best_score_

-490.9728404556175

| Approach | Best CV RMSE | Best Params |
|:--------|:--------|:--------|
|1|487.26|learning_rate: 0.01, max_depth: 7, n_estimators: 500, subsample: 0.5|
|2|490.85|learning_rate: 0.1, max_depth: 3, n_estimators: 100, subsample: 0.5|
|3|491.55|learning_rate: 0.1, max_depth: 3, n_estimators: 100, subsample: 0.7|
|4|490.97|learning_rate: 0.1, max_depth: 3, n_estimators: 100, subsample: 0.5|