In [1]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from utils import split_features_and_monthly_rent_label

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
# Dataset locations, relative to this notebook.
# Each variant provides a (train, validation, test) triple of CSV files;
# note that the validation files live next to the training files.

# Plain baseline features.
BASELINE_TRAIN, BASELINE_VAL, BASELINE_TEST = (
    "../data/train/baseline_train.csv",
    "../data/train/baseline_val.csv",
    "../data/test/baseline_test.csv",
)

# Baseline plus engineered features.
BASELINE_W_FEAT_ENG_TRAIN, BASELINE_W_FEAT_ENG_VAL, BASELINE_W_FEAT_ENG_TEST = (
    "../data/train/baseline-w-feature-eng_train.csv",
    "../data/train/baseline-w-feature-eng_val.csv",
    "../data/test/baseline-w-feature-eng_test.csv",
)

# Truncated baseline.
TRUNCATED_BASELINE_TRAIN, TRUNCATED_BASELINE_VAL, TRUNCATED_BASELINE_TEST = (
    "../data/train/baseline-truncated_train.csv",
    "../data/train/baseline-truncated_val.csv",
    "../data/test/baseline-truncated_test.csv",
)

# Truncated data with engineered features.
TRUNCATED_FEAT_ENG_TRAIN, TRUNCATED_FEAT_ENG_VAL, TRUNCATED_FEAT_ENG_TEST = (
    "../data/train/truncated-feat-eng_train.csv",
    "../data/train/truncated-feat-eng_val.csv",
    "../data/test/truncated-feat-eng_test.csv",
)

### Train/Validation

In [4]:
# Choose the dataset variant used for this run (swap these two constants
# to evaluate another approach).
TRAIN_DATA, VAL_DATA = TRUNCATED_FEAT_ENG_TRAIN, TRUNCATED_FEAT_ENG_VAL

# Read the training and validation CSVs, in that order.
train_set, val_set = (pd.read_csv(csv_path) for csv_path in (TRAIN_DATA, VAL_DATA))

# The helper's result is unpacked as a (features, target) pair.
X_train, y_train = split_features_and_monthly_rent_label(train_set)
# NOTE: despite the names, X_test / y_test are built from the validation
# split (VAL_DATA), not from one of the *_TEST files.
X_test, y_test = split_features_and_monthly_rent_label(val_set)

In [6]:
# Successive-halving grid search over the neighbour count of a KNN regressor.

# Fixed seed for the halving search: early iterations are fitted on a subsample
# of the training rows, so without a seed the scores can differ between runs.
RANDOM_STATE = 42

model = KNeighborsRegressor()
pipeline = Pipeline(steps=[('model', model)])

# define grid search for hyperparameters
# NOTE(review): the recorded best value (32) is the largest candidate in this
# grid, so the optimum may lie beyond it - consider adding larger values.
grid = {
    'model__n_neighbors': [1, 2, 32],
}

# Unshuffled K-fold: the halving search needs cv.split() to return the same
# folds on every call.
cv = KFold(n_splits=10)

grid_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=grid,
    cv=cv,
    scoring='neg_root_mean_squared_error',  # negated RMSE: closer to 0 is better
    random_state=RANDOM_STATE,
    verbose=3,
)

# Execute the grid search; fit() returns the fitted search object itself,
# which later cells query for cv_results_, best_params_ and best_score_.
grid_results = grid_search.fit(X_train, y_train)

n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 18000
max_resources_: 54000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 3
n_resources: 18000
Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV 1/10] END model__n_neighbors=1;, score=(train=-52.324, test=-700.256) total time=   0.2s
[CV 2/10] END model__n_neighbors=1;, score=(train=-56.979, test=-713.597) total time=   0.3s
[CV 3/10] END model__n_neighbors=1;, score=(train=-57.766, test=-712.482) total time=   0.2s
[CV 4/10] END model__n_neighbors=1;, score=(train=-45.681, test=-705.462) total time=   0.2s
[CV 5/10] END model__n_neighbors=1;, score=(train=-54.669, test=-718.899) total time=   0.2s
[CV 6/10] END model__n_neighbors=1;, score=(train=-50.802, test=-740.394) total time=   0.2s
[CV 7/10] END model__n_neighbors=1;, score=(train=-56.759, test=-711.974) total time=   0.2s
[CV 8/10] END model__n_neighbors=1;, score=(train=-63.884, test=-712.742) total time

In [7]:
# Put the search's per-candidate cross-validation records into a DataFrame
# and display summary statistics for it.
cv_records = grid_results.cv_results_
grid_results_df = pd.DataFrame(data=cv_records)
grid_results_df.describe()

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,0.25,27000.0,0.013528,0.001823,1.296584,0.101148,-603.413078,-608.877412,-607.407773,-602.632942,...,-366.314741,-364.917484,-367.768835,-364.78881,-365.314404,-370.815406,-363.942176,-366.770103,-366.020762,3.59259
std,0.5,18000.0,0.004628,0.001016,1.96037,0.131911,74.566683,81.903711,85.684832,80.040517,...,220.408182,226.608631,222.043225,222.842404,219.319891,218.511806,232.028256,223.492631,222.566359,2.874207
min,0.0,18000.0,0.010454,0.000779,0.222567,0.024323,-700.255906,-713.597109,-712.481969,-705.461827,...,-528.718283,-531.133377,-529.675366,-525.616007,-523.404562,-533.129164,-534.460671,-529.643139,-528.191363,0.473423
25%,0.0,18000.0,0.010724,0.001156,0.249552,0.02908,-642.351487,-654.177217,-659.691408,-646.221859,...,-523.299389,-523.870425,-523.603413,-522.417106,-521.226068,-524.280276,-524.684896,-523.216324,-522.90733,1.972595
50%,0.0,18000.0,0.011659,0.001708,0.36565,0.041007,-588.183772,-591.034224,-591.195963,-585.494704,...,-439.387463,-441.427919,-443.365326,-441.368513,-440.546935,-443.124202,-442.720668,-443.05687,-441.643116,3.316704
75%,0.25,27000.0,0.014463,0.002375,1.412682,0.113075,-549.245363,-545.734419,-538.912328,-541.905787,...,-282.402815,-282.474978,-287.530748,-283.740217,-284.635271,-289.659332,-281.977949,-286.610649,-284.756548,4.936699
max,1.0,54000.0,0.020338,0.003095,4.232469,0.298256,-537.028862,-539.844089,-534.757199,-534.080536,...,-57.765757,-45.680722,-54.66932,-50.802207,-56.759184,-63.884058,-35.866694,-51.323532,-52.605453,7.26353


In [8]:
# Report the winning hyperparameters and their score, one per line.
# The score is a negated RMSE (scoring='neg_root_mean_squared_error').
print(grid_results.best_params_, grid_results.best_score_, sep="\n")

{'model__n_neighbors': 32}
-538.0006512722879


In [5]:
# Save the fitted search object to disk so it can be reloaded without
# re-running the search. This cell needs `grid_results` from the search cell,
# so it must be executed after it.
# NOTE(review): the "_ab" file name, and the best score shown after reloading
# this same path further down (-503.87 vs. -538.00 reported by the search in
# this notebook), suggest the file may belong to a different model - confirm
# the path before overwriting it.
joblib.dump(value=grid_results, filename='../models/approach4_ab.pkl')

NameError: name 'grid_results' is not defined

In [6]:
# Reload the persisted search object and display its best score.
# joblib files are pickles: only load artefacts produced by this project.
grid_search_model = joblib.load(filename='../models/approach4_ab.pkl')
grid_search_model.best_score_

-503.8694178664199

| Approach | Best RMSE | Best Params |
|:---------|:----------|:------------|
|1|631.27|'model__n_neighbors': 32|
|2|538.09|'model__n_neighbors': 32|
|3|558.26|'model__n_neighbors': 32|
|4|538.00|'model__n_neighbors': 32|