In [7]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from src.utils import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Dataset locations: one (train, validation, test) CSV path triple per dataset
# variant. Train and validation files sit under data/train/, test files under
# data/test/.

# "baseline" files
BASELINE_TRAIN, BASELINE_VAL, BASELINE_TEST = (
    "data/train/baseline_train.csv",
    "data/train/baseline_val.csv",
    "data/test/baseline_test.csv",
)

# "baseline-w-feature-eng" files
BASELINE_W_FEAT_ENG_TRAIN, BASELINE_W_FEAT_ENG_VAL, BASELINE_W_FEAT_ENG_TEST = (
    "data/train/baseline-w-feature-eng_train.csv",
    "data/train/baseline-w-feature-eng_val.csv",
    "data/test/baseline-w-feature-eng_test.csv",
)

# "baseline-truncated" files
TRUNCATED_BASELINE_TRAIN, TRUNCATED_BASELINE_VAL, TRUNCATED_BASELINE_TEST = (
    "data/train/baseline-truncated_train.csv",
    "data/train/baseline-truncated_val.csv",
    "data/test/baseline-truncated_test.csv",
)

# "truncated-feat-eng" files
TRUNCATED_FEAT_ENG_TRAIN, TRUNCATED_FEAT_ENG_VAL, TRUNCATED_FEAT_ENG_TEST = (
    "data/train/truncated-feat-eng_train.csv",
    "data/train/truncated-feat-eng_val.csv",
    "data/test/truncated-feat-eng_test.csv",
)

### Train/Test

In [5]:
# Select which dataset variant this run uses; swap these two constants to
# evaluate a different variant.
TRAIN_DATA = TRUNCATED_FEAT_ENG_TRAIN
VAL_DATA = TRUNCATED_FEAT_ENG_VAL

# Read the training file first, then the validation file.
train_set, val_set = (pd.read_csv(csv_path) for csv_path in (TRAIN_DATA, VAL_DATA))

# Separate the feature columns from the monthly-rent label for each split.
X_train, y_train = split_features_and_monthly_rent_label(train_set)
# NOTE: despite the names, X_test / y_test are built from the validation file
# (VAL_DATA), not from one of the *_TEST files.
X_test, y_test = split_features_and_monthly_rent_label(val_set)

In [8]:
# Linear regressor wrapped in a single-step pipeline; the step name 'model' is
# what the 'model__' prefix in the parameter grid refers to.
model = LinearRegression()
pipeline = Pipeline([('model', model)])

# Hyperparameter candidates: fit with and without an intercept term.
grid = {'model__fit_intercept': [True, False]}

# 10-fold cross-validation splitter.
cv = KFold(10)

# Successive-halving grid search. The scorer is negated RMSE, so scores are
# negative and values closer to zero are better.
grid_search = HalvingGridSearchCV(
    pipeline,
    grid,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    verbose=3,
)

# Run the search on the training split and keep the returned object for the
# result-inspection cells.
grid_results = grid_search.fit(X_train, y_train)

n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 54000
max_resources_: 54000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 2
n_resources: 54000
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[CV 1/10] END model__fit_intercept=True;, score=(train=-509.064, test=-514.546) total time=   0.1s
[CV 2/10] END model__fit_intercept=True;, score=(train=-508.243, test=-521.830) total time=   0.1s
[CV 3/10] END model__fit_intercept=True;, score=(train=-509.485, test=-510.735) total time=   0.1s
[CV 4/10] END model__fit_intercept=True;, score=(train=-509.891, test=-507.081) total time=   0.1s
[CV 5/10] END model__fit_intercept=True;, score=(train=-510.564, test=-500.954) total time=   0.1s
[CV 6/10] END model__fit_intercept=True;, score=(train=-509.835, test=-507.602) total time=   0.1s
[CV 7/10] END model__fit_intercept=True;, score=(train=-509.297, test=-512.389) total time=   0.1s
[CV 8/10] END model__fit_intercept=True;, 

In [9]:
# Load the search's cv_results_ into a DataFrame, then display summary
# statistics of it as the cell output.
grid_results_df = pd.DataFrame(data=grid_results.cv_results_)
grid_results_df.describe()

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,0.0,54000.0,0.090101,0.007344,0.003545,0.000981,-514.5457,-521.8301,-510.7346,-507.081,...,-509.4855,-509.8911,-510.564,-509.8355,-509.297,-509.8251,-509.9064,-509.5562,-509.5668,0.5863535
std,0.0,0.0,0.005559,0.003312,0.00047,0.000849,8.215729e-11,9.474616e-10,1.50126e-10,9.083927e-11,...,9.253438e-13,1.366608e-12,5.627211e-13,1.848941e-12,1.407377e-12,1.527386e-12,1.728825e-12,1.28622e-12,1.125442e-12,3.075811e-13
min,0.0,54000.0,0.08617,0.005002,0.003213,0.00038,-514.5457,-521.8301,-510.7346,-507.081,...,-509.4855,-509.8911,-510.564,-509.8355,-509.297,-509.8251,-509.9064,-509.5562,-509.5668,0.5863535
25%,0.0,54000.0,0.088136,0.006173,0.003379,0.00068,-514.5457,-521.8301,-510.7346,-507.081,...,-509.4855,-509.8911,-510.564,-509.8355,-509.297,-509.8251,-509.9064,-509.5562,-509.5668,0.5863535
50%,0.0,54000.0,0.090101,0.007344,0.003545,0.000981,-514.5457,-521.8301,-510.7346,-507.081,...,-509.4855,-509.8911,-510.564,-509.8355,-509.297,-509.8251,-509.9064,-509.5562,-509.5668,0.5863535
75%,0.0,54000.0,0.092067,0.008515,0.003711,0.001281,-514.5457,-521.8301,-510.7346,-507.081,...,-509.4855,-509.8911,-510.564,-509.8355,-509.297,-509.8251,-509.9064,-509.5562,-509.5668,0.5863535
max,0.0,54000.0,0.094032,0.009686,0.003877,0.001581,-514.5457,-521.8301,-510.7346,-507.081,...,-509.4855,-509.8911,-510.564,-509.8355,-509.297,-509.8251,-509.9064,-509.5562,-509.5668,0.5863535


In [10]:
# Report the selected hyperparameters and their score, one per line. The search
# is scored with 'neg_root_mean_squared_error', so the printed score is the
# negated RMSE (a negative number).
print(grid_results.best_params_, grid_results.best_score_, sep="\n")

{'model__fit_intercept': False}
-509.99858181201296


| Approach | Best RMSE | Best Params |
|:---------|:----------|:------------|
|1|504.79|'model__fit_intercept': False|
|2|504.81|'model__fit_intercept': False|
|3|509.98|'model__fit_intercept': False|
|4|510.00|'model__fit_intercept': False|