# Stage 8: Models fine-tuning

In the previous stage, we selected three best models for the dataset:

Selected models:
- Ridge regression model
- Simple linear regression
- Random Forest (base model)

In this stage, we are going to optimize models' parameters to achieve the better score. 

In [1]:
%pylab inline

import pandas as pd
import numpy as np

Populating the interactive namespace from numpy and matplotlib


# Preparing the data

In [2]:
dataset = pd.read_csv('data/house_prices.csv').set_index('Id')
target_y = dataset['SalePrice']

In [3]:
#
# We've saved base data processing/cleaning and splitting routines from Stage #3 to the common_data.py module
#
from common_data import data_train_test_split, data_process_and_clean_advanced_features

### Cleaning the data using pre-saved data pipeline

In [4]:
processed_dataset = data_process_and_clean_advanced_features(dataset, target_y)

Preparing the dataset with feature engineering


In [5]:
processed_dataset.head()

Unnamed: 0_level_0,YrSold,BedroomAbvGr,HalfBath,GarageYrBlt,WoodDeckSF,FullBath,BsmtFinSF2,GrLivArea,YearRemodAdd,TotRmsAbvGrd,...,!CentralAir,!Electrical,!KitchenQual,!FireplaceQu,!GarageQual,!GarageCond,!PavedDrive,!HouseAge,!LandQuality,!ExretiorScore
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7.605392,1.386294,0.693147,7.6029,0.0,1.098612,0.0,7.444833,7.6029,2.197225,...,1,1,0.75,0.0,0.5,0.5,0.75,5,4.2,0.929985
2,7.604894,1.386294,0.0,7.589336,5.700444,1.098612,0.0,7.141245,7.589336,1.94591,...,1,1,0.5,0.5,0.5,0.5,0.75,31,4.4,0.519958
3,7.605392,1.386294,0.693147,7.601902,0.0,1.098612,0.0,7.488294,7.602401,1.94591,...,1,1,0.75,0.5,0.5,0.5,0.75,7,3.95,0.929985
4,7.604396,1.386294,0.0,7.600402,0.0,0.693147,0.0,7.448916,7.586296,2.079442,...,1,1,0.75,0.75,0.5,0.5,0.75,91,4.05,0.538824
5,7.605392,1.609438,0.693147,7.601402,5.26269,1.098612,0.0,7.695758,7.601402,2.302585,...,1,1,0.75,0.5,0.5,0.5,0.75,8,4.15,0.929985


### Do train/test splitting

In [6]:
X_train, X_test, y_train, y_test = data_train_test_split(processed_dataset, target_y)

print(f"Train dataset length: {len(X_train):<5} ({len(X_train)/len(processed_dataset)*100:<2}%)")
print(f"Test  dataset length: {len(X_test):<5} ({len(X_test)/len(processed_dataset)*100:<2}%)")

Train dataset length: 1095  (75.0%)
Test  dataset length: 365   (25.0%)


# Tine-tuning of Top-3 selected models on the dataset

In [10]:
from common_models import model_base, calc_cross_val_scores
from sklearn.model_selection import GridSearchCV

# Random-Forest fine tuning

In [17]:
from sklearn.ensemble import RandomForestRegressor
model_rf = RandomForestRegressor()

params = {
    'n_estimators': [10, 50, 100, 500],    
    "max_depth"         : [None, 10, 20],
    "min_samples_split" : [2, 10, 30] ,    
}

gs_rf = GridSearchCV(model_rf, params, scoring='neg_mean_squared_error', cv=10, n_jobs=-1);
gs_rf.fit(X_test, y_test);



GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 50, 100, 500], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 10, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

#### Random forest best parameters

In [20]:
gs_rf.best_params_

{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}

#### Random forest fine-tuned results

In [59]:
calc_cross_val_scores(gs_rf.best_estimator_, X_train, y_train)

[ 0.1833977   0.14014802  0.13606827  0.14290902  0.1738149   0.12067485
  0.16001325  0.16136104  0.13786819  0.17953466]
Mean: 0.15357898987224577
StDev: 0.020011698901411395


0.15357898987224577

## Ridge regression model

In [54]:
from sklearn.linear_model import Ridge, RidgeCV

model_ridge = Ridge(random_state=64)

In [47]:
params = {
    'alpha': (0.1, 1.0, 10.0, 20.0),
    'fit_intercept': [False, True],
    'normalize': [False, True],
}

gs_ridge = GridSearchCV(model_ridge, params, scoring='neg_mean_squared_error', cv=10, n_jobs=-1);
gs_ridge.fit(X_train, y_train);

#### Ridge regression best parameters

In [48]:
gs_ridge.best_params_

{'alpha': 10.0, 'fit_intercept': True, 'normalize': False}

#### Ridge regression fine-tuned best scores

In [49]:
calc_cross_val_scores(gs_ridge.best_estimator_, X_train, y_train)

[ 0.17514978  0.14554207  0.12733676  0.13784514  0.14925007  0.11614012
  0.14882291  0.15390988  0.12139653  0.17251344]
Mean: 0.14479067004411728
StDev: 0.01879126372853575


0.14479067004411728

# Simple linear regression

In [55]:
from sklearn.linear_model import LinearRegression

model_lin_reg = LinearRegression()

In [56]:
params = {
    'fit_intercept': [False, True],
    'normalize': [False, True],
}

gs_lin_reg = GridSearchCV(model_lin_reg, params, scoring='neg_mean_squared_error', cv=10, n_jobs=-1);
gs_lin_reg.fit(X_train, y_train);

#### Linear regression best parameters

In [57]:
gs_lin_reg.best_params_

{'fit_intercept': True, 'normalize': False}

#### Linear regression fine-tuned best scores

In [58]:
calc_cross_val_scores(gs_lin_reg.best_estimator_, X_train, y_train)

[ 0.17779686  0.14320863  0.1226069   0.13939943  0.15139162  0.13565912
  0.14321876  0.15422052  0.12368225  0.19589744]
Mean: 0.14870815381781669
StDev: 0.021761084253865765


0.14870815381781669