<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Random-Forest-Modelling" data-toc-modified-id="Random-Forest-Modelling-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Random Forest Modelling</a></span></li><li><span><a href="#Grid-Search" data-toc-modified-id="Grid-Search-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Grid Search</a></span></li><li><span><a href="#Use-the-best-parameters-from-grid-search" data-toc-modified-id="Use-the-best-parameters-from-grid-search-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Use the best parameters from grid search</a></span></li><li><span><a href="#Randomized-search" data-toc-modified-id="Randomized-search-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Randomized search</a></span></li><li><span><a href="#Feature-Importance" data-toc-modified-id="Feature-Importance-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Feature Importance</a></span></li><li><span><a href="#Feature-Importance-using-eli5" data-toc-modified-id="Feature-Importance-using-eli5-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Feature Importance using eli5</a></span></li></ul></div>

# Imports

In [1]:
import numpy as np
import pandas as pd

import os
import time
import collections
import itertools

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [2]:
def adjustedR2(rsquared,nrows,kcols):
    return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)


def multiple_linear_regression(df,features,target,model,
                               verbose=1,cv=5,test_size=0.3):
    """ Multiple Linear Regression Modelling using given model.
    
    Depends:
    Depends on function adjusted r-squared.
    
    
    Returns:
    rmse, r2_train, ar2_train, r2_test, ar2_test, cv
    """
    
    # train test split
    train, test = train_test_split(df, test_size=0.2, random_state=100)

    # train test values
    X = df[features].values
    y = df[target].values.ravel()

    Xtrain = train[features].values
    ytrain = train[target].values.ravel()

    Xtest = test[features].values
    ytest = test[target].values.ravel()
    
    # fitting
    model.fit(Xtrain,ytrain)

    # prediction
    ypreds = model.predict(Xtest)

    # metrics
    rmse = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
    r2_train = model.score(Xtrain, ytrain).round(3)
    r2_test = model.score(Xtest, ytest).round(3)

    cv = cross_val_score(model, X, y, cv=5,n_jobs=-1,
                         verbose=verbose).mean().round(3)

    ar2_train = adjustedR2(model.score(Xtrain,ytrain),
                           Xtrain.shape[0],
                           len(features)).round(3)
    ar2_test  = adjustedR2(model.score(Xtest,ytest),
                           Xtest.shape[0] ,
                           len(features)).round(3)
    
    return (rmse, r2_train, ar2_train, r2_test, ar2_test, cv)


df_eval = pd.DataFrame({'Model': [],
                           'Details':[],
                           'Root Mean Squared Error (RMSE)':[],
                           'R-squared (training)':[],
                           'Adjusted R-squared (training)':[],
                           'R-squared (test)':[],
                           'Adjusted R-squared (test)':[],
                           '5-Fold Cross Validation':[]})

# Load the data

In [3]:
df = pd.read_csv('../data/raw/kc_house_data.csv')
df.drop(['id','date'],axis=1,inplace=True)
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


# Random Forest Modelling

In [4]:
target = ['price']
features = df.drop(['price'],axis=1).columns

In [5]:
model = RandomForestRegressor(n_estimators= 50,random_state=100)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(df, features, target,model,
                               verbose=0,test_size=0.2)


df_eval.loc[len(df_eval)] = ['Random Forest Regressor',
                             '', rmse,r2_train,ar2_train,
                             r2_test,ar2_test,cv]


df_eval

Unnamed: 0,Model,Details,Root Mean Squared Error (RMSE),R-squared (training),Adjusted R-squared (training),R-squared (test),Adjusted R-squared (test),5-Fold Cross Validation
0,Random Forest Regressor,,125316.711,0.981,0.981,0.883,0.883,0.872


# Grid Search

Most important hyperparameters of Random Forest:

- n_estimators = n of trees
- max_features = max number of features considered for splitting a node
- max_depth = max number of levels in each decision tree
- min_samples_split = min number of data points placed in a node before the node is split
- min_samples_leaf = min number of data points allowed in a leaf node
- bootstrap = method for sampling data points (with or without replacement)

In [7]:
# train test split
train, test = train_test_split(df, test_size=0.2, random_state=100)

# train test values
X = df[features].values
y = df[target].values.ravel()

Xtrain = train[features].values
ytrain = train[target].values.ravel()

Xtest = test[features].values
ytest = test[target].values.ravel()

In [15]:
from sklearn.model_selection import GridSearchCV

model = RandomForestRegressor()


param_grid = [
{'n_estimators': np.arange(20,60,20),
 'max_features': [2, len(features)], 
 'max_depth': [10, 50, None],
 'bootstrap': [True, False]}
]

grid_search_forest = GridSearchCV(model,
                                  param_grid,
                                  cv=5,
                                  n_jobs=-1,
                                  scoring='neg_mean_squared_error',
                                  verbose=2)

grid_search_forest.fit(Xtrain, ytrain)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  1.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'bootstrap': [True, False],
    

# Use the best parameters from grid search

In [17]:
grid_search_forest.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
                      max_features=18, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=40,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [18]:
grid_search_forest.best_params_

{'bootstrap': True, 'max_depth': 50, 'max_features': 18, 'n_estimators': 40}

In [20]:
model = RandomForestRegressor(n_estimators= 40,random_state=100, max_features=18,
                             max_depth=50, bootstrap=True)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(df, features, target,model,
                               verbose=0,test_size=0.2)


df_eval.loc[len(df_eval)] = ['Random Forest Regressor after grid search',
                             '', rmse,r2_train,ar2_train,
                             r2_test,ar2_test,cv]


df_eval.sort_values('Adjusted R-squared (test)')

Unnamed: 0,Model,Details,Root Mean Squared Error (RMSE),R-squared (training),Adjusted R-squared (training),R-squared (test),Adjusted R-squared (test),5-Fold Cross Validation
0,Random Forest Regressor,,125316.711,0.981,0.981,0.883,0.883,0.872
1,Random Forest Regressor,,125118.269,0.981,0.981,0.884,0.883,0.872
2,Random Forest Regressor after grid search,,125118.269,0.981,0.981,0.884,0.883,0.872


# Randomized search

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Number of trees in  forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 200, num = 5)]

# max features
max_features = ['auto', 'sqrt']

# max depth of leaves
max_depth = [int(x) for x in np.linspace(1, 45, num = 3)]

# min samples split
min_samples_split = [5, 10]

# random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split}

pprint(random_grid)

{'max_depth': [1, 23, 45],
 'max_features': ['auto', 'sqrt'],
 'min_samples_split': [5, 10],
 'n_estimators': [20, 65, 110, 155, 200]}


In [24]:
rf_random = RandomizedSearchCV(estimator = model,
                               param_distributions = random_grid,
                               n_iter = 10,
                               cv = 5,
                               verbose=2,
                               random_state=100,
                               n_jobs = -1,
                               scoring='neg_mean_squared_error')
# Fit the random search model
rf_random.fit(Xtrain, ytrain)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   47.1s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=50,
                                                   max_features=18,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=40, n_jobs=None,
                                                   oob_score=False,
                                                   random_state=100, verbos

In [27]:
rf_random.best_params_

{'n_estimators': 155,
 'min_samples_split': 10,
 'max_features': 'auto',
 'max_depth': 23}

In [28]:
model = RandomForestRegressor(n_estimators= 155,
                              random_state=100,
                              max_features='auto',
                              max_depth=23,
                              min_samples_split=10)

rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(df, features, target,model,
                               verbose=0,test_size=0.2)


df_eval.loc[len(df_eval)] = ['Random Forest Regressor after grid search',
                             '', rmse,r2_train,ar2_train,
                             r2_test,ar2_test,cv]


df_eval.sort_values('Adjusted R-squared (test)')

Unnamed: 0,Model,Details,Root Mean Squared Error (RMSE),R-squared (training),Adjusted R-squared (training),R-squared (test),Adjusted R-squared (test),5-Fold Cross Validation
3,Random Forest Regressor after grid search,,126117.377,0.96,0.96,0.882,0.882,0.874
0,Random Forest Regressor,,125316.711,0.981,0.981,0.883,0.883,0.872
1,Random Forest Regressor,,125118.269,0.981,0.981,0.884,0.883,0.872
2,Random Forest Regressor after grid search,,125118.269,0.981,0.981,0.884,0.883,0.872


# Feature Importance

In [30]:
importances = rf_random.best_estimator_.feature_importances_
importances

array([0.00200655, 0.00723102, 0.25640031, 0.01110655, 0.00134919,
       0.03538765, 0.01053924, 0.00243511, 0.33929916, 0.01984126,
       0.00352932, 0.02640599, 0.00179098, 0.01368775, 0.16003191,
       0.06935918, 0.0288653 , 0.01073352])

In [35]:
df_imp = pd.DataFrame({'feature': features,
                      'importance': importances})

df_imp.sort_values('importance', ascending=False).style.background_gradient(subset=['importance'])

Unnamed: 0,feature,importance
8,grade,0.339299
2,sqft_living,0.2564
14,lat,0.160032
15,long,0.0693592
5,waterfront,0.0353877
16,sqft_living15,0.0288653
11,yr_built,0.026406
9,sqft_above,0.0198413
13,zipcode,0.0136878
3,sqft_lot,0.0111065


# Feature Importance using eli5

In [32]:
import eli5
from eli5.sklearn import PermutationImportance
from eli5 import show_prediction

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [42]:
perm = PermutationImportance(model, random_state=1).fit(Xtest, ytest)

eli5.show_weights(perm, feature_names = features.values.tolist())

Weight,Feature
0.3622  ± 0.0170,lat
0.2656  ± 0.0125,sqft_living
0.2399  ± 0.0115,grade
0.1727  ± 0.0040,long
0.0285  ± 0.0042,yr_built
0.0276  ± 0.0022,waterfront
0.0233  ± 0.0030,sqft_living15
0.0134  ± 0.0020,zipcode
0.0089  ± 0.0009,sqft_above
0.0068  ± 0.0010,view


In [45]:
eli5.explain_weights_df(perm, feature_names=features.values.tolist())\
.style.background_gradient(subset=['weight'])

Unnamed: 0,feature,weight,std
0,lat,0.362158,0.00848396
1,sqft_living,0.265606,0.00623612
2,grade,0.239867,0.00575495
3,long,0.172746,0.0020204
4,yr_built,0.0284578,0.0021062
5,waterfront,0.0275578,0.00107751
6,sqft_living15,0.0233414,0.00147996
7,zipcode,0.0134185,0.000976738
8,sqft_above,0.00885196,0.000448446
9,view,0.00684043,0.000506956


In [49]:
test.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
19836,285000.0,3,2.5,2437,5136,2.0,0,0,3,7,2437,0,2011,0,98002,47.3517,-122.21,2437,4614
10442,239950.0,3,2.5,1560,4800,2.0,0,0,4,7,1560,0,1974,0,98001,47.2653,-122.285,1510,12240
20548,460000.0,3,2.5,2390,47480,2.0,0,0,3,9,2390,0,2007,0,98058,47.4517,-122.084,1720,44866
11014,397500.0,3,1.0,1480,5100,1.5,0,0,3,7,1480,0,1938,1959,98103,47.6915,-122.348,1300,5100
4138,545000.0,4,3.5,1880,1341,3.0,0,0,3,8,1650,230,2007,0,98122,47.6053,-122.306,1740,1883


In [54]:
eli5.show_prediction(model, test.iloc[0,1:],show_feature_values=True)

Contribution?,Feature,Value
539421.931,<BIAS>,1.0
78412.938,sqft_living,2437.0
10271.5,sqft_living15,2437.0
9725.071,yr_built,2011.0
3725.56,sqft_above,2437.0
129.459,floors,2.0
-31.142,sqft_basement,0.0
-36.586,yr_renovated,0.0
-76.167,condition,3.0
-411.935,bedrooms,3.0
