<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#NOTES" data-toc-modified-id="NOTES-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>NOTES</a></span></li><li><span><a href="#Imports" data-toc-modified-id="Imports-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Useful-Scripts" data-toc-modified-id="Useful-Scripts-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Useful Scripts</a></span></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Data-Processing" data-toc-modified-id="Data-Processing-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Data Processing</a></span><ul class="toc-item"><li><span><a href="#Sanity-Check" data-toc-modified-id="Sanity-Check-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Sanity Check</a></span></li><li><span><a href="#Drop-unwanted-columns" data-toc-modified-id="Drop-unwanted-columns-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Drop unwanted columns</a></span></li><li><span><a href="#Create-squared-columns" data-toc-modified-id="Create-squared-columns-6.3"><span class="toc-item-num">6.3&nbsp;&nbsp;</span>Create squared columns</a></span></li><li><span><a href="#Train-test-split" data-toc-modified-id="Train-test-split-6.4"><span class="toc-item-num">6.4&nbsp;&nbsp;</span>Train test split</a></span></li><li><span><a href="#Scaling" data-toc-modified-id="Scaling-6.5"><span class="toc-item-num">6.5&nbsp;&nbsp;</span>Scaling</a></span></li></ul></li><li><span><a href="#Modelling:-Random-Forest" data-toc-modified-id="Modelling:-Random-Forest-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Modelling: Random Forest</a></span></li><li><span><a href="#Grid-Search" data-toc-modified-id="Grid-Search-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Grid Search</a></span></li><li><span><a href="#Randomized-search" data-toc-modified-id="Randomized-search-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Randomized search</a></span></li><li><span><a href="#Feature-Importance" data-toc-modified-id="Feature-Importance-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>Feature Importance</a></span></li><li><span><a href="#Time-Taken" data-toc-modified-id="Time-Taken-11"><span class="toc-item-num">11&nbsp;&nbsp;</span>Time Taken</a></span></li></ul></div>

<div class="alert alert-block alert-success">
<b>Kernel Author:</b>  <br>
<a href="https://bhishanpdl.github.io/" , target="_blank">Bhishan Poudel,  Data Scientist, Ph.D Astrophysics</a> .
</div>

# NOTES
- Always fit on train data and then use fitted model on test data.
- If we have extremely large r-squared value, check for data leakage. e.g. 'log_price' columns

# Imports

In [4]:
import time
time_start_notebook = time.time()

In [5]:
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## install modules
    !pip install scikit-plot
    !pip install lrcurve
    !pip install watermark
    !pip install -U scikit-learn

    ## print
    print('Environment: Google Colaboratory.')

In [22]:
# usual imports
import numpy as np
import pandas as pd

import os
import time
import collections
import itertools
import six
import pickle
import joblib

# random state
SEED = 0
RNG = np.random.RandomState(SEED)

# sklearn
import sklearn
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics

# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Bhishan Poudel 2020-11-04 

CPython 3.7.7
IPython 7.18.1

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

sklearn   0.23.1
pandas    1.1.0
watermark 2.0.2
six       1.15.0
numpy     1.18.4
joblib    0.17.0



# Useful Scripts

In [27]:
def show_methods(obj, ncols=7,start=None, inside=None):
    """ Show all the attributes of a given method.
    Example:
    ========
    show_method_attributes(list)
     """
    lst = [elem for elem in dir(obj) if elem[0]!='_' ]
    lst = [elem for elem in lst 
           if elem not in 'os np pd sys time psycopg2'.split() ]

    if isinstance(start,str):
        lst = [elem for elem in lst if elem.startswith(start)]
        
    if isinstance(start,tuple) or isinstance(start,list):
        lst = [elem for elem in lst for start_elem in start
               if elem.startswith(start_elem)]
        
    if isinstance(inside,str):
        lst = [elem for elem in lst if inside in elem]
        
    if isinstance(inside,tuple) or isinstance(inside,list):
        lst = [elem for elem in lst for inside_elem in inside
               if inside_elem in elem]

    return pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')

def adjustedR2(rsquared,nrows,kcols):
    return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)


def print_reg_metrics(yt,yp,ncols):
    rmse = np.sqrt(sklearn.metrics.mean_squared_error(yt,yp))
    r2 = sklearn.metrics.r2_score(yt, yp)
    ar2 = adjustedR2(r2, len(yt), ncols)

    out = f"""
    RMSE     : {rmse:,.2f}
    R-squared: {r2:,.6f}
    Adj R2   : {ar2:,.6f}
    """
    print(out)

# Parameters

In [7]:
if ENV_COLAB:
    path_raw = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    proj = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_raw + proj

else:
    data_path_parent = '../data/'

target = 'price'
cols_drop = ['id', 'date', 'zipcode_top10']
cols_sq = ['bedrooms','bathrooms','floors','waterfront','view',
    'age','age_after_renovation','log1p_sqft_living','log1p_sqft_lot',
    'log1p_sqft_above','log1p_sqft_basement',
    'log1p_sqft_living15','log1p_sqft_lot15']

train_size = 0.8

In [10]:
target = 'price'

# Load the data

In [8]:
data_path_clean = data_path_parent + 'processed/data_cleaned_encoded.csv'
df = pd.read_csv(data_path_clean)

print(f"df shape : {df.shape}")
display(df.head(2).append(df.tail(2)))

df shape : (21613, 91)


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,age_after_renovation_cat_6,age_after_renovation_cat_7,age_after_renovation_cat_8,age_after_renovation_cat_9,log1p_sqft_living,log1p_sqft_lot,log1p_sqft_above,log1p_sqft_basement,log1p_sqft_living15,log1p_sqft_lot15
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,...,0,0,0,0,7.074117,8.639588,7.074117,0.0,7.201171,8.639588
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,0,0,0,0,7.85205,8.887791,7.682943,5.993961,7.433075,8.941153
21611,291310100,2015-01-16,400000.0,3,2.5,1600,2388,2.0,0,0,...,0,0,0,0,7.378384,7.77863,7.378384,0.0,7.252054,7.160846
21612,1523300157,2014-10-15,325000.0,2,0.75,1020,1076,2.0,0,0,...,0,0,0,0,6.928538,6.981935,6.928538,0.0,6.928538,7.213768


# Data Processing

## Sanity Check
- check for data leakage. eg. 'price', 'log1p_price' on columns
- make sure no Nans
- If you use `log(target)`, do not forget to do `exp(ypreds)` while doing model evaluation.

In [9]:
print(df.columns)

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age',
       'yr_renovated2', 'age_after_renovation', 'zipcode_top10',
       'zipcode_houses', 'basement_bool', 'renovation_bool', 'age_cat',
       'age_after_renovation_cat', 'waterfront_0', 'waterfront_1', 'view_0',
       'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2',
       'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10',
       'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5',
       'grade_6', 'grade_7', 'grade_8', 'grade_9', 'zipcode_top10_98004',
       'zipcode_top10_98006', 'zipcode_top10_98033', 'zipcode_top10_98039',
       'zipcode_top10_98040', 'zipcode_top10_98102', 'zipcode_top10_98105',
       'zipcode_top10_98155', 'zipcode_t

In [16]:
df.filter(regex='price').columns
# there is no data leakage, there is only one target column

Index(['price'], dtype='object')

In [17]:
df.filter(regex='log').columns

Index(['log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above',
       'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15'],
      dtype='object')

## Drop unwanted columns

In [18]:
df = df.drop(cols_drop, axis=1)

## Create squared columns

In [19]:
for col in cols_sq:
    df[col + '_sq'] = df[col]**2

## Train test split

In [23]:
df_Xtrain,df_Xtest,ser_ytrain,ser_ytest = model_selection.train_test_split(
    df.drop([target],axis=1),
    df[target],
    train_size=train_size,
    random_state=SEED)

ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()

## Scaling

In [24]:
scaler = preprocessing.StandardScaler()
scaler.fit(df_Xtrain)
Xtrain = scaler.transform(df_Xtrain)
Xtest  = scaler.transform(df_Xtest)

# Modelling: Random Forest

In [25]:
features = df.drop([target],axis=1).columns

In [30]:
model = RandomForestRegressor(random_state=SEED,n_jobs=-1)
model.fit(Xtrain,ytrain)

ypreds = model.predict(Xtest)
print_reg_metrics(ytest,ypreds,Xtest.shape[-1])


    RMSE     : 122,552.77
    R-squared: 0.888556
    Adj R2   : 0.885944
    


# Grid Search

Most important hyperparameters of Random Forest:

- n_estimators = n of trees
- max_features = max number of features considered for splitting a node
- max_depth = max number of levels in each decision tree
- min_samples_split = min number of data points placed in a node before the node is split
- min_samples_leaf = min number of data points allowed in a leaf node
- bootstrap = method for sampling data points (with or without replacement)

In [32]:
%%time
model = RandomForestRegressor(n_estimators= 50,random_state=SEED)

model.fit(Xtrain,ytrain)

ypreds = model.predict(Xtest)
print_reg_metrics(ytest,ypreds,Xtest.shape[-1])


    RMSE     : 121,626.72
    R-squared: 0.890234
    Adj R2   : 0.887661
    


# Randomized search

In [33]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Number of trees in  forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 200, num = 5)]

# max features
max_features = ['auto', 'sqrt']

# max depth of leaves
max_depth = [int(x) for x in np.linspace(1, 45, num = 3)]

# min samples split
min_samples_split = [5, 10]

# random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split}


pprint(random_grid)

{'max_depth': [1, 23, 45],
 'max_features': ['auto', 'sqrt'],
 'min_samples_split': [5, 10],
 'n_estimators': [20, 65, 110, 155, 200]}


In [36]:
%%time
model = RandomForestRegressor(random_state=SEED)
rf_random = RandomizedSearchCV(model,random_grid,
                               n_iter = 100,
                               cv = 5,
                               verbose=2,
                               random_state=SEED,
                               n_jobs = -1,
                               scoring='neg_mean_squared_error')
# Fit the random search model
# rf_random.fit(Xtrain, ytrain) # comment this

In [35]:
# rf_random.best_params_

"""
{'n_estimators': 110,
 'min_samples_split': 5,
 'max_features': 'auto',
 'max_depth': 45}
"""

{'n_estimators': 110,
 'min_samples_split': 5,
 'max_features': 'auto',
 'max_depth': 45}

In [37]:
params_rf_best = {'n_estimators': 110,
 'min_samples_split': 5,
 'max_features': 'auto',
 'max_depth': 45}

model = RandomForestRegressor(random_state=SEED,**params_rf_best)
model

RandomForestRegressor(max_depth=45, min_samples_split=5, n_estimators=110,
                      random_state=100)

In [38]:
%%time
model.fit(Xtrain,ytrain)

ypreds = model.predict(Xtest)
print_reg_metrics(ytest,ypreds,Xtest.shape[-1])


    RMSE     : 124,313.37
    R-squared: 0.885331
    Adj R2   : 0.882643
    


# Feature Importance

In [40]:
importances = model.feature_importances_
importances[:5]

array([0.00116118, 0.00444788, 0.08525432, 0.00405505, 0.00063537])

In [41]:
df_imp = pd.DataFrame({'feature': features,
                      'importance': importances})

df_imp.sort_values('importance', ascending=False)\
  .style.background_gradient(subset=['importance'])

Unnamed: 0,feature,importance
8,grade,0.324809
14,lat,0.15044
81,log1p_sqft_living,0.088471
2,sqft_living,0.085254
94,log1p_sqft_living_sq,0.071862
15,long,0.063767
98,log1p_sqft_living15_sq,0.009697
22,zipcode_houses,0.009553
13,zipcode,0.009523
16,sqft_living15,0.009467


# Time Taken

In [42]:
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))

Time taken to run whole notebook: 1 hr 16 min 37 secs
