<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Modelling-with-Polynomial-Features-and-Select-Kbest" data-toc-modified-id="Modelling-with-Polynomial-Features-and-Select-Kbest-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Modelling with Polynomial Features and Select Kbest</a></span></li><li><span><a href="#Imports" data-toc-modified-id="Imports-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Useful-Scripts" data-toc-modified-id="Useful-Scripts-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Useful Scripts</a></span></li><li><span><a href="#Single-Script" data-toc-modified-id="Single-Script-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Single Script</a></span></li><li><span><a href="#Searching-Best-k" data-toc-modified-id="Searching-Best-k-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Searching Best k</a></span></li><li><span><a href="#Grid-Search-for-Random-Forest" data-toc-modified-id="Grid-Search-for-Random-Forest-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Grid Search for Random Forest</a></span></li><li><span><a href="#Use-the-best-parameters-from-grid-search" data-toc-modified-id="Use-the-best-parameters-from-grid-search-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Use the best parameters from grid search</a></span></li></ul></div>

# Modelling with Polynomial Features and Select Kbest

# Imports

In [1]:
import numpy as np
import pandas as pd

import os
import time
import collections
import itertools

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score

# random state
random_state=100
np.random.seed(random_state) # we need this in each cell
np.random.set_state=random_state

# Useful Scripts

In [2]:
def adjustedR2(rsquared,nrows,kcols):
    return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)

In [3]:
def add_interactions(df):
    from itertools import combinations
    from sklearn.preprocessing import PolynomialFeatures

    # Get feature names
    combos = list(combinations(list(df.columns), 2))
    colnames = list(df.columns) + ['_'.join(x) for x in combos]
    
    # Find interactions
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    df = poly.fit_transform(df)
    df = pd.DataFrame(df)
    df.columns = colnames
    
    # Remove interaction terms with all 0 values            
    noint_indicies = [i for i, x in enumerate(list((df == 0).all())) if x]
    df = df.drop(df.columns[noint_indicies], axis=1)
    
    return df

# Single Script

In [11]:
# data
df = pd.read_csv('../data/processed/data_cleaned_encoded.csv')

features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'basement_bool', 'renovation_bool', 'zipcode_houses', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9', 'zipcode_top10_98004', 'zipcode_top10_98006', 'zipcode_top10_98033', 'zipcode_top10_98039', 'zipcode_top10_98040', 'zipcode_top10_98102', 'zipcode_top10_98105', 'zipcode_top10_98155', 'zipcode_top10_98177', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_4', 'age_after_renovation_cat_5', 'age_after_renovation_cat_6', 'age_after_renovation_cat_7', 'age_after_renovation_cat_8', 'age_after_renovation_cat_9']

target = ['price']

# add interaction features to X (excluding target)
df_Xlarge = add_interactions(df[features])
df_large = pd.concat([df_Xlarge, df[target]], axis=1)

# train test split of very large dataframe
df_train_large, df_test_large = train_test_split(df_large, test_size=0.2,
                                                 random_state=random_state)

# fit the select KBest on training data to get best features
def regression_modelling_with_kbest(k):
    model_kbest = SelectKBest(k=k)
    model_kbest = model_kbest.fit(df_train_large.drop(target,1).to_numpy(),
                                  df_train_large[target].to_numpy().ravel())

    idx_kbest = model_kbest.get_support(indices=True)
    cols_kbest = [df_train_large.columns[i] for i in idx_kbest]


    # get numpy arrays using best features
    Xtrain_kbest = df_train_large[cols_kbest].to_numpy()
    ytrain_kbest = df_train_large[target].to_numpy().ravel()

    Xtest_kbest = df_test_large[cols_kbest].to_numpy()
    ytest_kbest = df_test_large[target].to_numpy().ravel()


    # model
    model = RandomForestRegressor(n_estimators=100,verbose=0,
                                 random_state=random_state,n_jobs=-1)
    # model = LinearRegression()

    # fitting
    model.fit(Xtrain_kbest,ytrain_kbest)

    # prediction
    ypreds = model.predict(Xtest_kbest)

    # model evaluation
    r2 = r2_score(ytest_kbest,ypreds)
    ar2 = adjustedR2(r2,Xtest_kbest.shape[0],Xtest_kbest.shape[1])
    print('k = ', k)
    #print('R-squared Value for Test = ', round(r2,3))
    print('Adjusted R-squared Value for Test = ', round(ar2,3))

# Searching Best k

In [10]:
import warnings
warnings.filterwarnings("ignore")

for k in [441]:
    regression_modelling_with_kbest(k)

k =  443
Adjusted R-squared Value for Test =  0.861
k =  444
Adjusted R-squared Value for Test =  0.857


```
k =  400
Adjusted R-squared Value for Test =  0.828

k =  420
Adjusted R-squared Value for Test =  0.853

k =  430
Adjusted R-squared Value for Test =  0.859

k =  438
Adjusted R-squared Value for Test =  0.86

k =  439
Adjusted R-squared Value for Test =  0.858

k =  440
Adjusted R-squared Value for Test =  0.86

k =  441
Adjusted R-squared Value for Test =  0.862

k =  442
Adjusted R-squared Value for Test =  0.859

k =  443
Adjusted R-squared Value for Test =  0.861

k =  444
Adjusted R-squared Value for Test =  0.857

k =  445
Adjusted R-squared Value for Test =  0.855

k =  460
Adjusted R-squared Value for Test =  0.858

k =  480
Adjusted R-squared Value for Test =  0.855

k =  500
Adjusted R-squared Value for Test =  0.857

k =  600
Adjusted R-squared Value for Test =  0.853

k =  620
Adjusted R-squared Value for Test =  0.854

k =  640
Adjusted R-squared Value for Test =  0.849

k =  660
Adjusted R-squared Value for Test =  0.851

k =  680
Adjusted R-squared Value for Test =  0.847

k =  700
Adjusted R-squared Value for Test =  0.85
```

In [19]:
k = 441
model_kbest = SelectKBest(k=k)
model_kbest = model_kbest.fit(df_train_large.drop(target,1).to_numpy(),
                              df_train_large[target].to_numpy().ravel())

idx_kbest = model_kbest.get_support(indices=True)
cols_kbest = [df_train_large.columns[i] for i in idx_kbest]


# get numpy arrays using best features
Xtrain_kbest = df_train_large[cols_kbest].to_numpy()
ytrain_kbest = df_train_large[target].to_numpy().ravel()

Xtest_kbest = df_test_large[cols_kbest].to_numpy()
ytest_kbest = df_test_large[target].to_numpy().ravel()
    
# model
model = RandomForestRegressor(n_estimators=100,verbose=0,
                             random_state=random_state,n_jobs=-1)

# fitting
model.fit(Xtrain_kbest,ytrain_kbest)

# prediction
ypreds = model.predict(Xtest_kbest)

# model evaluation
r2 = r2_score(ytest_kbest,ypreds)
ar2 = adjustedR2(r2,Xtest_kbest.shape[0],Xtest_kbest.shape[1])
print('k = ', k)
#print('R-squared Value for Test = ', round(r2,3))
print('Adjusted R-squared Value for Test = ', round(ar2,3))

k =  441
Adjusted R-squared Value for Test =  0.862


# Grid Search for Random Forest

In [26]:
from sklearn.model_selection import RandomizedSearchCV

t0 = time.time()

model = RandomForestRegressor(random_state=random_state)


param_dist = {'n_estimators': [40,60,80,100,120,140],
 'max_features': [2,10,20], 
 'max_depth': [10, 50, None],
 'bootstrap': [True, False]}

grid_search_forest = RandomizedSearchCV(model, param_distributions=param_dist,
                                        cv=5,
                                        iid=False,
                                        n_jobs=-1,
                                        scoring='r2',
                                        verbose=1)

grid_search_forest.fit(Xtrain_kbest, ytrain_kbest)

t1 = time.time() - t0
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(t1,60)))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   36.6s finished


Time taken: 0 min 40 secs


In [27]:
# from sklearn.model_selection import GridSearchCV

# t0 = time.time()

# model = RandomForestRegressor(random_state=random_state)


# param_grid = [
# {'n_estimators': [40,60,80,100,120,140],
#  'max_features': [2,10,20], 
#  'max_depth': [10, 50, None],
#  'bootstrap': [True, False]}
# ]

# grid_search_forest = GridSearchCV(model,
#                                   param_grid,
#                                   cv=5,
#                                   n_jobs=-1,
#                                   scoring='r2',
#                                   verbose=1)

# grid_search_forest.fit(Xtrain_kbest, ytrain_kbest)

# t1 = time.time() - t0
# print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(t1,60)))

# Use the best parameters from grid search

In [28]:
grid_search_forest.best_estimator_

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=50,
                      max_features=20, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=40,
                      n_jobs=None, oob_score=False, random_state=100, verbose=0,
                      warm_start=False)

In [29]:
grid_search_forest.best_params_

{'n_estimators': 40, 'max_features': 20, 'max_depth': 50, 'bootstrap': False}

In [30]:
k = 441
model_kbest = SelectKBest(k=k)
model_kbest = model_kbest.fit(df_train_large.drop(target,1).to_numpy(),
                              df_train_large[target].to_numpy().ravel())

idx_kbest = model_kbest.get_support(indices=True)
cols_kbest = [df_train_large.columns[i] for i in idx_kbest]


# get numpy arrays using best features
Xtrain_kbest = df_train_large[cols_kbest].to_numpy()
ytrain_kbest = df_train_large[target].to_numpy().ravel()

Xtest_kbest = df_test_large[cols_kbest].to_numpy()
ytest_kbest = df_test_large[target].to_numpy().ravel()
    
# model
model = RandomForestRegressor(n_estimators=40,verbose=0,
                             random_state=random_state,n_jobs=-1,
                             max_features=20,
                             max_depth=50,
                             bootstrap=False)

# fitting
model.fit(Xtrain_kbest,ytrain_kbest)

# prediction
ypreds = model.predict(Xtest_kbest)

# model evaluation
r2 = r2_score(ytest_kbest,ypreds)
ar2 = adjustedR2(r2,Xtest_kbest.shape[0],Xtest_kbest.shape[1])
print('k = ', k)
#print('R-squared Value for Test = ', round(r2,3))
print('Adjusted R-squared Value for Test = ', round(ar2,3))

k =  441
Adjusted R-squared Value for Test =  0.814


In [32]:
k = 441
model_kbest = SelectKBest(k=k)
model_kbest = model_kbest.fit(df_train_large.drop(target,1).to_numpy(),
                              df_train_large[target].to_numpy().ravel())

idx_kbest = model_kbest.get_support(indices=True)
cols_kbest = [df_train_large.columns[i] for i in idx_kbest]


# get numpy arrays using best features
Xtrain_kbest = df_train_large[cols_kbest].to_numpy()
ytrain_kbest = df_train_large[target].to_numpy().ravel()

Xtest_kbest = df_test_large[cols_kbest].to_numpy()
ytest_kbest = df_test_large[target].to_numpy().ravel()
    
# model
model = RandomForestRegressor(n_estimators= 100,random_state=random_state,
                              max_features=69,
                              max_depth=50, bootstrap=True)

# fitting
model.fit(Xtrain_kbest,ytrain_kbest)

# prediction
ypreds = model.predict(Xtest_kbest)

# model evaluation
r2 = r2_score(ytest_kbest,ypreds)
ar2 = adjustedR2(r2,Xtest_kbest.shape[0],Xtest_kbest.shape[1])
print('k = ', k)
#print('R-squared Value for Test = ', round(r2,3))
print('Adjusted R-squared Value for Test = ', round(ar2,3))

k =  441
Adjusted R-squared Value for Test =  0.843
