In this assignment students will build the random forest model after normalizing the variable to house pricing from boston data set.

In [1]:
# necessary imports
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier, RandomForestRegressor
from sklearn import datasets
import pydotplus
from IPython.display import Image 
boston = datasets.load_boston() 
features = pd.DataFrame(boston.data, columns=boston.feature_names) 
targets = boston.target 

In [2]:
# adding column names
features.columns = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B 1000", "LSTAT"]
# first five rows from the dataset
features.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B 1000,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [3]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B 1000   506 non-null    float64
 12  LSTAT    506 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB


In [4]:
# creation of train test split
x_train,x_test,y_train,y_test = train_test_split(features,targets, test_size= 0.30, random_state = 355)

In [9]:
# lets use random forest regressor
regressor = RandomForestRegressor(random_state=6, n_jobs=-1)

In [10]:
# training the model
regressor.fit(x_train,y_train)

RandomForestRegressor(n_jobs=-1, random_state=6)

In [11]:
# checking the accuracy
regressor.score(x_test,y_test)

0.8843800512856986

In [12]:
# we are tuning hyperparameters right now, we are passing the different values for each parameter
grid_param = {
    "n_estimators" : [100,115],
    'criterion': ['mse', 'mae'],
    'max_depth' : range(20,25,1),
    'min_samples_leaf' : range(1,5,1),
    'min_samples_split': range(1,10,1),
    'max_features' : ['auto']
}

In [13]:
grid_search = GridSearchCV(estimator=regressor,param_grid=grid_param,cv=5,n_jobs =-1,verbose = 3)

In [14]:
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   40.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed:  8.3min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(n_jobs=-1, random_state=6),
             n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'],
                         'max_depth': range(20, 25), 'max_features': ['auto'],
                         'min_samples_leaf': range(1, 5),
                         'min_samples_split': range(1, 10),
                         'n_estimators': [100, 115]},
             verbose=3)

In [19]:
#let's see the best parameters as per our grid search
grid_search.best_params_

{'criterion': 'mse',
 'max_depth': 22,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [20]:
# using the best parameters
regressor = RandomForestRegressor(bootstrap=True,
    n_estimators=100,
    criterion="mse",
    max_depth=22,
    min_samples_leaf=1,
    min_samples_split=2,
    max_features="auto",
    random_state=6,
    n_jobs=-1)

In [21]:
# training the model
regressor.fit(x_train,y_train)

RandomForestRegressor(max_depth=22, n_jobs=-1, random_state=6)

In [22]:
# accuracy of the model 
regressor.score(x_test,y_test)

0.8843843171530079

There is no improvement after hyperparameter tuning. The accuracy score plain random forest regressor is 88.44%