In [1]:
import numpy as np 
 
import pandas as pd 
 
import matplotlib.pyplot as plt 
 
import seaborn as sns 
 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler 
from sklearn import datasets 
from sklearn.ensemble import RandomForestRegressor

boston = datasets.load_boston() 
 
features = pd.DataFrame(boston.data, columns=boston.feature_names) 
 
targets = boston.target 
 
 

In [2]:
data = features.copy()

In [3]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [4]:
data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB


### normalizing our data as told... but random forest does not need normalization... decision trees robust to outliers

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(data)

In [7]:
x_train,x_test,y_train,y_test = train_test_split(X_scaled,targets,test_size = 0.30, random_state= 355)

In [8]:
rand_reg = RandomForestRegressor(random_state=6)

In [9]:
rand_reg.fit(x_train,y_train)
rand_reg.score(x_test,y_test)

0.8844530957942998

## We got 88.4% accuracy

In [10]:
grid_param = {
    "n_estimators" : [90,100,115,130],
    'criterion': ['mse'],
    'max_depth' : range(2,20,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'max_features' : ['auto','log2']
}

In [11]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=rand_reg,param_grid=grid_param,cv=5,n_jobs =-1,verbose = 3)

In [56]:
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 10368 candidates, totalling 51840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   26.3s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | e

GridSearchCV(cv=5,
             estimator=RandomForestRegressor(max_depth=10, min_samples_split=6,
                                             random_state=6),
             n_jobs=-1,
             param_grid={'criterion': ['mse'], 'max_depth': range(2, 20),
                         'max_features': ['auto', 'log2'],
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10),
                         'n_estimators': [90, 100, 115, 130]},
             verbose=3)

In [12]:
grid_search.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [13]:
rand_reg = RandomForestRegressor( criteiiomax_depth = 12, max_features = 'log2', min_samples_leaf = 1, min_samples_split= 3, n_estimators = 115)

TypeError: __init__() got an unexpected keyword argument 'criteiiomax_depth'

rand_reg = RandomForestRegressor( criterion='mae',
 max_depth = 2,
 max_features = 'auto',
 min_samples_leaf = 1,
 min_samples_split= 2,
 n_estimators = 90)

In [14]:
rand_reg.fit(x_train,y_train)

RandomForestRegressor(random_state=6)

In [15]:
rand_reg.score(x_test,y_test)

0.8844530957942998

In [16]:
from sklearn.model_selection import RandomizedSearchCV

rand_search = RandomizedSearchCV(estimator=rand_reg,param_distributions=grid_param,cv=5,n_jobs =-1,verbose = 3)

In [17]:
rand_search.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    9.9s finished


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=6),
                   n_jobs=-1,
                   param_distributions={'criterion': ['mse'],
                                        'max_depth': range(2, 20),
                                        'max_features': ['auto', 'log2'],
                                        'min_samples_leaf': range(1, 10),
                                        'min_samples_split': range(2, 10),
                                        'n_estimators': [90, 100, 115, 130]},
                   verbose=3)

In [18]:
rand_search.best_estimator_

RandomForestRegressor(max_depth=12, min_samples_leaf=2, min_samples_split=3,
                      n_estimators=115, random_state=6)

In [25]:
rand_reg = RandomForestRegressor(max_depth=10, min_samples_split=6, random_state=6)

rand_reg.fit(x_train,y_train)
rand_reg.score(x_test,y_test)

0.8800899586197339

### performance metrics

In [20]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

y_pred = rand_reg.predict(x_test)
r2_score(y_test,y_pred)

0.8800899586197339

In [21]:
mean_squared_error(y_test,y_pred)

11.357577476030963

In [22]:
mean_absolute_error(y_test,y_pred)

2.4451933330017295