In [1]:
import numpy as np
import pandas as pd

In [2]:
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url , sep = '\s+' , skiprows = 22 , header = None)
data = np.hstack([raw_df.values[::2 , :] , raw_df.values[1::2 , :2]])
target = raw_df.values[1::2 , 2]

In [3]:
#splitting data into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data,target,test_size=0.3)

Building a random forest model

In [4]:
from sklearn import ensemble
dt = ensemble.RandomForestRegressor(n_estimators = 5 , max_depth = 3)
dt.fit(x_train , y_train)
print('Training score: ' , dt.score(x_train , y_train))
print('Test score: ',dt.score(x_test , y_test))

Training score:  0.8525184238966521
Test score:  0.7788287379003003


Cross validation

In [5]:
from sklearn.model_selection import cross_val_score
scores1 = cross_val_score(ensemble.RandomForestRegressor(n_estimators=5,max_depth=3),x_train,y_train,cv=10)
np.average(scores1)

0.7582625655584981

In [6]:
ensemble.RandomForestRegressor().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [7]:
model=ensemble.RandomForestRegressor()
from sklearn.model_selection import GridSearchCV
parameters=[{'n_estimators':[20,30,40,60,100], 'max_depth':  
             [5,10,15,20]},
             {'n_estimators':[20,30,40,60,100], 'max_depth': 
             [5,10,15,20],'max_features':[2,5,8]}]
             
grid_search = GridSearchCV(estimator=model,
                           param_grid=parameters,
                           cv=10,
                           n_jobs=-1)
                           
grid = grid_search.fit(x_train,y_train)
grid.best_score_

0.8650088963436182

In [8]:
grid.best_params_

{'max_depth': 20, 'max_features': 5, 'n_estimators': 100}

In [9]:
#trying the best hyperparameters suggested by GridSearchCV
#{'max_depth': 10, 'max_features': 5, 'n_estimators': 100}

from sklearn import ensemble
dt_1=ensemble.RandomForestRegressor(n_estimators=100, max_depth=10, max_features = 5)
dt_1.fit(x_train,y_train)
print('training score: ', dt_1.score(x_train,y_train))
print('test score: ',dt_1.score(x_test,y_test))

training score:  0.9795252365773697
test score:  0.8536906842659009


# Random Search CV

In [10]:
from sklearn.model_selection import RandomizedSearchCV
model=ensemble.RandomForestRegressor()
param_grid=[{'n_estimators':[20,30,40,60,100], 'max_depth':[5,10,15,20] 
          },{'n_estimators':[20,30,40,60,100], 'max_depth':[5,10,15,20]
             ,'max_features':[2,5,8]}]

rnd_search = RandomizedSearchCV(model, param_grid, cv=10, 
                                          return_train_score=True)
rnd_search.fit(x_train,y_train)
rnd_search.best_score_

0.8628749222275539

In [11]:
rnd_search.best_params_

{'n_estimators': 100, 'max_features': 5, 'max_depth': 20}

In [12]:
from sklearn import ensemble
dt=ensemble.RandomForestRegressor(n_estimators=20, max_depth=10, max_features = 8)
dt.fit(x_train,y_train)
print('training score: ', dt.score(x_train,y_train))
print('test score: ',dt.score(x_test,y_test))

training score:  0.9756366227108105
test score:  0.8375252928431178


# Save and Load ML Models

1) Finalize Your Model with pickle

2) Finalize Your Model with Joblib

# 1) Finalize Your Model with pickle

In [13]:
import pickle

In [14]:
pickle.dump(dt_1 , open('model.pkl','wb'))

In [15]:
pickled_model = pickle.load(open('model.pkl' , 'rb'))

In [16]:
pickled_model.predict(x_test)

array([43.58069167, 19.52357399, 27.15505211, 17.23942702, 22.55740799,
       10.12193991, 34.61857143, 33.27361833, 21.55705926, 24.16831192,
       44.1668    , 16.84603333, 32.166     , 14.63719455, 23.69379114,
       15.25694476, 19.98922062, 17.51224718, 20.23081833, 31.11230525,
       19.13205402, 27.47703824, 24.33688184, 37.15402222, 36.02341111,
       31.56343579, 25.54851305, 17.69234067, 25.9974132 , 26.51791454,
       19.4706914 , 14.13353146, 19.60444082, 20.33973998, 19.34043055,
       10.29688663, 31.97216746, 15.83761673,  9.72684643, 36.69133333,
       10.00352029, 23.11966903, 21.33507661, 18.87388846, 17.90959401,
       11.67946777, 14.94841172, 17.18894824, 23.35190528, 20.66114448,
       20.62169932, 23.73528639, 34.54464683, 13.54950073, 17.27531784,
       21.02262107, 34.52913651, 19.59898463, 21.69206172, 32.30230952,
       23.96570671, 24.4135795 , 14.67286911, 16.16366145, 14.22009699,
       24.38186736, 10.00277121, 20.75657514, 13.37336797, 19.40

# Finalize Your Model with Joblib

pip install joblib

In [17]:
import joblib
joblib.dump(dt_1 , 'joblib_model')

['joblib_model']

In [18]:
jobLib = joblib.load('joblib_model')

In [19]:
jobLib.predict(x_test)

array([43.58069167, 19.52357399, 27.15505211, 17.23942702, 22.55740799,
       10.12193991, 34.61857143, 33.27361833, 21.55705926, 24.16831192,
       44.1668    , 16.84603333, 32.166     , 14.63719455, 23.69379114,
       15.25694476, 19.98922062, 17.51224718, 20.23081833, 31.11230525,
       19.13205402, 27.47703824, 24.33688184, 37.15402222, 36.02341111,
       31.56343579, 25.54851305, 17.69234067, 25.9974132 , 26.51791454,
       19.4706914 , 14.13353146, 19.60444082, 20.33973998, 19.34043055,
       10.29688663, 31.97216746, 15.83761673,  9.72684643, 36.69133333,
       10.00352029, 23.11966903, 21.33507661, 18.87388846, 17.90959401,
       11.67946777, 14.94841172, 17.18894824, 23.35190528, 20.66114448,
       20.62169932, 23.73528639, 34.54464683, 13.54950073, 17.27531784,
       21.02262107, 34.52913651, 19.59898463, 21.69206172, 32.30230952,
       23.96570671, 24.4135795 , 14.67286911, 16.16366145, 14.22009699,
       24.38186736, 10.00277121, 20.75657514, 13.37336797, 19.40