In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.metrics import mean_squared_error
from math import sqrt

from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
seed = 42

from sklearn import preprocessing
mms = preprocessing.MinMaxScaler()
import warnings
warnings.filterwarnings("ignore")

In [4]:
def _read_without_unnamed(csv_path):
    """Read ``csv_path`` into a DataFrame and drop columns named like '^Unnamed'.

    Presumably those columns are index columns left over from an earlier
    ``to_csv`` call -- verify against the script that produced the files.
    """
    frame = pd.read_csv(csv_path)
    unnamed_mask = frame.columns.str.contains('^Unnamed')
    return frame.loc[:, ~unnamed_mask]


# Training features and training target, each loaded from its own CSV file.
X_train = _read_without_unnamed('X_train.csv')
y_train = _read_without_unnamed('y_train.csv')

In [7]:
# Stage 1 of the random-forest tuning: search over the number of trees and
# the number of features considered at each split.

# Scoring string handed to GridSearchCV here and reused by the later search cells.
scoring = 'neg_mean_squared_error'

# Seed the forest with the notebook-wide `seed` so that re-running this
# search builds the same trees and ranks the candidates the same way.
randomforest = RandomForestRegressor(random_state=seed)

# Scale the features with the shared min-max scaler, then fit the forest.
rf_pipe = Pipeline([
    ('mms', mms),
    ('randomforest', randomforest)])

# Keys use the '<step name>__<parameter>' form to address the forest step.
# NOTE(review): max_features='auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3; for regressors it meant "use all features" (same as 1.0).
# Replace it when upgrading scikit-learn.
params_rf = {
    'randomforest__n_estimators': [50, 100, 200],
    'randomforest__max_features': ['auto', 'sqrt'],
    # max_depth, min_samples_split, min_samples_leaf and bootstrap are
    # searched in the following cells, with the winners from here held fixed.
}

# 3-fold cross-validated search over the grid above.
grid_search_rf = GridSearchCV(rf_pipe, param_grid=params_rf, cv=3, verbose=2, n_jobs=-1, scoring=scoring)
# The target is read as a DataFrame; .values.ravel() flattens it to 1-D for fit().
grid_search_rf.fit(X_train, y_train.values.ravel())
print(grid_search_rf.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed: 74.2min finished


{'randomforest__max_features': 'auto', 'randomforest__n_estimators': 200}


In [8]:
# Stage 2 of the random-forest tuning: n_estimators and max_features are
# pinned to the values printed by the previous search; this cell searches
# tree depth and the minimum number of samples required to split a node.

# Seed the forest with the notebook-wide `seed` so that re-running this
# search builds the same trees and ranks the candidates the same way.
randomforest = RandomForestRegressor(random_state=seed)

# Scale the features with the shared min-max scaler, then fit the forest.
rf_pipe = Pipeline([
    ('mms', mms),
    ('randomforest', randomforest)])

# Single-element lists keep a parameter fixed while still passing it through
# the grid, so it also shows up in best_params_.
# NOTE(review): max_features='auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3; for regressors it meant "use all features" (same as 1.0).
params_rf = {
    'randomforest__n_estimators': [200],
    'randomforest__max_features': ['auto'],
    'randomforest__max_depth': [10, 50, 90, None],
    'randomforest__min_samples_split': [2, 5, 10],
    # min_samples_leaf and bootstrap are searched in the next cell.
}

# 3-fold cross-validated search over the grid above.
grid_search_rf = GridSearchCV(rf_pipe, param_grid=params_rf, cv=3, verbose=2, n_jobs=-1, scoring=scoring)
# The target is read as a DataFrame; .values.ravel() flattens it to 1-D for fit().
grid_search_rf.fit(X_train, y_train.values.ravel())
print(grid_search_rf.best_params_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed: 279.1min finished


{'randomforest__max_depth': 90, 'randomforest__max_features': 'auto', 'randomforest__min_samples_split': 2, 'randomforest__n_estimators': 200}


In [9]:
# Stage 3 of the random-forest tuning: every parameter chosen by the earlier
# searches is pinned; this cell searches the minimum leaf size and whether
# bootstrap sampling is used.

# Seed the forest with the notebook-wide `seed` so that re-running this
# search builds the same trees and ranks the candidates the same way.
randomforest = RandomForestRegressor(random_state=seed)

# Scale the features with the shared min-max scaler, then fit the forest.
rf_pipe = Pipeline([
    ('mms', mms),
    ('randomforest', randomforest)])

# Single-element lists keep a parameter fixed while still passing it through
# the grid, so it also shows up in best_params_.
# NOTE(review): max_features='auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3; for regressors it meant "use all features" (same as 1.0).
params_rf = {
    'randomforest__n_estimators': [200],
    'randomforest__max_features': ['auto'],
    'randomforest__max_depth': [90],
    'randomforest__min_samples_split': [2],
    'randomforest__min_samples_leaf': [1, 2, 4],
    'randomforest__bootstrap': [True, False]
}

# 3-fold cross-validated search over the grid above.
grid_search_rf = GridSearchCV(rf_pipe, param_grid=params_rf, cv=3, verbose=2, n_jobs=-1, scoring=scoring)
# The target is read as a DataFrame; .values.ravel() flattens it to 1-D for fit().
grid_search_rf.fit(X_train, y_train.values.ravel())
print(grid_search_rf.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed: 192.8min finished


{'randomforest__bootstrap': True, 'randomforest__max_depth': 90, 'randomforest__max_features': 'auto', 'randomforest__min_samples_leaf': 1, 'randomforest__min_samples_split': 2, 'randomforest__n_estimators': 200}


In [11]:
import pickle

# Hyper-parameter values matching the best_params_ printed by the final grid
# search, with the 'randomforest__' pipeline-step prefix removed.
rf_params = dict(
    n_estimators=200,
    max_features='auto',
    max_depth=90,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
)

# Serialise the dictionary to a pickle file in the working directory.
Pkl_Filename = "RF_Params.pkl"
with open(Pkl_Filename, mode='wb') as file:
    pickle.dump(rf_params, file)