In [4]:

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [5]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    r2 = r2_score(y_pred=predictions, y_true=test_labels)

    print('Model Performance')
    print('R2 = {:0.2f}%.'.format(r2))

    return [r2]


In [6]:
# %%
boston = pd.read_csv("Boston.csv")
X = boston.drop(columns=["zn", "medv"])
y = boston["medv"]

In [20]:
boston.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [7]:
# Split data into a 70/30 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=123)

rf = RandomForestRegressor()

In [13]:
# Create the parameter grid based on the results of random search
# This section can be modified to look into other possible parameters.
param_grid = {
    'max_depth': [80, 90 , 100, 200, None],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'n_estimators': [10, 50, 100, 200, 300, 500, 1000]
}

grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,  # parameters to be tuned
                           cv=10,
                           n_jobs=-1,  # -1 means use all available cores
                           verbose=2)

In [14]:
# Perform CV search over grid of hyperparameters
grid_search.fit(X_train, y_train)

print("Best CV accuracy: {}, with parameters: {}".format(
    grid_search.best_score_, grid_search.best_params_))

best_model = grid_search.best_estimator_
r2 = evaluate(best_model, X_test, y_test)




Fitting 10 folds for each of 140 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 178 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 381 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 664 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1029 tasks      | elapsed:  3.6min


Best CV accuracy: 0.875893602002763, with parameters: {'max_depth': 200, 'max_features': 'log2', 'n_estimators': 100}
Model Performance
R2 = 0.85%.


[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:  5.1min finished


In [16]:
# Now that we know the best hyperparameter, we refit our RF model
# using them and the full dataset at our disposal
best_params = grid_search.best_params_
full_model = RandomForestRegressor(**best_params)
full_fit = full_model.fit(X, y)

In [18]:
full_fit

RandomForestRegressor(max_depth=200, max_features='log2')