## Bagging and Random Forest Regressor on california husoing dataset

In [2]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor 

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_validate 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit

from sklearn.tree import DecisionTreeRegressor

In [3]:
np.random.seed(306)

In [4]:
features, labels = fetch_california_housing(as_frame= True, return_X_y= True)
labels *= 100

com_train_features, dev_train_features, com_train_labels, dev_test_labels = train_test_split(features, labels, random_state= 42)

In [5]:
def train_regressor(estimator, x_train, y_train, cv, name):
    cv_result = cross_validate(estimator, 
                               x_train, 
                               y_train,
                               cv= cv,
                               scoring='neg_mean_absolute_error',
                               return_estimator= True,
                               return_train_score= True)
    cv_train_error = -1 * cv_result['train_score']
    cv_test_error = -1 * cv_result['test_score']

    print(f'On an average, {name} make an error of ',
          f'{cv_train_error.mean():.3f}k +/- {cv_train_error.std():.3f}k on the training set.')
    print(f'on an average, {name} makes an error of ',
          f'{cv_test_error.mean():.3f}k +/- {cv_test_error.std():.3f}k on the test set.')

In [6]:
cv= ShuffleSplit(n_splits= 5)


In [7]:
train_regressor(DecisionTreeRegressor(), com_train_features, com_train_labels, cv, 'decision tree regressor')

On an average, decision tree regressor make an error of  0.000k +/- 0.000k on the training set.
on an average, decision tree regressor makes an error of  46.616k +/- 1.430k on the test set.


In [8]:
train_regressor(
    BaggingRegressor(), com_train_features, com_train_labels,
    cv, 'bagging regressor'
)

On an average, bagging regressor make an error of  14.159k +/- 0.247k on the training set.
on an average, bagging regressor makes an error of  35.151k +/- 1.286k on the test set.


In [9]:
train_regressor(
    RandomForestRegressor(), com_train_features, com_train_labels,
    cv, 'Random Froest regressor'
)

On an average, Random Froest regressor make an error of  12.457k +/- 0.034k on the training set.
on an average, Random Froest regressor makes an error of  33.112k +/- 0.773k on the test set.


### Parameter search for random forest regressor

In [11]:
param_distribution = {
    'n_estimators': [1, 2, 5, 10, 20, 50, 100, 200, 500],
    'max_leaf_nodes': [2, 5, 10, 20, 50, 100]
}

In [17]:
search_cv = RandomizedSearchCV(
    estimator=RandomForestRegressor(), param_distributions=param_distribution,
                       scoring= 'neg_mean_absolute_error', n_iter= 10, n_jobs= -1,
                       )

In [18]:
search_cv.fit(com_train_features, com_train_labels)

In [23]:
columns= [f'params_{name}' for name in param_distribution.keys()]
columns += ['mean_test_error', 'std_test_error']
cv_results = pd.DataFrame(search_cv.cv_results_)
cv_results['mean_test_error'] = -cv_results['mean_test_score']
cv_results['std_test_error'] = cv_results['std_test_score']
# cv_results[columns].sort_values(by='mean_test_error')
cv_results.sort_values(by='mean_test_error')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_leaf_nodes,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,mean_test_error,std_test_error
2,5.039439,0.423358,0.023687,0.00156,50,100,"{'n_estimators': 50, 'max_leaf_nodes': 100}",-41.749087,-39.690829,-40.767141,-40.049271,-41.226962,-40.696658,0.751611,1,40.696658,0.751611
7,0.920336,0.008389,0.007111,0.000201,10,100,"{'n_estimators': 10, 'max_leaf_nodes': 100}",-43.309804,-40.441779,-41.027689,-40.776149,-42.322303,-41.575545,1.076095,2,41.575545,1.076095
5,0.511934,0.038287,0.005434,0.000474,5,100,"{'n_estimators': 5, 'max_leaf_nodes': 100}",-43.569246,-40.228232,-41.750003,-41.80836,-41.952251,-41.861618,1.059005,3,41.861618,1.059005
6,42.309256,0.684748,0.148714,0.018285,500,50,"{'n_estimators': 500, 'max_leaf_nodes': 50}",-44.861668,-42.529193,-43.510944,-43.794366,-44.569308,-43.853096,0.825035,4,43.853096,0.825035
3,4.349425,0.220653,0.022009,0.002776,50,50,"{'n_estimators': 50, 'max_leaf_nodes': 50}",-44.667984,-42.276557,-43.65579,-43.892667,-44.798472,-43.858294,0.903727,5,43.858294,0.903727
9,28.273892,4.412363,0.120694,0.029757,500,20,"{'n_estimators': 500, 'max_leaf_nodes': 20}",-50.526051,-47.678526,-49.136099,-49.781845,-50.445601,-49.513624,1.046484,6,49.513624,1.046484
0,5.382911,0.391847,0.028006,0.001338,100,10,"{'n_estimators': 100, 'max_leaf_nodes': 10}",-55.67179,-52.985064,-55.480267,-55.30831,-55.8424,-55.057566,1.05167,7,55.057566,1.05167
1,0.06413,0.013159,0.003472,0.000273,1,10,"{'n_estimators': 1, 'max_leaf_nodes': 10}",-58.450888,-56.659274,-57.986288,-58.816297,-59.278567,-58.238263,0.896467,8,58.238263,0.896467
4,1.33144,0.132297,0.009866,0.000322,50,2,"{'n_estimators': 50, 'max_leaf_nodes': 2}",-73.50802,-71.114916,-73.379013,-72.877396,-73.850965,-72.946062,0.967461,9,72.946062,0.967461
8,0.500475,0.007437,0.006012,0.000142,20,2,"{'n_estimators': 20, 'max_leaf_nodes': 2}",-73.790327,-71.557302,-73.345966,-72.758929,-74.151624,-73.12083,0.90962,10,73.12083,0.90962
