# ML Workflow - Supervised Learning (Extra Tools)

![Image](./img/scikit_learn.png)


In [None]:
# imports 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

## [Ensemble methods](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble)

__Averaging methods:__ the driving principle is to build several estimators independently and then to average their predictions.

- [RandomForestRegressor()](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
 

__Boosting methods:__ the driving principle is to combine several weak models, built sequentially, to produce a powerful ensemble.

- [GradientBoostingRegressor()](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)


In [None]:
# Synthetic regression data: 1000 samples x 10 features, seeded so that
# every run of the notebook works with exactly the same dataset.

X, y = make_regression(
    n_samples=1000,
    n_features=10,
    random_state=42,
)
# Alternative dataset (uncomment to use it instead of the synthetic one):
# X, y = datasets.load_diabetes(return_X_y=True)
print(X.shape, y.shape)

(1000, 10) (1000,)


In [None]:
%%time

model = RandomForestRegressor(random_state = 42)
#model = GradientBoostingRegressor(random_state = 42)

model.fit(X, y)
y_pred = model.predict(X)

hyperparameters = model.get_params()

print(type(model), '\n')
print('Model hyperparameters:', hyperparameters, '\n')

<class 'sklearn.ensemble._forest.RandomForestRegressor'> 

Model hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False} 

Wall time: 2.26 s


---

## [Cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html)

![Image](./img/cross_validation.jpeg)

In [None]:
%%time

scores = cross_val_score(model, 
                         X, 
                         y, 
                         scoring='neg_root_mean_squared_error', 
                         cv=5,
                         n_jobs=-1)

print(type(model), '\n')
print(scores, '\n')
print(np.mean(-scores), '\n')

<class 'sklearn.ensemble._forest.RandomForestRegressor'> 

[-51.99802694 -58.80892432 -48.32443715 -45.61734778 -57.97155038] 

52.54405731452518 

Wall time: 25 s


In [None]:
%%time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_rmse = RandomForestRegressor(random_state = 42)
#model_rmse = GradientBoostingRegressor(random_state = 42)

model_rmse.fit(X_train, y_train)
y_pred = model_rmse.predict(X_test)

hyperparameters = model_rmse.get_params()

rmse = mean_squared_error(y_test, y_pred)**0.5

print(type(model_rmse), '\n')
print(rmse, '\n')

---

## [GridSearchCV()](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

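Alternatively, you may use [RandomizedSearchCV()](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) if you have limited resources: instead of trying every combination in the grid, it evaluates only a random sample of them. The cells below use `RandomizedSearchCV()`.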

In [None]:
# Hyperparameter search space, using powers of two for both parameters:
#   n_estimators: 16, 32, ..., 512   (2**4 .. 2**9)
#   max_depth:     2,  4,  8,  16    (2**1 .. 2**4)
param_grid = dict(
    n_estimators=[2 ** exponent for exponent in range(4, 10)],
    max_depth=[2 ** exponent for exponent in range(1, 5)],
)

In [None]:
# Randomized hyperparameter search over `param_grid`, scored with (negated)
# RMSE under 5-fold cross-validation and run on every available core.
#
# Only `n_iter` of the candidate combinations are sampled and evaluated, so
# the search is stochastic: `random_state` is fixed to make the sampled
# candidates -- and therefore the reported best hyperparameters -- the same
# on every run, in line with the seeds used elsewhere in this notebook.
grid_search = RandomizedSearchCV(model,
                                 param_grid,
                                 n_iter=10,  # library default, spelled out for clarity
                                 cv=5,
                                 verbose=3,
                                 scoring='neg_root_mean_squared_error',
                                 n_jobs=-1,
                                 random_state=42)

In [None]:
%%time

grid_search.fit(X,y)

print('\n', 'Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', -grid_search.best_score_, '\n')

---