#  Bagging and Random Forest

In [None]:
#Import libraries
import matplotlib.pyplot as plt

from keras.datasets import mnist

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, ShuffleSplit

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

from sklearn.pipeline import Pipeline


# Classification

## Loading MNIST data

In [None]:
# Load the MNIST digits; load_data() returns a (train, test) pair of (images, labels) tuples.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [None]:
# Report the dimensions of every array returned by the loader.
for caption, array in (
    ("Shape of training data", X_train),
    ("Shape of test data", X_test),
    ("Shape of training labels", y_train),
    ("Shape of test labels", y_test),
):
    print(caption, array.shape)

In [None]:
# Flatten every image into one row of 28*28 features and divide by 255
# (presumably the maximum pixel intensity, which would scale values into [0, 1]).
X_train = X_train.reshape(len(X_train), 28*28) / 255
X_test = X_test.reshape(len(X_test), 28*28) / 255

In [None]:
# Re-check the array dimensions; the feature matrices are now two-dimensional.
for caption, array in (
    ("Shape of training data", X_train),
    ("Shape of test data", X_test),
    ("Shape of training labels", y_train),
    ("Shape of test labels", y_test),
):
    print(caption, array.shape)

In [None]:
# Cross-validation scheme: 10 random shuffles, each holding out 20% of the
# samples; the fixed random_state makes the splits repeatable.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

## Some helper functions

In [None]:
def train_classifiers(estimator, X_train, y_train, cv, name):
    """Fit a classifier and print its cross-validated macro-F1 score.

    Parameters
    ----------
    estimator : object with ``fit``; it is fitted in place on the full
        ``X_train`` / ``y_train`` so the caller can use it afterwards.
    X_train, y_train : training features and labels.
    cv : cross-validation splitter handed to ``cross_val_score``.
    name : label for the model used in the printed summary.

    Returns
    -------
    None. The mean and standard deviation of the scores are printed.
    """
    estimator.fit(X_train, y_train)

    f1_scores = cross_val_score(
        estimator, X_train, y_train, scoring='f1_macro', cv=cv
    )
    mean_f1 = f1_scores.mean()
    std_f1 = f1_scores.std()

    print(f"On an average, {name} model has f1 score of "
          f"{mean_f1:.3f} +/- {std_f1:.3f} on the training set.")

In [None]:
def eval(estimator, X_test, y_test):
    """Print a classification report and plot the confusion matrix.

    Parameters
    ----------
    estimator : object with ``predict``; expected to be fitted already.
    X_test : features to predict on.
    y_test : true labels the predictions are compared against.

    Returns
    -------
    None. The report is printed and the matrix is shown with matplotlib.
    """
    # NOTE: this name shadows the builtin ``eval``; it is kept because the
    # rest of the file calls the helper under this name.
    predictions = estimator.predict(X_test)

    print("# Classification report")
    report = classification_report(y_test, predictions)
    print(report)

    print("# Confusion matrix")
    matrix = confusion_matrix(y_test, predictions)
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot()
    plt.title('Confusion matrix')
    plt.show()

## Decision trees for MNIST multiclass classification

In [None]:
# Single decision tree with default hyper-parameters, wrapped in a
# one-step pipeline, then fitted and cross-validated.
decision_tree_pipeline = Pipeline(
    steps=[("classifier", DecisionTreeClassifier())]
)
train_classifiers(
    estimator=decision_tree_pipeline,
    X_train=X_train,
    y_train=y_train.ravel(),
    cv=cv,
    name="decision tree",
)

In [None]:
# Report and plot the decision tree's performance on the test arrays.
eval(decision_tree_pipeline, X_test, y_test)

## MNIST classification with bagging

In [None]:
# Bagging ensemble with default settings, wrapped in a one-step pipeline,
# then fitted and cross-validated.
bagging_pipeline = Pipeline(
    steps=[("Classifier", BaggingClassifier())]
)
train_classifiers(
    estimator=bagging_pipeline,
    X_train=X_train,
    y_train=y_train.ravel(),
    cv=cv,
    name="bagging",
)

In [None]:
# Report and plot the bagging ensemble's performance on the test arrays.
eval(bagging_pipeline, X_test, y_test)

## Random forest 

In [None]:
# Random forest with default hyper-parameters, wrapped in a one-step
# pipeline, then fitted and cross-validated.
rf_pipeline = Pipeline([("classifier", RandomForestClassifier())])
# The summary line must name this model "random forest"; it was previously
# reported as "bagging", making it indistinguishable from the bagging run.
train_classifiers(rf_pipeline, X_train, y_train.ravel(), cv, "random forest")

In [None]:
# Report and plot the random forest's performance on the test arrays.
eval(rf_pipeline, X_test, y_test)

# Regression

In [None]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

from sklearn.model_selection import cross_validate, train_test_split, RandomizedSearchCV, ShuffleSplit
from sklearn.metrics import mean_absolute_error

from sklearn.tree import DecisionTreeRegressor

In [None]:
# Seed NumPy's global random number generator so repeated runs are comparable.
np.random.seed(306)

In [None]:
# Cross-validation scheme for the regression experiments: 10 random
# shuffles, each holding out 20% of the samples, with a fixed seed.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

In [None]:
# Load the California housing data as pandas objects and rescale the target
# by 100 so that errors below are reported in "k" units
# (NOTE(review): assumes the raw target is in units of 100,000 - confirm).
features, labels = fetch_california_housing(return_X_y=True, as_frame=True)
labels *= 100

# Hold out a test set using train_test_split's default proportions.
com_train_features, test_features, com_train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

# Split the remaining data once more into train and dev parts.
train_features, dev_features, train_labels, dev_labels = train_test_split(
    com_train_features, com_train_labels, random_state=42
)


### Some helper functions

In [None]:
def train_regressor(estimator, X_train, y_train, cv, name):
    """Cross-validate a regressor and print its mean absolute error.

    Parameters
    ----------
    estimator : regressor passed to ``cross_validate``.
    X_train, y_train : features and targets used for cross-validation.
    cv : cross-validation splitter.
    name : label for the model used in the printed summary.

    Returns
    -------
    None. Mean and standard deviation of the error are printed for both
    the training splits and the held-out test splits.
    """
    # The fitted per-split estimators were never used, so they are no
    # longer requested; this avoids keeping every fitted model in memory.
    cv_results = cross_validate(estimator,
                                X_train,
                                y_train,
                                cv=cv,
                                scoring="neg_mean_absolute_error",
                                return_train_score=True)

    # The scorer yields negated errors; flip the sign to get the errors.
    cv_train_error = -cv_results['train_score']
    cv_test_error = -cv_results['test_score']

    print(f"On an average, {name} makes an error of "
          f"{cv_train_error.mean():.3f}k +/- {cv_train_error.std():.3f}k on the training set.")

    # This line reports the held-out splits, so it must say "test set".
    print(f"On an average, {name} makes an error of "
          f"{cv_test_error.mean():.3f}k +/- {cv_test_error.std():.3f}k on the test set.")

### Decision tree regressor

In [None]:
# Baseline: a single decision tree with default hyper-parameters.
train_regressor(DecisionTreeRegressor(), com_train_features, com_train_labels, cv, 'decision tree regressor')

### Bagging regressor

In [None]:
# Bagging ensemble with default hyper-parameters.
train_regressor(BaggingRegressor(), com_train_features, com_train_labels, cv, 'bagging regressor')

### RandomForest Regressor

In [None]:
# Random forest with default hyper-parameters.
train_regressor(RandomForestRegressor(), com_train_features, com_train_labels, cv, 'random forest regressor')

### Parameter search for random forest regressor

In [None]:
# Candidate hyper-parameter values sampled by the randomized search.
parameter_distributions = {
    "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
    "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
}

# Try 10 sampled combinations, ranked by negated mean absolute error.
search_cv = RandomizedSearchCV(
    RandomForestRegressor(n_jobs=2),
    param_distributions=parameter_distributions,
    scoring="neg_mean_absolute_error",
    n_iter=10,
    random_state=0,
    n_jobs=2,
)

search_cv.fit(com_train_features, com_train_labels)

# Summarise the search: one row per sampled combination, best first.
columns = [f"param_{name}" for name in parameter_distributions]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(search_cv.cv_results_)
# Scores are negated errors, so flip the sign of the mean to get the error.
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
# A standard deviation does not change when the scores are negated; it was
# previously sign-flipped, which produced a negative spread.
cv_results["std_test_error"] = cv_results["std_test_score"]
cv_results[columns].sort_values(by="mean_test_error")

In [None]:
# search_cv was configured with scoring="neg_mean_absolute_error", so its
# score is negated here to obtain a positive error on the held-out test set.
error = -search_cv.score(test_features, test_labels)
print(f"On average, our random forest regressor makes an error of {error:.2f}k$")