In [None]:
import matplotlib.pyplot as plt

from keras.datasets import mnist
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, ShuffleSplit
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.pipeline import Pipeline

In [None]:
# Load the MNIST train and test splits (images plus digit labels) via Keras.
mnist_train, mnist_test = mnist.load_data()
X_train, y_train = mnist_train
X_test, y_test = mnist_test

In [None]:
# Sanity-check the array shapes right after loading.
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

In [None]:
# Flatten every image into one feature row (28 * 28 pixels -> 784 features) and
# divide the pixel values by 255.
# The row count is taken from the array itself instead of being hard-coded, and
# the ndim guard keeps the cell idempotent: re-running it on already-flattened
# data no longer divides by 255 a second time.
if X_train.ndim == 3:
  X_train = X_train.reshape(len(X_train), -1) / 255
if X_test.ndim == 3:
  X_test = X_test.reshape(len(X_test), -1) / 255

In [None]:
# Confirm the shapes after flattening.
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

In [None]:
# Ten random splits, each holding out 20% of the rows; seeded so the splits
# are the same on every run.
cv = ShuffleSplit(
  n_splits=10,
  test_size=0.2,
  random_state=42,
)

In [None]:
def train_classifiers(estimator, X_train, y_train, cv, name):
  """Fit ``estimator`` on the training data and print its cross-validated macro-F1.

  Args:
    estimator: Object exposing ``fit``; it is fitted in place on all of
      ``X_train``/``y_train``.
    X_train: Training features.
    y_train: Training labels.
    cv: Cross-validation strategy passed through to ``cross_val_score``.
    name: Model label interpolated into the printed summary.

  Returns:
    None. The mean and standard deviation of the per-split scores are printed.
  """
  # Fit before scoring so the statement order (and any RNG use) matches the
  # sequence callers already rely on.
  estimator.fit(X_train, y_train)
  fold_f1 = cross_val_score(estimator, X_train, y_train, cv=cv, scoring='f1_macro')
  mean_f1, std_f1 = fold_f1.mean(), fold_f1.std()
  print(f"On an average, {name} model has f1 score of {mean_f1:.3f} +/- {std_f1:.3f} on training set")

In [None]:
def eval(estimator, X_test, y_test):
  """Print a classification report and plot the confusion matrix for ``estimator``.

  Note: this helper shadows Python's built-in ``eval`` inside the notebook; the
  name is kept because other cells call it.

  Args:
    estimator: Fitted object exposing ``predict``.
    X_test: Features to predict on.
    y_test: True labels compared against the predictions.

  Returns:
    None. The report is printed and the confusion-matrix figure is shown.
  """
  y_pred = estimator.predict(X_test)
  print(classification_report(y_test, y_pred))
  disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred))
  disp.plot()
  # The title used to start with a stray C-cedilla; spelling corrected.
  plt.title('Confusion Matrix')
  plt.show()

## Decision Trees on MNIST multiclass classification

In [None]:
# A fixed random_state makes the fitted tree and its scores repeatable; no
# global seed has been set at this point in the notebook, so without it the
# tree's randomized choices would differ from run to run.
dt_pipeline = Pipeline([('classifier', DecisionTreeClassifier(random_state=42))])
train_classifiers(dt_pipeline, X_train, y_train, cv, "decision tree")

In [None]:
# Held-out test report and confusion matrix for the decision-tree pipeline
# (`eval` here is the notebook helper, not the Python builtin).
eval(estimator=dt_pipeline, X_test=X_test, y_test=y_test)

## MNIST classification with Bagging

In [None]:
# Bagging resamples the training data at random; pin random_state so the
# ensemble and its scores are repeatable (no global seed is set before this cell).
bagging_pipeline = Pipeline([('classifier', BaggingClassifier(random_state=42))])
train_classifiers(bagging_pipeline, X_train, y_train, cv, 'bagging')

In [None]:
# Held-out test report and confusion matrix for the bagging pipeline
# (`eval` here is the notebook helper, not the Python builtin).
eval(estimator=bagging_pipeline, X_test=X_test, y_test=y_test)

## MNIST classification with Random Forest

In [None]:
# Random forests are randomized; pin random_state so the forest and its scores
# are repeatable (no global seed is set before this cell).
rf_pipeline = Pipeline([('classifier', RandomForestClassifier(random_state=42))])
train_classifiers(rf_pipeline, X_train, y_train, cv, 'random forest')

In [None]:
# Held-out test report and confusion matrix for the random-forest pipeline
# (`eval` here is the notebook helper, not the Python builtin).
eval(estimator=rf_pipeline, X_test=X_test, y_test=y_test)

# California Housing Dataset

In [None]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_validate, train_test_split, RandomizedSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeRegressor

# Seed NumPy's global RNG so the randomized estimators below, which are built
# without an explicit random_state, behave the same on a fresh top-to-bottom run.
np.random.seed(seed=306)

In [None]:
# Ten random splits, each holding out 20% of the rows; seeded so the splits
# are the same on every run.
cv = ShuffleSplit(
  n_splits=10,
  test_size=0.2,
  random_state=42,
)

In [None]:
# Load the California housing data as a pandas feature frame and target series.
features, labels = fetch_california_housing(
  as_frame=True,
  return_X_y=True,
)
# Rescale the target by 100; later cells report errors with a "k" suffix.
# NOTE(review): presumably this converts units of $100,000 into $1,000 - confirm
# against the dataset description.
labels *= 100

In [None]:
# First split: hold out a final test set from the full data.
com_train_features, test_features, com_train_labels, test_labels = train_test_split(
  features, labels, random_state=42
)
# Second split: carve a dev set out of the combined training portion.
train_features, dev_features, train_labels, dev_labels = train_test_split(
  com_train_features, com_train_labels, random_state=42
)

In [None]:
def train_regressor(estimator, X_train, y_train, cv, name):
  """Cross-validate ``estimator`` and print its mean absolute error.

  Args:
    estimator: Regressor handed to ``cross_validate``.
    X_train: Features used for cross-validation.
    y_train: Targets used for cross-validation.
    cv: Cross-validation strategy passed through to ``cross_validate``.
    name: Model label interpolated into the printed summary.

  Returns:
    None. Two lines are printed: the error on the training part of each split
    and the error on the held-out part (labelled "test set" in the output).
  """
  # The fitted per-split estimators were never read, so they are no longer
  # requested (return_estimator dropped); this avoids keeping them in memory.
  cv_results = cross_validate(estimator, X_train, y_train, cv=cv,
                              scoring='neg_mean_absolute_error', return_train_score=True)
  # The scorer reports negated MAE; flip the sign to get the error itself.
  cv_train_error = -1 * (cv_results['train_score'])
  cv_test_error = -1 * (cv_results['test_score'])
  print(f'On an average, {name} makes an error of {cv_train_error.mean():.3f}k +/- {cv_train_error.std():.3f}k on the training set')
  print(f'On an average, {name} makes an error of {cv_test_error.mean():.3f}k +/- {cv_test_error.std():.3f}k on the test set')

In [None]:
# Baseline: a single decision tree with default settings.
train_regressor(
  estimator=DecisionTreeRegressor(),
  X_train=com_train_features,
  y_train=com_train_labels,
  cv=cv,
  name='decision tree regressor',
)

In [None]:
# Bagging ensemble with default settings.
train_regressor(
  estimator=BaggingRegressor(),
  X_train=com_train_features,
  y_train=com_train_labels,
  cv=cv,
  name='bagging regressor',
)

In [None]:
# Random forest with default settings.
train_regressor(
  estimator=RandomForestRegressor(),
  X_train=com_train_features,
  y_train=com_train_labels,
  cv=cv,
  name='random forest regressor',
)

## Hyper-parameter tuning

In [None]:
# Candidate values sampled by the randomized search.
param_distributions = {
  'n_estimators': [1, 2, 5, 10, 20, 40, 100, 200, 500],
  'max_leaf_nodes': [2, 5, 10, 20, 50, 100],
}
# Try 10 sampled configurations, scored by (negated) mean absolute error.
search_cv = RandomizedSearchCV(
  estimator=RandomForestRegressor(),
  param_distributions=param_distributions,
  scoring='neg_mean_absolute_error',
  n_iter=10,
  random_state=10,
)

search_cv.fit(com_train_features, com_train_labels)

In [None]:
# Summarize the randomized search: one row per sampled configuration, sorted
# with the lowest mean error first.
columns = [f'param_{name}' for name in param_distributions.keys()]
columns += ['mean_test_error', 'std_test_error']
cv_results = pd.DataFrame(search_cv.cv_results_)
# Scores are negated MAE, so flip the sign to recover the error.
cv_results['mean_test_error'] = -cv_results['mean_test_score']
# A standard deviation is unchanged by negating the scores; it was previously
# negated too, which produced a meaningless negative spread.
cv_results['std_test_error'] = cv_results['std_test_score']
cv_results[columns].sort_values(by='mean_test_error')

In [None]:
# The search was configured with a negated-MAE scorer, so flip the sign of the
# score to report the error on the held-out test split.
error = -1 * search_cv.score(test_features, test_labels)
print(f'On average, our random forest regressor makes an error of {error: .2f}k$')

# MNIST classification with AdaBoost and GradientBoost

In [None]:
import matplotlib.pyplot as plt

from keras.datasets import mnist
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, ShuffleSplit
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.pipeline import Pipeline

In [None]:
# Reload the MNIST train and test splits (images plus digit labels) via Keras.
mnist_train, mnist_test = mnist.load_data()
X_train, y_train = mnist_train
X_test, y_test = mnist_test

In [None]:
# Sanity-check the array shapes right after loading.
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

In [None]:
# Flatten every image into one feature row (28 * 28 pixels -> 784 features) and
# divide the pixel values by 255.
# The row count is taken from the array itself instead of being hard-coded, and
# the ndim guard keeps the cell idempotent: re-running it on already-flattened
# data no longer divides by 255 a second time.
if X_train.ndim == 3:
  X_train = X_train.reshape(len(X_train), -1) / 255
if X_test.ndim == 3:
  X_test = X_test.reshape(len(X_test), -1) / 255

In [None]:
# Confirm the shapes after flattening.
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

In [None]:
# Ten random splits, each holding out 20% of the rows; seeded so the splits
# are the same on every run.
cv = ShuffleSplit(
  n_splits=10,
  test_size=0.2,
  random_state=42,
)

In [None]:
def train_classifiers(estimator, X_train, y_train, cv, name):
  """Fit ``estimator`` on the training data and print its cross-validated macro-F1.

  Args:
    estimator: Object exposing ``fit``; it is fitted in place on all of
      ``X_train``/``y_train``.
    X_train: Training features.
    y_train: Training labels.
    cv: Cross-validation strategy passed through to ``cross_val_score``.
    name: Model label interpolated into the printed summary.

  Returns:
    None. The mean and standard deviation of the per-split scores are printed.
  """
  # Fit before scoring so the statement order (and any RNG use) matches the
  # sequence callers already rely on.
  estimator.fit(X_train, y_train)
  fold_f1 = cross_val_score(estimator, X_train, y_train, cv=cv, scoring='f1_macro')
  mean_f1, std_f1 = fold_f1.mean(), fold_f1.std()
  print(f"On an average, {name} model has f1 score of {mean_f1:.3f} +/- {std_f1:.3f} on training set")

In [None]:
def eval(estimator, X_test, y_test):
  """Print a classification report and plot the confusion matrix for ``estimator``.

  Note: this helper shadows Python's built-in ``eval`` inside the notebook; the
  name is kept because other cells call it.

  Args:
    estimator: Fitted object exposing ``predict``.
    X_test: Features to predict on.
    y_test: True labels compared against the predictions.

  Returns:
    None. The report is printed and the confusion-matrix figure is shown.
  """
  y_pred = estimator.predict(X_test)
  print(classification_report(y_test, y_pred))
  disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred))
  disp.plot()
  # The title used to start with a stray C-cedilla; spelling corrected.
  plt.title('Confusion Matrix')
  plt.show()

## AdaBoostClassifier

In [None]:
# Single-step pipeline around AdaBoost with its default settings.
abc_pipeline = Pipeline(steps=[('abc_classifier', AdaBoostClassifier())])
train_classifiers(
  estimator=abc_pipeline,
  X_train=X_train,
  y_train=y_train,
  cv=cv,
  name='AdaBoostClassifier',
)

In [None]:
# Held-out test report and confusion matrix for the AdaBoost pipeline
# (`eval` here is the notebook helper, not the Python builtin).
eval(estimator=abc_pipeline, X_test=X_test, y_test=y_test)

## GradientBoostingClassifier

In [None]:
# Single-step pipeline around gradient boosting with its default settings.
gbc_pipeline = Pipeline(steps=[('gbc_classifier', GradientBoostingClassifier())])
train_classifiers(
  estimator=gbc_pipeline,
  X_train=X_train,
  y_train=y_train,
  cv=cv,
  name='GradientBoostingClassifier',
)

In [None]:
# Held-out test report and confusion matrix for the gradient-boosting pipeline
# (`eval` here is the notebook helper, not the Python builtin).
eval(estimator=gbc_pipeline, X_test=X_test, y_test=y_test)

## XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Single-step pipeline around XGBoost's classifier with its default settings.
xbc_pipeline = Pipeline(steps=[('xbc_classifier', XGBClassifier())])
train_classifiers(
  estimator=xbc_pipeline,
  X_train=X_train,
  y_train=y_train,
  cv=cv,
  name='XGBClassifier',
)

In [None]:
# Held-out test report and confusion matrix for the XGBoost pipeline
# (`eval` here is the notebook helper, not the Python builtin).
eval(estimator=xbc_pipeline, X_test=X_test, y_test=y_test)

# California Housing with AdaBoost and GradientBoost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV, ShuffleSplit
from sklearn.metrics import mean_absolute_error

In [None]:
# Seed NumPy's global RNG so the randomized estimators below, which are built
# without an explicit random_state, behave the same on a fresh top-to-bottom run.
np.random.seed(seed=306)

In [None]:
# Ten random splits, each holding out 20% of the rows; seeded so the splits
# are the same on every run.
cv = ShuffleSplit(
  n_splits=10,
  test_size=0.2,
  random_state=42,
)

In [None]:
# Load the California housing data as a pandas feature frame and target series.
features, labels = fetch_california_housing(
  as_frame=True,
  return_X_y=True,
)
# Rescale the target by 100; later cells report errors with a "k" suffix.
# NOTE(review): presumably this converts units of $100,000 into $1,000 - confirm
# against the dataset description.
labels *= 100

In [None]:
# First split: hold out a final test set from the full data.
com_train_features, test_features, com_train_labels, test_labels = train_test_split(
  features, labels, random_state=42
)
# Second split: carve a dev set out of the combined training portion.
train_features, dev_features, train_labels, dev_labels = train_test_split(
  com_train_features, com_train_labels, random_state=42
)

In [None]:
def train_regressor(estimator, X_train, y_train, cv, name):
  """Cross-validate ``estimator`` and print its mean absolute error.

  Args:
    estimator: Regressor handed to ``cross_validate``.
    X_train: Features used for cross-validation.
    y_train: Targets used for cross-validation.
    cv: Cross-validation strategy passed through to ``cross_validate``.
    name: Model label interpolated into the printed summary.

  Returns:
    None. Two lines are printed: the error on the training part of each split
    and the error on the held-out part (labelled "test set" in the output).
  """
  # The fitted per-split estimators were never read, so they are no longer
  # requested (return_estimator dropped); this avoids keeping them in memory.
  cv_results = cross_validate(estimator, X_train, y_train, cv=cv,
                              scoring='neg_mean_absolute_error', return_train_score=True)
  # The scorer reports negated MAE; flip the sign to get the error itself.
  cv_train_error = -1 * (cv_results['train_score'])
  cv_test_error = -1 * (cv_results['test_score'])
  print(f'On an average, {name} makes an error of {cv_train_error.mean():.3f}k +/- {cv_train_error.std():.3f}k on the training set')
  print(f'On an average, {name} makes an error of {cv_test_error.mean():.3f}k +/- {cv_test_error.std():.3f}k on the test set')

## AdaBoostRegressor

In [None]:
# AdaBoost regressor with default settings.
train_regressor(
  estimator=AdaBoostRegressor(),
  X_train=com_train_features,
  y_train=com_train_labels,
  cv=cv,
  name='AdaBoostRegressor',
)

In [None]:
# Gradient-boosting regressor with default settings.
train_regressor(
  estimator=GradientBoostingRegressor(),
  X_train=com_train_features,
  y_train=com_train_labels,
  cv=cv,
  name='GradientBoostingRegressor',
)

In [None]:
from xgboost import XGBRegressor
# XGBoost regressor with an explicit squared-error objective.
xgb = XGBRegressor(objective='reg:squarederror')
train_regressor(
  estimator=xgb,
  X_train=com_train_features,
  y_train=com_train_labels,
  cv=cv,
  name='XGBRegressor',
)