<a href="https://colab.research.google.com/github/carvalheirafc/mnist-MachineLearning-Supervised/blob/main/boosting_bagging_mnist_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree
import time

### Data Input and Miscellaneous Setup


In [2]:
# Load the full MNIST images and their labels from the mnist_datasets folder
# under the mounted './drive/My Drive' directory.
data_X, data_y = map(np.load, (
    './drive/My Drive/mnist_datasets/mnist_X.npy',
    './drive/My Drive/mnist_datasets/mnist_labels.npy',
))

In [3]:
# Cut images and labels into two equal, contiguous chunks (no shuffling);
# index 0 of each result is the first half of the data.
half_X, half_y = (np.array(np.split(full, 2)) for full in (data_X, data_y))

print('Mnist 50% Data: \n')
print('X: ', half_X[0].shape)
print('y: ', half_y[0].shape)

Mnist 50% Data: 

X:  (30000, 1, 28, 28)
y:  (30000, 1)


In [4]:
# Cut images and labels into five equal, contiguous chunks (no shuffling);
# index 0 of each result is the first 20% of the data.
smaller_X, smaller_y = (np.array(np.split(full, 5)) for full in (data_X, data_y))

print('Mnist 20% Data: \n')
print('X: ', smaller_X[0].shape)
print('y: ', smaller_y[0].shape)

Mnist 20% Data: 

X:  (12000, 1, 28, 28)
y:  (12000, 1)


In [5]:
# Load the pre-built two-class subset. Entry 0 is printed below as the zeros
# and entry 1 as the ones; inside each entry, index 0 feeds X and index 1 feeds y.
# NOTE: allow_pickle=True unpickles Python objects from the file, so only load
# files from a trusted source.
data_two_classes = np.load(
    './drive/My Drive/mnist_datasets/two_classes_data.npy',
    allow_pickle=True,
)

# Stack the two classes along the sample axis: part 0 -> X, part 1 -> y.
two_classes_X, two_classes_y = (
    np.concatenate((data_two_classes[0][part], data_two_classes[1][part]), axis=0)
    for part in (0, 1)
)

print('Mnist Two Classes: \n')
print('Zeros: ', data_two_classes[0][0].shape)
print('Ones:  ', data_two_classes[1][0].shape)
print('X:     ', two_classes_X.shape)
print('y:     ', two_classes_y.shape)


Mnist Two Classes: 

Zeros:  (5923, 28, 28)
Ones:   (6742, 28, 28)
X:      (12665, 28, 28)
y:      (12665,)


### Hold-Out Sets


In [6]:
# Hold-out partitions of the full data set at 90/10, 80/20 and 70/30.
# Every split uses the same random_state so the partitions are reproducible.
(ho_original_91_train_X, ho_original_91_test_X,
 ho_original_91_train_y, ho_original_91_test_y) = train_test_split(
    data_X, data_y, train_size=0.9, random_state=555)

(ho_original_82_train_X, ho_original_82_test_X,
 ho_original_82_train_y, ho_original_82_test_y) = train_test_split(
    data_X, data_y, train_size=0.8, random_state=555)

(ho_original_73_train_X, ho_original_73_test_X,
 ho_original_73_train_y, ho_original_73_test_y) = train_test_split(
    data_X, data_y, train_size=0.7, random_state=555)

# Report the training-set shape of each partition.
print('Hold-Out Original Data Split\n')
print('9/1: ', ho_original_91_train_X.shape)
print('8/2: ', ho_original_82_train_X.shape)
print('7/3: ', ho_original_73_train_X.shape)

Hold-Out Original Data Split

9/1:  (54000, 1, 28, 28)
8/2:  (48000, 1, 28, 28)
7/3:  (42000, 1, 28, 28)


In [7]:
# Hold-out partitions of the two-class (0 vs 1) subset at 90/10, 80/20 and 70/30.
# Every split uses the same random_state so the partitions are reproducible.
(ho_two_class_91_train_X, ho_two_class_91_test_X,
 ho_two_class_91_train_y, ho_two_class_91_test_y) = train_test_split(
    two_classes_X, two_classes_y, train_size=0.9, random_state=555)

(ho_two_class_82_train_X, ho_two_class_82_test_X,
 ho_two_class_82_train_y, ho_two_class_82_test_y) = train_test_split(
    two_classes_X, two_classes_y, train_size=0.8, random_state=555)

(ho_two_class_73_train_X, ho_two_class_73_test_X,
 ho_two_class_73_train_y, ho_two_class_73_test_y) = train_test_split(
    two_classes_X, two_classes_y, train_size=0.7, random_state=555)

# Report the training-set shape of each partition.
print('Hold-Out Mnist Two Classes[0, 1] Split\n')
print('9/1: ', ho_two_class_91_train_X.shape)
print('8/2: ', ho_two_class_82_train_X.shape)
print('7/3: ', ho_two_class_73_train_X.shape)

Hold-Out Mnist Two Classes[0, 1] Split

9/1:  (11398, 28, 28)
8/2:  (10132, 28, 28)
7/3:  (8865, 28, 28)


In [8]:
# Hold-out partitions of the first 20% chunk of the data (smaller_X[0] / smaller_y[0])
# at 90/10, 80/20 and 70/30. Every split uses the same random_state so the
# partitions are reproducible.
(ho_smaller_91_train_X, ho_smaller_91_test_X,
 ho_smaller_91_train_y, ho_smaller_91_test_y) = train_test_split(
    smaller_X[0], smaller_y[0], train_size=0.9, random_state=555)

(ho_smaller_82_train_X, ho_smaller_82_test_X,
 ho_smaller_82_train_y, ho_smaller_82_test_y) = train_test_split(
    smaller_X[0], smaller_y[0], train_size=0.8, random_state=555)

(ho_smaller_73_train_X, ho_smaller_73_test_X,
 ho_smaller_73_train_y, ho_smaller_73_test_y) = train_test_split(
    smaller_X[0], smaller_y[0], train_size=0.7, random_state=555)

# The banner previously said "Half Data", but these splits come from the
# five-way (20%) chunk, not the two-way (50%) one.
print('Hold-Out Smaller (20%) Data Split\n')
print('9/1: ', ho_smaller_91_train_X.shape)
print('8/2: ', ho_smaller_82_train_X.shape)
print('7/3: ', ho_smaller_73_train_X.shape)

Hold-Out Half Data Split

9/1:  (10800, 1, 28, 28)
8/2:  (9600, 1, 28, 28)
7/3:  (8400, 1, 28, 28)


### Base Decision Tree Classifier

In [9]:
# Base learner shared by the bagging and boosting experiments below.
# ccp_alpha is scikit-learn's cost-complexity pruning parameter; 0.0 is the
# library default (the original note described it as the "confidence factor").
default_tree_classifier = tree.DecisionTreeClassifier(ccp_alpha=0.0)

### Bagging

In [10]:
# Bagging over the default decision tree on the two-class (0 vs 1) data set,
# evaluated on the 70/30 hold-out split for several ensemble sizes.
estimators = [10, 15, 20]
for estimator in estimators:
  # The base learner is passed positionally: its keyword was renamed from
  # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
  # removed in 1.4, so the positional form runs on both old and new releases.
  clf = BaggingClassifier(default_tree_classifier, n_estimators=estimator, random_state=555)
  print('Bagging Estimators: ', estimator)
  # Label fixed: the base model is a decision tree, not an MLP with hidden layers.
  print('Default Decision Tree | Two Class Data-set | 7/3 Holdout')
  # perf_counter is a monotonic clock intended for measuring elapsed intervals.
  start_time = time.perf_counter()
  # reshape(n_samples, -1) flattens each image into a single feature row.
  clf.fit(ho_two_class_73_train_X.reshape(ho_two_class_73_train_X.shape[0], -1), ho_two_class_73_train_y)
  print("Execution Time: %s seconds" % (time.perf_counter() - start_time))
  ho_two_class_73_pred_y = clf.predict(ho_two_class_73_test_X.reshape(ho_two_class_73_test_X.shape[0], -1))
  matrix = metrics.confusion_matrix(ho_two_class_73_test_y, ho_two_class_73_pred_y)
  # Accuracy = correctly classified samples (confusion-matrix diagonal) / total samples.
  print('Acurácia:', np.trace(matrix) / len(ho_two_class_73_test_y) * 100, '%')
  print('-------------------------------------------------- \n\n')

Bagging Estimators:  10
HiddenLayers (100,) | Two Class Data-set | 7/3 Holdout
Execution Time: 4.683970928192139 seconds
Acurácia: 99.68421052631578 %
-------------------------------------------------- 


Bagging Estimators:  15
HiddenLayers (100,) | Two Class Data-set | 7/3 Holdout
Execution Time: 7.23342752456665 seconds
Acurácia: 99.60526315789474 %
-------------------------------------------------- 


Bagging Estimators:  20
HiddenLayers (100,) | Two Class Data-set | 7/3 Holdout
Execution Time: 9.605384588241577 seconds
Acurácia: 99.6578947368421 %
-------------------------------------------------- 




In [11]:
# Bagging over the default decision tree on the smaller (20%) data set,
# evaluated on the 70/30 hold-out split for several ensemble sizes.
estimators = [10, 15, 20]
for estimator in estimators:
  # The base learner is passed positionally: its keyword was renamed from
  # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
  # removed in 1.4, so the positional form runs on both old and new releases.
  clf = BaggingClassifier(default_tree_classifier, n_estimators=estimator, random_state=555)
  print('Bagging Estimators: ', estimator)
  # Label fixed: the base model is a decision tree, not an MLP with hidden layers.
  print('Default Decision Tree | Smaller Data-set | 7/3 Holdout')
  # perf_counter is a monotonic clock intended for measuring elapsed intervals.
  start_time = time.perf_counter()
  # reshape(n_samples, -1) flattens each image into a single feature row;
  # ravel() turns the (n_samples, 1) label column into the 1-D vector fit expects.
  clf.fit(ho_smaller_73_train_X.reshape(ho_smaller_73_train_X.shape[0], -1), ho_smaller_73_train_y.ravel())
  print("Execution Time: %s seconds" % (time.perf_counter() - start_time))
  ho_smaller_73_pred_y = clf.predict(ho_smaller_73_test_X.reshape(ho_smaller_73_test_X.shape[0], -1))
  # Test labels are flattened the same way as the training labels for consistency.
  matrix = metrics.confusion_matrix(ho_smaller_73_test_y.ravel(), ho_smaller_73_pred_y)
  # Accuracy = correctly classified samples (confusion-matrix diagonal) / total samples.
  print('Acurácia:', np.trace(matrix) / len(ho_smaller_73_test_y) * 100, '%')
  print('-------------------------------------------------- \n\n')

Bagging Estimators:  10
HiddenLayers (100,)  | Smaller Data-set | 7/3 Holdout
Execution Time: 11.036916971206665 seconds
Acurácia: 90.77777777777779 %
-------------------------------------------------- 


Bagging Estimators:  15
HiddenLayers (100,)  | Smaller Data-set | 7/3 Holdout
Execution Time: 16.661185264587402 seconds
Acurácia: 91.25 %
-------------------------------------------------- 


Bagging Estimators:  20
HiddenLayers (100,)  | Smaller Data-set | 7/3 Holdout
Execution Time: 22.186914682388306 seconds
Acurácia: 92.11111111111111 %
-------------------------------------------------- 




### Boosting 

In [13]:
# AdaBoost over the default decision tree on the two-class (0 vs 1) data set,
# evaluated on the 70/30 hold-out split for several ensemble sizes.
#
# NOTE(review): the recorded run shows the same fit time (~1.1 s) and the same
# accuracy for 10, 15 and 20 estimators. That is consistent with AdaBoost
# stopping early once the unpruned tree fits the training set with zero error,
# which would make n_estimators irrelevant here — TODO confirm with
# len(clf.estimators_) and consider a depth-limited weak learner.
estimators = [10, 15, 20]
for estimator in estimators:
  # The base learner is passed positionally: its keyword was renamed from
  # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
  # removed in 1.4, so the positional form runs on both old and new releases.
  clf = AdaBoostClassifier(default_tree_classifier, n_estimators=estimator, random_state=555)
  print('Boosting Estimators: ', estimator)
  # Label fixed: the base model is a decision tree, not an MLP with hidden layers.
  print('Default Decision Tree | Two Class Data-set | 7/3 Holdout')
  # perf_counter is a monotonic clock intended for measuring elapsed intervals.
  start_time = time.perf_counter()
  # reshape(n_samples, -1) flattens each image into a single feature row.
  clf.fit(ho_two_class_73_train_X.reshape(ho_two_class_73_train_X.shape[0], -1), ho_two_class_73_train_y)
  print("Execution Time: %s seconds" % (time.perf_counter() - start_time))
  ho_two_class_73_pred_y = clf.predict(ho_two_class_73_test_X.reshape(ho_two_class_73_test_X.shape[0], -1))
  matrix = metrics.confusion_matrix(ho_two_class_73_test_y, ho_two_class_73_pred_y)
  # Accuracy = correctly classified samples (confusion-matrix diagonal) / total samples.
  print('Acurácia:', np.trace(matrix) / len(ho_two_class_73_test_y) * 100, '%')
  print('-------------------------------------------------- \n\n')

Boosting Estimators:  10
HiddenLayers (100,) | Two Class Data-set | 7/3 Holdout
Execution Time: 1.1108434200286865 seconds
Acurácia: 99.5 %
-------------------------------------------------- 


Boosting Estimators:  15
HiddenLayers (100,) | Two Class Data-set | 7/3 Holdout
Execution Time: 1.1161816120147705 seconds
Acurácia: 99.5 %
-------------------------------------------------- 


Boosting Estimators:  20
HiddenLayers (100,) | Two Class Data-set | 7/3 Holdout
Execution Time: 1.105041742324829 seconds
Acurácia: 99.5 %
-------------------------------------------------- 


