In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import keras

In [2]:
np.random.seed(9)

In [3]:
df = keras.datasets.mnist.load_data(path="mnist.npz")

In [4]:
train, test = df[0], df[1]

In [5]:
train_x, train_y = train[0], train[1]

In [6]:
test_x, test_y = test[0], test[1]

In [8]:
train_x = train_x.reshape(-1, 784)
test_x = test_x.reshape(-1, 784)

In [9]:
print(train_x.shape)
print(test_x.shape)

(60000, 784)
(10000, 784)


In [87]:
# Initial 1/n for w.
init_w = 1/test_x.shape[0]
clfs = {}

for i in range(20):
    print(f'Running {i+1}')

    # Weight matrix
    weights = np.ones(test_x.shape[0]) * init_w

    # Initialization of classifier
    clf = DecisionTreeClassifier()
    clf.fit(train_x, train_y)
    print(np.round(clf.score(test_x, test_y) * 100, 2))
    # Get prediction and compare
    preds = clf.predict(test_x)
    
    # Total number of cases where pred is not equal to real label
    misclass_idx = test_y != preds
    miscalssification = ((test_y != preds) * weights)
    
    # Total cases
    total = test_y.shape[0]
    
    # Error
    error = np.sum(miscalssification)/total
    
    error_ratio = (1-error)/error
    lerner_error = 1/2*(np.log2(error_ratio))
    
    weights[misclass_idx] = weights[misclass_idx]*np.exp(lerner_error)
    weights = (weights - np.mean(weights)) / np.std(weights)
    clfs[i] = [clf, lerner_error, preds]

Running 1
87.77
Running 2
88.0
Running 3
87.81
Running 4
88.01
Running 5
87.9
Running 6
88.07
Running 7
87.6
Running 8
87.64
Running 9
87.48
Running 10
87.76
Running 11
87.91
Running 12
87.71
Running 13
87.81
Running 14
87.79
Running 15
87.76
Running 16
87.9
Running 17
87.81
Running 18
87.78
Running 19
87.8
Running 20
87.77


In [128]:
out = np.ones(test_x.shape[0])
lerners = []
avgs = []
for i, j in clfs.items():
    preds = j[2]
    lerner_error = j[1]
    lerners.append(lerner_error)
    avgs.append(preds * lerner_error)

In [129]:
len(lerners)

20

In [130]:
avgs

[array([57.11719449, 16.31919843,  8.15959921, ..., 32.63839685,
        40.79799607, 48.95759528]),
 array([57.21306065, 16.34658876,  8.17329438, ..., 32.69317751,
        40.86647189, 49.03976627]),
 array([57.13373667, 16.32392476,  8.16196238, ..., 32.64784952,
        40.8098119 , 48.97177429]),
 array([57.21727031, 16.34779152,  8.17389576, ..., 32.69558304,
        40.86947879, 49.04337455]),
 array([57.1711559 , 16.33461597,  8.16730799, ..., 32.66923194,
        40.83653993, 49.00384791]),
 array([57.24260227, 16.35502922,  8.17751461, ..., 32.71005844,
        40.88757305, 49.06508766]),
 array([57.04748863, 16.29928246,  8.14964123, ..., 32.59856493,
        40.74820616, 48.89784739]),
 array([57.06380365, 16.3039439 ,  8.15197195, ..., 32.6078878 ,
        40.75985975, 48.9118317 ]),
 array([56.99885747, 16.28538785,  8.14269392, ..., 32.5707757 ,
        40.71346962, 48.85616355]),
 array([57.1130674 , 16.31801926,  8.15900963, ..., 32.63603852,
        40.79504814, 48.95

In [131]:
clfs[0][2]

array([7, 2, 1, ..., 4, 5, 6], dtype=uint8)

In [144]:
npavgs = np.average(np.array(avgs), axis = 0)/np.average(lerners)

In [146]:
accuracy_score(test_y, np.int64(npavgs))

0.8479