In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from scipy.sparse import coo_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import copy
import pandas as pd
from tqdm import tqdm
from typing import Counter
import math
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, recall_score, precision_score
import sys
import time


def load_data():
    data = loadmat("data/mnist_all.mat")

    # print(data.keys())

    train_data = pd.DataFrame()
    test_data = pd.DataFrame()

    for i in range(10):
        temp_df = pd.DataFrame(data["train" + str(i)])
        temp_df['label'] = i
        train_data = train_data.append(temp_df)
        temp_df = pd.DataFrame(data["test" + str(i)])
        temp_df['label'] = i
        test_data = test_data.append(temp_df)

    train_data = shuffle(train_data)
    test_data = shuffle(test_data)

    train_labels = np.array(train_data['label'])
    test_labels = np.array(test_data['label'])

    train_data = train_data.drop('label', axis=1)
    test_data = test_data.drop('label', axis=1)
    
    train_data = np.array(train_data) / 255
    test_data = np.array(test_data) / 255
    
    pca = PCA(0.95)
    pca.fit(train_data)
    train_data = pca.transform(train_data)
    test_data = pca.transform(test_data)

    return train_data, test_data, train_labels, test_labels


X_train, X_test, y_train, y_test = load_data()

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0)

In [2]:
# 注意，经过PCA降维后，认为所有的特征都是连续值
X = np.concatenate((X_train, X_valid), axis=0)
y = np.concatenate((y_train, y_valid), axis=0)

In [3]:
print(X.shape, y.shape)

(60000, 154) (60000,)


In [4]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

sklearn_adaboost = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=150,
    algorithm="SAMME")
sklearn_adaboost.fit(X[:1000], y[:1000])

AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=DecisionTreeClassifier(max_depth=2),
                   n_estimators=150)

In [5]:
predict_sklearn = sklearn_adaboost.predict(X_test)

In [6]:
print('sklearn test acc: {}'.format((sum(predict_sklearn == np.array(y_test)))/len(X_test)))

sklearn test acc: 0.7325


In [7]:
class CART_without_pruning:
    def __init__(self, epsilon):
        self.epsilon = epsilon
    
    
    def fit(self, X, y):
        self.tree = self.build_tree(X, y)
        
          
    def build_tree(self, X, y):
        # 如果所有的X都属于一个y_i
        temp_y = list(set(y))
        if len(temp_y) == 1:
            return temp_y[0]
 
        # 如果没有可以选择的划分属性？
        # 不会存在这种情况，因为连续属性可以重复用作划分属性
        
        best_feature_index, threshold, best_gini_index = self.choose_best_feature_to_split(X, y)
        
        if best_gini_index < self.epsilon:
            return Counter(y).most_common(1)[0][0]
        
        tree = {}
        x1, y1, x2, y2 = self.split_data(X, y, best_feature_index, threshold)
        tree[(best_feature_index, threshold, '<=')] = self.build_tree(x1, y1)
        tree[(best_feature_index, threshold, '>')] = self.build_tree(x2, y2)
        
        return tree
    
    
    def split_data(self, X, y, best_feature_index, threshold):
        
        x1, x2, y1, y2 = [], [], [], []
        
        for i in range(len(X)):
            if X[i][best_feature_index] <= threshold:
                x1.append(X[i])
                y1.append(y[i])
            else:
                x2.append(X[i])
                y2.append(y[i])
        
        return np.array(x1), np.array(y1), np.array(x2), np.array(y2)
    
    
    def predict(self, x):
        tree = self.tree
        
        while type(tree).__name__ == 'dict':
            
            for key in tree.keys():
                if key[2] == '<=':
                    key1 = key
                elif key[2] == '>':
                    key2 = key
                    
                
            feature_index = key1[0]
            threshold = key1[1]
            
            if x[feature_index] <= threshold:
                tree = tree[key1]
            elif x[feature_index] > threshold:
                tree = tree[key2]

        
        if type(tree).__name__ == 'int64' or type(tree).__name__ == 'int32':
            return tree
        else:
            pass
    
    
    def calculate_gini_index(self, feature_index, X, y):
        values = []
        for i in range(len(X)):
            values.append(X[i][feature_index])
        
        values = list(set(values))
        values.sort()
        
        best_gini_index = sys.maxsize
        best_threshold = None
        
        for i in range(len(values) - 1):
            threshold = (values[i] + values[i + 1])/2
            
            # D1和D2的作用是分别计算 <=threshold 和 >threshold 的X的数量
            D1, D2 = 0, 0
            
            # d1和d2的作用是分别计算D1和D2中各类标签（0-9）的数量
            d1, d2 = [0]*10, [0]*10
            
            for i in range(len(X)):
                if X[i][feature_index] <= threshold:
                    D1 += 1
                    d1[y[i]] += 1
                elif X[i][feature_index] > threshold:
                    D2 += 1
                    d2[y[i]] += 1
            
            # 下面计算gini index
            gini_D1 = 0
            gini_D2 = 0
            
            for i in range(10):
                gini_D1 += math.pow(d1[i]/D1, 2)
                gini_D2 += math.pow(d2[i]/D2, 2)
                
            gini_D1 = 1 - gini_D1
            gini_D2 = 1 - gini_D2
            
            gini_index = gini_D1*D1/len(X) + gini_D2*D2/len(X)
            
            if gini_index < best_gini_index:
                best_gini_index = gini_index
                best_threshold = threshold
                
        return best_gini_index, best_threshold
        
        
    # 根据西瓜书：
    # 需注意的是，与离散属性不同，若当前结点划分属性为连续属性，该属性还可作为其后代结点的划分属性
    # 因此不需要记录哪些属性（特征）已经使用过了
    def choose_best_feature_to_split(self, X, y):
        feature_num = X.shape[1]
        
        best_feature_index = -1
        best_gini_index = sys.maxsize
        best_feature_threshold = None
        
        for feature_index in range(feature_num):
            gini_index, threshold = self.calculate_gini_index(feature_index, X, y)
            
            if gini_index < best_gini_index:
                best_gini_index = gini_index
                best_feature_index = feature_index
                best_feature_threshold = threshold
        
        # 不会发生
        if best_feature_index == -1:
            pass
        
        return best_feature_index, best_feature_threshold, best_gini_index

In [8]:
class AdaBoost:
    def __init__(self, n_estimators):
        self.n_estimators = n_estimators
    
    def fit(self, X, y):
        self.N = X.shape[0]
        D = np.array([1/self.N]*self.N)
        
        self.estimators = []
        self.alpha = []
        
        for i in range(self.n_estimators):
            estimator = CART_without_pruning(epsilon=0.001)
            estimator.fit(X, y)
            
            G_m_x = []
            e_m = 0
            for i in range(self.N):
                temp_y = estimator.predict(X[i])
                
                G_m_x.append(temp_y)
                if temp_y != y[i]:
                    e_m += D[i]
            
            alpha_m = np.log((1-e_m)/e_m)/2
                
            Z_m = 0
            for i in range(self.N):
                Z_m += D[i]*np.exp(-alpha_m*y[i]*G_m_x[i])
            
            _D = []
            for i in range(self.N):
                _D.append(np.exp(-alpha_m*y[i]*G_m_x[i])*D[i]/Z_m)
            
            D = copy.deepcopy(_D)
            
            self.alpha.append(alpha_m)
            self.estimators.append(estimator)
            
    
    def predict(self, x):
        
        y = 0
        for i in range(self.n_estimators):
            y += self.alpha[i] * self.estimators[i].predict(x)
            
        return np.sign(y)

In [9]:
def make_data_of_digit_i_and_j(x, y, digit_i, digit_j):
    pos_sample_num = 0
    neg_sample_num = 0
    _data = []
    
    for i in range(len(x)):
        if y[i] == digit_i and pos_sample_num < 100:
            _data.append([x[i], 1])
            pos_sample_num += 1

        elif y[i] == digit_j and neg_sample_num < 100:
            neg_sample_num += 1
            _data.append([x[i], -1])

        else:
            continue
        
    shuffle(_data)
    data =  []
    labels =[]
    
    for i in range(len(_data)):
        data.append(_data[i][0])
        labels.append(_data[i][1])
    
    return data, labels

In [10]:
train_data, train_labels = make_data_of_digit_i_and_j(X, y, 1, 2)
test_data, test_labels = make_data_of_digit_i_and_j(X_test, y_test, 1, 2)
train_data = np.array(train_data)
test_data = np.array(test_data)

In [11]:
custom_adaboost = AdaBoost(n_estimators=3)
custom_adaboost.fit(train_data, train_labels)

In [12]:
predict_list = []
for i in tqdm(range(len(test_data))):
    p = custom_adaboost.predict(test_data[i])
    predict_list.append(p)

100%|█████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 22284.05it/s]


In [13]:
print('custom test acc: {}'.format((sum(predict_list == np.array(test_labels)))/len(test_labels)))

custom test acc: 0.965


In [14]:
models = {}

for i in range(10):
    for j in range(i+1, 10):
        
        print('* train model of {} and {}'.format(i, j))
        
        model = AdaBoost(n_estimators=3)
    
        train_data, train_labels = make_data_of_digit_i_and_j(X_train, y_train, i, j)
      
        train_data = np.array(train_data)
        
        model.fit(train_data, train_labels)
        
        models[str(i) + str(j)] = model

* train model of 0 and 1
* train model of 0 and 2
* train model of 0 and 3
* train model of 0 and 4
* train model of 0 and 5
* train model of 0 and 6
* train model of 0 and 7
* train model of 0 and 8
* train model of 0 and 9
* train model of 1 and 2
* train model of 1 and 3
* train model of 1 and 4
* train model of 1 and 5
* train model of 1 and 6
* train model of 1 and 7
* train model of 1 and 8
* train model of 1 and 9
* train model of 2 and 3
* train model of 2 and 4
* train model of 2 and 5
* train model of 2 and 6
* train model of 2 and 7
* train model of 2 and 8
* train model of 2 and 9
* train model of 3 and 4
* train model of 3 and 5
* train model of 3 and 6
* train model of 3 and 7
* train model of 3 and 8
* train model of 3 and 9
* train model of 4 and 5
* train model of 4 and 6
* train model of 4 and 7
* train model of 4 and 8
* train model of 4 and 9
* train model of 5 and 6
* train model of 5 and 7
* train model of 5 and 8
* train model of 5 and 9
* train model of 6 and 7


In [15]:
correct = 0
correct_list = []
error_list = []
for i in tqdm(range(len(X_test))):
    prob_list = []
    for key in models.keys():
        p = models[key].predict(X_test[i])
        
        py = None
        if p == 1:
            py = int(key[0])
        elif p == -1:
            py = int(key[1])
        
        prob_list.append(py)
    
    p = Counter(prob_list).most_common(1)[0][0]
    
    if p == y_test[i]:
        correct += 1
        correct_list.append(p)
    
    else:
        error_list.append([p, y_test[i]])
print('acc on test data of custom model: {}'.format(correct/len(X_test)))

100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [00:07<00:00, 1320.91it/s]

acc on test data of custom model: 0.7216



