# 决策树

![信息增益](../images/决策树/信息增益算法.jpg)
![信息增益](../images/决策树/信息增益算法2.jpg)

In [1]:
import sys
import time
import numpy as np
from collections import Counter
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [2]:
data = load_breast_cancer()
X, y = data['data'], data['target']
train_X, test_X, train_y, test_y = train_test_split(X, y)
print('train size: %s' % len(train_X))
print('test size: %s' % len(test_X))

train size: 426
test size: 143


In [3]:
def caculate_HD(train_y):
    # 计算数据集的经验墒
    hd = sum([-sum((train_y==i)) / len(train_y) * np.log2(sum((train_y==i)) / len(train_y)) 
              for i in np.unique(train_y)])
    return hd

In [4]:
def caculate_HDAs(train_X, train_y):
    # 合并方便后边取标签
    data = np.hstack([train_X, train_y.reshape(-1, 1)])
    # 计算每个特征的条件墒
    hdas = []
    classes = np.unique(train_y)
    # 对于每一个特征
    for a in range(train_X.shape[1]):
        hda = 0
        # 对于每一个特征的取值
        for ai in np.unique(train_X[:, a]):
            di = data[data[:, a] == ai]
            hdi = 0
            # 计算该取值的信息增益
            for c in classes:
                dik = di[di[:, -1] == c]
                if len(dik) == 0:
                    hdi += 0
                else:
                    hdi = len(dik) / len(di) * np.log2(len(dik) / len(di))
            # 计算每一个特征的信息增益
            hda += -len(di) / len(data) * hdi  
        hdas.append(hda)
        
    return hdas

In [5]:
def caculate_GDAs(train_X, train_y):
    hd = caculate_HD(train_y)
    hdas = caculate_HDAs(train_X, train_y)
    gdas = hd - np.asarray(hdas) 
    return gdas

In [6]:
def caculate_GrDAs(train_X, train_y):
    hd = caculate_HD(train_y)
    gdas = caculate_GDAs(train_X, train_y)
    return gdas / hd

![ID3](../images/决策树/ID3算法.jpg)
![ID3](../images/决策树/ID3算法2.jpg)

In [7]:
class DecisionTree:
    def __init__(self, train_X, train_y, episilon=0.1):
        self.train_X = train_X
        self.train_y = train_y
        self.episilon = episilon
        self.tree = self.__create(train_X, train_y)
        
    def __most(self, y):
        return Counter(y).most_common()[0][0]
    
    def __sub_big_equal(self, select, ai, train_X, train_y):
        data = np.hstack([train_X, train_y.reshape(-1, 1)])
        data = data[data[:, select] >= ai]
        return data[:, -2:], data[:, -1] 
    
    def __sub_small(self, select, ai, train_X, train_y):
        data = np.hstack([train_X, train_y.reshape(-1, 1)])
        data = data[data[:, select] != ai]
        return data[:, -2:], data[:, -1] 
        
    def __create(self, train_X, train_y):
        episilon = self.episilon
        if len(np.unique(train_y)) == 1:
            return train_y[0]
        if len(train_X[0]) == 0:
            return self.__most(train_y)
        
        gdas = caculate_GDAs(train_X, train_y)
        select = np.argmax(gdas)
        gda = gdas[select]
        if gda < episilon:
            return __most(train_y)
        
        tree = {select: {}}
        most = 0
        most_ai = 0
        for ai in np.unique(train_X[:, select]):
            di = train_X[train_X[:, select] == ai]
            if len(di) >= most:
                most = len(di)
                most_ai = ai
        
        tree[select][most_ai] = self.__create(*self.__sub_big_equal(select, ai, train_X, train_y))
        tree[select][-most_ai] = self.__create(*self.__sub_small(select, ai, train_X, train_y))
        return tree
    
    def __predict(self, test):
        tree = self.tree
        while True:
            key, value = list(tree.items())[0]
            if type(value).__name__ == 'dict':
                k = [k for k in value.keys() if k >= 0][0]
                if test[key] >= k:
                    tree = value[k]
                else:
                    tree = value[-k]
                if type(tree).__name__ != 'dict':
                    return tree
            else:
                return value
            
    def predict(self, test_X):
        res = []
        for test in test_X:
            res.append(self.__predict(test))
        return res

In [8]:
%time
tree = DecisionTree(train_X, train_y, 0.01)
print('算法准确率: ', sum(tree.predict(test_X) == test_y) / len(test_y))

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.05 µs
算法准确率:  0.7132867132867133
