# CART tree

In [1]:
import numpy as np
import pandas as pd
import Ipynb_importer
import ID3_tree as ID3

importing Jupyter notebook from ID3_tree.ipynb


CART(Classification and Regression Tree)是一种著名的决策树算法，分类和回归任务都可以使用。这里仅讨论分类任务。  
CART决策树采用“基尼指数”来选择划分属性
$$Gini\left(D\right)=\sum_{k=1}^{|y|}\sum_{j \neq k}p_kp_j=1-\sum_{k=1}^{|y|}p_k^2$$
直观来说，上式反映了从数据集$D$中随机抽取两个样本，其类别标记不一致的概率，所以该值越小，数据集$D$的纯度越高。

In [2]:
def Gini(data):
    '''
    Compute the Gini value (impurity) of a data set.

    The class labels are read from the last column of ``data`` and counted
    with ``ID3.countLabel`` (assumed to return a ``{label: count}`` dict --
    only its ``.values()`` are used here).

    @param data: 2-D training array whose last column holds the class labels
    @return gini: 1 - sum_k p_k^2, where p_k is the share of each counted label
    '''
    labels = data[:, -1]
    label_counts = ID3.countLabel(labels)

    # Turn the per-class counts into class proportions p_k.
    counts = np.array(list(label_counts.values()))
    proportions = counts / counts.sum()

    return 1 - np.square(proportions).sum()

数据集$D$的离散属性$a$的基尼指数定义为
$$Gini\_index\left(D, a\right)=\sum_{v=1}^{V}\frac{|D^v|}{|D|}Gini(D^v)$$
我们在候选属性集合$A$中，选择那个使得划分后基尼指数最小的属性作为最优划分属性，即
$$a_*=\arg \min_{a \in A}Gini\_index\left(D, a\right)$$
与信息增益的处理方式相同，对于连续属性$a$，基尼指数取所有候选划分点$t\in T_a$上的最小值：
$$Gini\_index\left(D, a\right)=\min_{t\in T_a}Gini\_index\left(D, a, t\right)=\min_{t\in T_a}\sum\limits_{\lambda \in \{-, +\}}\frac{|D_t^{\lambda}|}{|D|}Gini\left(D_t^{\lambda}\right)$$

In [3]:
def Gini_index(data, a):
    '''
    Find the best binary split of attribute ``a`` according to the Gini index.

    The attribute is treated as continuous: the candidate thresholds are the
    midpoints between consecutive distinct values of column ``a``. For each
    threshold ``t`` the data is split into ``D-`` (value <= t) and ``D+``
    (value > t), and the size-weighted sum of ``Gini`` over both parts is
    evaluated. ``<=`` is used so that the scored partition is exactly the one
    ``createTree`` builds from the returned threshold.

    @param data: 2-D training array whose last column holds the class labels
    @param a: column index of the attribute to evaluate
    @return gini_index: smallest weighted Gini value over all thresholds, or
                        ``np.inf`` when no valid split exists (e.g. the
                        attribute has a single distinct value)
    @return divide: threshold achieving ``gini_index`` (0 when no split exists)
    '''
    column = data[:, a]
    # np.unique already returns the distinct values in ascending order.
    values = np.unique(column)
    thresholds = (values[:-1] + values[1:]) / 2  # midpoints of neighbours

    n_total = data.shape[0]
    gini_index = np.inf  # any real split beats "no split"
    divide = 0
    for t in thresholds:
        in_negative = column <= t
        negative = data[in_negative]
        positive = data[~in_negative]

        # A threshold that leaves one side empty (NaN values, or a midpoint
        # rounded onto the larger neighbour) is not a split; skip it.
        if negative.shape[0] == 0 or positive.shape[0] == 0:
            continue

        weighted = (negative.shape[0] / n_total * Gini(negative)
                    + positive.shape[0] / n_total * Gini(positive))
        # Strict comparison keeps the first (smallest) threshold on ties.
        if weighted < gini_index:
            gini_index = weighted
            divide = t

    return gini_index, divide

In [4]:
def createTree(data):
    '''
    Recursively build a CART classification tree using the Gini index.

    Every attribute is split by a threshold obtained from ``Gini_index``;
    rows with ``value <= threshold`` go to the left subtree, the rest to the
    right subtree.

    Recursion stops with a leaf when:
      * all rows share one class (leaf holds that class);
      * all rows have identical attribute values (leaf holds the majority class);
      * the best threshold fails to separate the rows, i.e. one side would be
        empty (leaf holds the majority class). This prevents recursing on an
        empty subset.

    @param data: 2-D training array whose last column holds the class labels
    @return node: ``ID3.BTree`` -- a leaf built from a class label, or an
                  internal node built from (attribute index, threshold) with
                  its ``left`` / ``right`` children set
    '''
    labels = data[:, -1]
    classDict = ID3.countLabel(labels)

    # Pure subset: return a leaf carrying the only class present.
    if len(classDict) == 1:
        return ID3.BTree(list(classDict)[0])

    # Every row equals the first row on all attributes: nothing to split on.
    data_feature = data[:, :-1]
    if (data_feature == data_feature[0]).all():
        return ID3.BTree(ID3.majorClass(labels))

    # Best (gini index, threshold) per attribute, collected in one pass.
    splits = [Gini_index(data, a) for a in range(data.shape[1] - 1)]
    gini_index_list = np.array([g for g, _ in splits], dtype=float)
    divide_list = np.array([d for _, d in splits], dtype=float)

    # argmin returns the first attribute with the minimal Gini index.
    index = np.argmin(gini_index_list)
    threshold = divide_list[index]

    # Partition into D- (<= threshold) and D+ (> threshold).
    in_negative = data[:, index] <= threshold
    data_negative = data[in_negative]
    data_positive = data[~in_negative]

    # A threshold that does not actually separate the rows would make the
    # recursion run on an empty array; fall back to a majority-class leaf.
    if data_negative.shape[0] == 0 or data_positive.shape[0] == 0:
        return ID3.BTree(ID3.majorClass(labels))

    node = ID3.BTree(index, threshold)
    node.left = createTree(data_negative)
    node.right = createTree(data_positive)

    return node

In [5]:
if __name__ == '__main__':

    # Data set to load. An alternative file used previously:
    # '..\ex2-logistic_regression\ex2data3.txt'
    path = 'IRIS.csv'
    data = pd.read_csv(path)

    # Split into training / test parts (0.2 is presumably the test fraction --
    # confirm against ID3.split_data), then grow the tree on the training part.
    data_train, data_test = ID3.split_data(data.values, 0.2)
    tree = createTree(data_train)

    # ID3.predict returns two counts; the first one divided by their sum is
    # reported as the accuracy.
    mtrain, ntrain = ID3.predict(data_train, tree)
    mtest, ntest = ID3.predict(data_test, tree)
    accuracy_train = mtrain/(mtrain + ntrain)
    accuracy_test = mtest/(mtest + ntest)

    # Report both accuracies as percentages.
    print('train accuracy = {0}%'.format(accuracy_train*100))
    print('test accuracy = {0}%'.format(accuracy_test*100))

train accuracy = 100.0%
test accuracy = 96.66666666666667%
