# C4.5 tree

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the Iris dataset CSV (a bare file name, so it is resolved against the working directory).
path = 'IRIS.csv'

In [3]:
# Read the CSV into a DataFrame; the bare `data` expression renders it as the cell output.
data = pd.read_csv(path)
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


集合$D$的信息熵，${P}_{k}$是当前集合样本$D$中第$k$类样本所占比例，${k}=1, 2, ..., {|y|}$。
$$ Ent\left(D\right)=-\sum\limits_{k=1}^{|y|}{P}_{k}{\log}_{2}{P}_{k}$$

当前集合$D$某一离散属性$a$的信息增益，$a$有$V$个可能取值$\{a^1, a^2, ..., a^V\}$，$D^v$表示$D$中在属性$a$上取值为$a^v$的样本子集。
$$Gain\left(D, a\right)=Ent\left(D\right)-\sum\limits_{v=1}^{V}\frac{|D^v|}{|D|}Ent\left(D^v\right)$$

对于连续属性$a$的信息增益，假定$a$在$D$上出现了$n$个不同的取值，将这些值从小到大排列，记为$\{a^1, a^2, ...,a^n\}$。基于划分点$t$可将$D$分为子集$D_t^-$和$D_t^+$。
$$T_a=\left\{\frac{a^i+a^{i+1}}{2}|{1\leq i\leq n-1}\right\}$$
$$Gain\left(D, a\right)=\max_{t\in T_a}Gain\left(D, a, t\right)=\max_{t\in T_a}\left\{Ent\left(D\right)-\sum\limits_{\lambda \in \{-, +\}}\frac{|D_t^{\lambda}|}{|D|}Ent\left(D_t^{\lambda}\right)\right\}$$
需要注意的是，与离散属性不同，若当前结点的划分属性是连续属性，该属性还可作为其后代结点的划分属性。

ID3决策树使用信息增益来选择最优划分属性，但是信息增益准则对可取值数目较多的属性有所偏好。为了减少这种偏好带来的不利影响，C4.5决策树采用增益率来选择最优划分属性。
$$Gain\_ratio\left(D, a\right)=\frac{Gain\left(D, a\right)}{IV\left(a\right)}$$
其中$IV\left(a\right)$称为属性$a$的固有值（intrinsic value），对于离散属性$a$
$$IV\left(a\right)=-\sum\limits_{v=1}^{V}\frac{|D^{v}|}{|D|}{\log}_{2}\frac{|D^{v}|}{|D|}$$
对于连续属性$a$（按划分点$t$将$D$分为$D_t^-$和$D_t^+$）
$$IV\left(a\right)=-\sum\limits_{\lambda \in \{-, +\}}\frac{|D_t^{\lambda}|}{|D|}{\log}_{2}\frac{|D_t^{\lambda}|}{|D|}$$
需要注意的是，增益率准则对可取值数目较少的属性有所偏好，所以C4.5决策树使用了一个启发式方法：先从候选划分属性中找出信息增益高于平均水平的属性，再从中选择增益率最高的。

In [4]:
# Ipynb_importer is the notebook-parsing helper that makes it possible to import
# other Jupyter notebook files; ID3_tree is itself such a notebook.
import Ipynb_importer
import ID3_tree as ID3

importing Jupyter notebook from ID3_tree.ipynb


In [5]:
def intrisic_value(data, divide):
    '''
    Compute the intrinsic value IV(a) of a continuous attribute for a binary split.

    The samples are partitioned into Dt- (attribute value < divide) and
    Dt+ (all remaining samples), and IV is the entropy of that partition:
        IV = -sum_{s in {-, +}} |Dt^s|/|D| * log2(|Dt^s|/|D|)
    An empty side contributes 0 (the limit of p*log2(p) as p -> 0), so the
    result is exact and never negative; no epsilon is added inside the log.

    @param data: 2-D array whose column 0 holds the values of attribute a
    @param divide: split point of attribute a
    @return IV: intrinsic value of attribute a; 0.0 when data is empty or
                when every sample falls on the same side of the split
    '''
    total = data.shape[0]
    if total == 0:
        # Nothing to partition; avoid dividing by zero below
        return 0.0

    n_negative = int(np.count_nonzero(data[:, 0] < divide))  # |Dt-|
    n_positive = total - n_negative                          # |Dt+|

    IV = 0.0
    for count in (n_negative, n_positive):
        if count > 0:  # skip empty sides instead of evaluating log2(0)
            p = count / total
            IV -= p * np.log2(p)

    return IV

In [6]:
def information_gain_and_ratio(data, a):
    '''
    Compute the information gain, best split point and gain ratio of
    continuous attribute a.

    Candidate split points are the midpoints between consecutive distinct
    values of the attribute. The candidate whose two subsets (value < t and
    the rest) have the lowest weighted entropy is selected; on ties the
    smallest such split point wins.

    @param data: training set (2-D array); the last column holds the labels
    @param a: column index of the attribute to evaluate
    @return Gain: information gain of the best split
                  (0.0 when the attribute has fewer than two distinct values)
    @return divide: best split point of attribute a (0 when no split exists)
    @return ratio: gain ratio Gain / IV(a)
                   (0.0 when no split exists or IV is not positive)
    '''
    cols = data.shape[1]

    # np.unique already returns the distinct values in ascending order
    data_a_sort = np.unique(data[:, a])
    # Midpoints of neighbouring distinct values are the candidate split points
    T = (data_a_sort[:-1] + data_a_sort[1:]) / 2

    if len(T) == 0:
        # A constant (or empty) attribute cannot split the set: report zero
        # gain instead of a bogus value derived from an uninitialised minimum
        return 0.0, 0, 0.0

    data_section = np.take(data, [a, cols-1], axis=1)  # attribute column + label column
    n_samples = data_section.shape[0]

    # Entropy of the whole set
    Ent = ID3.information_entropy(data[:, cols-1])

    # Track the lowest conditional entropy; starting from +inf guarantees that
    # a real split point is always recorded when at least one candidate exists
    min_cond_entropy = np.inf
    divide = 0
    for t in T:
        index_negative = data_section[:, 0] < t  # mask of subset Dt-
        negative = data_section[index_negative]
        positive = data_section[~index_negative]  # subset Dt+

        cond_entropy = (negative.shape[0]/n_samples)*ID3.information_entropy(negative[:, 1]) \
            + (positive.shape[0]/n_samples)*ID3.information_entropy(positive[:, 1])
        if cond_entropy < min_cond_entropy:
            min_cond_entropy = cond_entropy
            divide = t

    IV = intrisic_value(data_section, divide)  # intrinsic value of attribute a
    Gain = Ent - min_cond_entropy  # information gain of attribute a
    # Guard against a zero/negative intrinsic value so the ratio stays finite
    ratio = Gain/IV if IV > 0 else 0.0

    return Gain, divide, ratio

In [7]:
# Split the raw array into train/test parts; with 0.2 the 150 rows come out as 120 train / 30 test.
data_train, data_test = ID3.split_data(data.values, 0.2)
data_train.shape, data_test.shape

((120, 5), (30, 5))

In [8]:
# Sanity check: (gain, split point, gain ratio) for the attribute in column 0 of the training set.
information_gain_and_ratio(data_train, 0)

(0.6063301504633138, 5.55, 0.6214856084998776)

In [9]:
def createTree(data):
    '''
    Recursively build a C4.5 decision tree over continuous attributes.

    At every internal node the split attribute is chosen with the C4.5
    heuristic: among the attributes whose information gain is at least the
    average gain, pick the one with the highest gain ratio. Samples with
    attribute value <= split point go to the left child, the rest to the
    right child.

    @param data: training set (2-D array); the last column holds the labels
    @return node: root ID3.BTree node of the (sub)tree
    '''
    n_features = data.shape[1] - 1
    labels = data[:, n_features]

    # All samples belong to one class: return a leaf for that class
    classDict = ID3.countLabel(labels)
    if len(classDict) == 1:
        return ID3.BTree(list(classDict)[0])

    # All samples agree on every attribute: return a majority-class leaf
    data_feature = data[:, 0:n_features]
    if (data_feature[0] == data_feature).all():
        return ID3.BTree(ID3.majorClass(labels))

    # Information gain, split point and gain ratio of every attribute
    gains, divides, ratios = [], [], []
    for a in range(n_features):
        g, d, r = information_gain_and_ratio(data, a)
        gains.append(g)
        divides.append(d)
        ratios.append(r)
    gain_list = np.array(gains, dtype=float)
    divide_list = np.array(divides, dtype=float)
    ratio_list = np.array(ratios, dtype=float)

    # A non-finite ratio (e.g. from a zero intrinsic value) must never win
    ratio_list = np.where(np.isfinite(ratio_list), ratio_list, -np.inf)

    # Attributes with at-least-average gain. ">=" keeps the set non-empty when
    # all gains are equal; the fallback covers floating-point rounding of the mean.
    candidates = np.flatnonzero(gain_list >= gain_list.mean())
    if candidates.size == 0:
        candidates = np.arange(n_features)

    # Highest gain ratio, searched only among the candidates so that an
    # excluded attribute with an equal ratio can never be selected
    index = candidates[np.argmax(ratio_list[candidates])]

    # Partition the current set into Dt- and Dt+
    i = data[:, index] <= divide_list[index]
    data_negative = data[i]
    data_positive = data[~i]

    # A one-sided split makes no progress and would recurse forever (or hit an
    # empty subset); stop with a majority-class leaf instead
    if data_negative.shape[0] == 0 or data_positive.shape[0] == 0:
        return ID3.BTree(ID3.majorClass(labels))

    node = ID3.BTree(index, divide_list[index])
    node.left = createTree(data_negative)
    node.right = createTree(data_positive)

    return node

In [10]:
if __name__ == '__main__':
    # Draw a train/test split and fit the C4.5 tree on the training part
    data_train, data_test = ID3.split_data(data.values, 0.2)
    tree = createTree(data_train)

    # Evaluate the fitted tree on both parts. ID3.predict returns two counts;
    # accuracy is the first divided by their sum (presumably correct vs.
    # incorrect predictions -- confirm against ID3_tree).
    mtrain, ntrain = ID3.predict(data_train, tree)
    mtest, ntest = ID3.predict(data_test, tree)

    accuracy_train = mtrain/(mtrain + ntrain)
    accuracy_test = mtest/(mtest + ntest)

    # Report both accuracies as percentages
    print('train accuracy = {0}%'.format(accuracy_train*100))
    print('test accuracy = {0}%'.format(accuracy_test*100))

train accuracy = 100.0%
test accuracy = 96.66666666666667%
