# C4.5

# 基于pandas的实现

In [1]:
import pandas as pd
from math import log2

In [2]:
# 定义信息熵计算函数
def entropy(ele):
    """
    Compute the Shannon entropy (base 2) of a collection of class values.

    Input:
    ele: list of class values (items must be hashable)
    Output:
    the entropy in bits; 0 for an empty list
    """
    from collections import Counter

    total = len(ele)
    # Count every distinct value in one pass instead of calling list.count
    # once per distinct value; Counter also iterates in first-seen order,
    # so the summation order no longer depends on set hashing.
    counts = Counter(ele)
    # Empirical probability of each distinct value
    probs = [count / total for count in counts.values()]
    # H = -sum(p * log2(p))
    return -sum(prob * log2(prob) for prob in probs)

In [3]:
# 根据数据集和指定特征定义数据集划分函数
def df_split(df, col):
    """
    Partition a DataFrame by the distinct values of one column.

    Input:
    df: training data to split
    col: name of the feature column to split on
    Output:
    res_dict: dict mapping each distinct value of `col` (in first-seen
              order) to the sub-DataFrame of rows holding that value
    """
    column = df[col]
    res_dict = {}
    missing_done = False
    for value in column.unique():
        if pd.isna(value):
            # `column == NaN` never matches, so rows with a missing value
            # must be selected with isna(); all missing rows are grouped
            # under the first missing marker that unique() reports.
            if missing_done:
                continue
            missing_done = True
            mask = column.isna()
        else:
            mask = column == value
        # Boolean indexing already returns a new DataFrame
        res_dict[value] = df[mask]
    return res_dict

In [9]:
# 根据训练集和标签选择信息增益比最大的特征作为最优特征
def choose_best_feature(df, label):
    """
    Select the feature with the largest information gain ratio.

    Input:
    df: training data to split (feature columns plus the label column)
    label: name of the label column
    Output:
    max_value: largest information gain ratio found (-999 if no feature qualified)
    best_feature: the feature that achieved it (None if no feature qualified)
    max_split: dict of sub-datasets obtained by splitting on best_feature
    """
    n_rows = len(df)
    # Entropy of the labels before any split
    label_entropy = entropy(df[label].tolist())
    # Every column except the label is a candidate feature
    candidates = [name for name in df.columns if name != label]
    # Running best as (gain ratio, feature, split result)
    best = (-999, None, None)
    for feature in candidates:
        partitions = df_split(df, feature)
        # Conditional entropy of the labels given this feature
        conditional_entropy = 0
        for part in partitions.values():
            weight = len(part) / n_rows
            conditional_entropy += weight * entropy(part[label].tolist())
        gain = label_entropy - conditional_entropy
        # Entropy of the feature's own value distribution (split information)
        split_info = entropy(df[feature].tolist())
        # A feature with zero split information cannot be normalised; skip it
        if not split_info:
            continue
        ratio = gain / split_info
        # Strict comparison: on ties the earlier feature is kept
        if ratio > best[0]:
            best = (ratio, feature, partitions)
    return best

In [5]:
# C4.5算法类
class C45Tree:
    """
    C4.5 decision tree built recursively from a DataFrame.

    Each internal node is named after the feature chosen by
    choose_best_feature; each leaf is named after a class label.
    """

    class TreeNode:
        """A tree node holding a name and its outgoing branches."""

        def __init__(self, name):
            # Feature name for an internal node, class label for a leaf
            self.name = name
            # Maps a branch label (a value of this node's feature) to the child node
            self.connections = {}

        def connect(self, label, node):
            """Attach `node` as the child reached through branch `label`."""
            self.connections[label] = node

    def __init__(self, df, label):
        """
        Input:
        df: training data, including the label column
        label: name of the label column
        """
        self.columns = df.columns
        self.df = df
        self.label = label
        # Placeholder root; the actual tree hangs under its "" branch
        self.root = self.TreeNode("Root")

    def construct_tree(self):
        """Build the whole tree from the stored training data."""
        self.construct(self.root, "", self.df, self.columns)

    def construct(self, parent_node, parent_label, sub_df, columns):
        """
        Recursively grow the subtree for `sub_df` and attach it to `parent_node`.

        Input:
        parent_node: node the new subtree is attached to
        parent_label: branch label under which it is attached
        sub_df: rows that reach this point of the tree
        columns: columns still available (the label column included)
        """
        # NOTE(review): a subset whose labels are already pure is still split,
        # because choose_best_feature accepts a gain ratio of 0 - confirm
        # whether an explicit purity check is wanted here.
        _, best_feature, max_split = choose_best_feature(sub_df[columns], self.label)
        # Compare against None explicitly so that a falsy column name
        # (e.g. the integer column 0) is not mistaken for "no feature".
        if best_feature is None:
            # No usable feature left: make a leaf labelled with the majority
            # class of the remaining rows rather than the first row's label.
            majority = sub_df[self.label].mode(dropna=False).iloc[0]
            node = self.TreeNode(majority)
            parent_node.connect(parent_label, node)
            return
        # Internal node named after the chosen feature
        node = self.TreeNode(best_feature)
        parent_node.connect(parent_label, node)
        # The chosen feature is not reused further down this path
        new_columns = [col for col in columns if col != best_feature]
        # One child subtree per value of the chosen feature
        for split_value, split_data in max_split.items():
            self.construct(node, split_value, split_data, new_columns)

    def print_tree(self, node, tabs):
        """
        Print the subtree rooted at `node`, indented by the prefix `tabs`.

        Names and branch labels go through str() so that non-string feature
        values or class labels (numbers, booleans) can be printed too.
        """
        print(tabs + str(node.name))
        for connection, child_node in node.connections.items():
            print(tabs + "\t" + "(" + str(connection) + ")")
            self.print_tree(child_node, tabs + "\t\t")

In [6]:
# Load the golf dataset; the 'windy' column is read as strings
df = pd.read_csv('example_data.csv', dtype={'windy': 'str'})
df.shape

(14, 5)

In [7]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,humility,outlook,play,temp,windy
0,high,sunny,no,hot,False
1,high,sunny,no,hot,True
2,high,overcast,yes,hot,False
3,high,rainy,yes,mild,False
4,normal,rainy,yes,cool,False


In [10]:
# Create a C4.5 decision tree instance with 'play' as the label column
c45_tree = C45Tree(df, 'play')
# Build the C4.5 decision tree
c45_tree.construct_tree()
# Print the tree
c45_tree.print_tree(c45_tree.root, "")

Root
	()
		outlook
			(sunny)
				humility
					(high)
						temp
							(hot)
								windy
									(false)
										no
									(true)
										no
							(mild)
								no
					(normal)
						temp
							(cool)
								yes
							(mild)
								yes
			(overcast)
				humility
					(high)
						temp
							(hot)
								yes
							(mild)
								yes
					(normal)
						temp
							(cool)
								yes
							(hot)
								yes
			(rainy)
				windy
					(false)
						humility
							(high)
								yes
							(normal)
								temp
									(cool)
										yes
									(mild)
										yes
					(true)
						humility
							(normal)
								no
							(high)
								no
