In [125]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# 定义结点类
class Node:
    # val 结点值，内部结点：属性；叶结点：标签
    # tag 属性划分值
    def __init__(self, val=0.0, tag=None):
        self.val = val
        self.tag = tag
        # 左右子树
        self.lt = None
        self.rt = None
        
    def __str__(self):
        return f'val: {self.val}, tag: {self.tag}'

    
# 计算基尼指数
def Gini(label):
    gini = 1
    # kinds：类别的数量 count：某一类别出现的次数
    for (kinds, count) in zip(*np.unique(label, return_counts=True)):
        pk = count / len(label)
        gini += - pk**2
         
    return gini

 
# 计算连续值属性的最优二分
def get_best_split(col, label):
    # 连续值先进行排序
    sort_col = np.unique(np.sort(col, axis=0))
    # 计算连续值的划分点
    pos = (sort_col[1:] + sort_col[:-1])/2
    gini = float('inf') 
    split = 0 
    
    for i in pos:
        left = col < i
        right = col > i
        # 计算此划分的基尼系数
        gini_i = (sum(left) / len(label)) * Gini(label[left]) + \
               (sum(right) / len(label)) * Gini(label[right])
        # 更新最优划分
        if gini_i < gini:
            gini = gini_i
            split = i
            
    return gini, split


# 建立决策树模型
def buildTree(data, labels):
    # kinds：类别的数量 count：每一类别出现的次数
    kinds, cnts = np.unique(labels, return_counts=True)  
    # 若样本全部属于同一类别，结点标记为该类
    if len(kinds) == 1:
        return Node(kinds[0])
    # 如果样本数为 0，则返回空节点
    if data.shape[0] == 0:
        return None
    
    best_gini = float('inf')  # 最优基尼系数
    best_split = None  # 最优划分
    best_val = 0  # 最优划分点
    best_feature = None

    # 计算每个属性的基尼指数并选取最优划分
    for i in range(data.shape[1]):
        gini, split = get_best_split(data.iloc[:, i], labels)
        if gini < best_gini:
            best_gini = gini
            best_split = split # 连续值最优划分值
            best_val = i # 最优划分属性索引
            best_feature = data.columns[i] # 最优划分属性值

    if best_gini < 1e-3:
        return Node(kinds[cnts.argmax(0)])  # 返回最多的那个类

    # 初始化根结点
    features = list(data.columns)
    tree = Node(features[best_val], best_split)
    ss = [str(best_feature),str(best_val)]
    # ss = "-".join(ss)
    # print("ss: {}".format(ss))
    # dictTree = {ss:{}}
    # print("dictTree: {}".format(dictTree))
    # 连续值二分数据集为左右子树，递归建树
    left = data.iloc[:, best_val] < best_split
    right = data.iloc[:, best_val] > best_split
    tree.lt= buildTree(data[left], labels[left])
    tree.rt= buildTree(data[right], labels[right])
    return tree


# 根据决策树进行预测
def predict(root, x, i):
    tag = root.tag
    features = list(x.columns)
    while tag is not None:
        idx = features.index(root.val)
        if i[idx] < root.tag:
            root = root.lt
        else:
            root = root.rt
        tag = root.tag
    return root.val


# 测试函数
def test(tree, x_test, y_test, x_train):
    y_pred = []
    for index, row in x_test.iterrows():
        y_pred.append(predict(tree, x_train, row))

    y_pred = np.array(y_pred)
    accuracy = np.count_nonzero(y_test == y_pred) / len(y_test)
    return y_pred, accuracy


# 打印决策树
def get_dict_tree(tree):
    tree = tree
    node = "-".join([str(tree.val),str(tree.tag)])
    dictTree = {node:{}}
    # print(ss)
    if tree.lt == None and tree.rt == None:
        return tree.val
    dictTree[node]['小于'] = get_dict_tree(tree.lt)
    dictTree[node]['不小于'] = get_dict_tree(tree.rt)
    return dictTree

    
if __name__ == "__main__":
    # iris数据集
    iris = pd.read_excel("iris.xlsx",engine='openpyxl')
    x = iris.iloc[:, :-1]
    y = iris.iloc[:, -1]
    # 划分数据集
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    x_train = pd.DataFrame(x_train, columns=iris.columns.values.tolist(), index=None)
    tree = buildTree(x_train, y_train)
    dicTree = get_dict_tree(tree)
    print("iris数据集:")
    print("决策树: {}".format(dicTree))
    y_pred, accuracy = test(tree, x_test, y_test, x_train)
    print("Accuracy: {}".format(accuracy))
    print("真实分类: {}".format(y_test.values))
    print("预测分类: {}".format(y_pred))
    
    # wineQuality数据集
    wine = pd.read_excel("wineQuality.xlsx",engine='openpyxl')
    wine = wine.iloc[:200, :]
    x = wine.iloc[:, :-1]
    y = wine.iloc[:, -1]
    # 划分数据集
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    x_train = pd.DataFrame(x_train, columns=wine.columns.values.tolist(), index=None)
    tree = buildTree(x_train, y_train)
    dicTree = get_dict_tree(tree)
    print("\nwineQuality数据集:")
    print("决策树: {}".format(dicTree))
    y_pred, accuracy = test(tree, x_test, y_test, x_train)
    print("Accuracy: {}".format(accuracy))
    print("真实分类: {}".format(y_test.values))
    print("预测分类: {}".format(y_pred))


iris数据集:
决策树: {'petal_length-2.45': {'小于': 'Iris-setosa', '不小于': {'petal_width-1.75': {'小于': {'petal_length-5.05': {'小于': {'sepal_length-4.95': {'小于': 'Iris-versicolor', '不小于': 'Iris-versicolor'}}, '不小于': 'Iris-virginica'}}, '不小于': {'petal_length-4.85': {'小于': 'Iris-virginica', '不小于': 'Iris-virginica'}}}}}}
Accuracy: 0.9333333333333333
真实分类: ['Iris-setosa' 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica'
 'Iris-setosa' 'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'
 'Iris-virginica' 'Iris-setosa' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-virginica' 'Iris-setosa' 'Iris-virginica'
 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica'
 'Iris-virginica' 'Iris-setosa' 'Iris-versicolor' 'Iris-setosa'
 'Iris-versicolor' 'Iris-setosa']
预测分类: ['Iris-setosa' 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica'
 'Iris-setosa' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'
 'Iris-setosa' 'Iris-setosa' 'Iri

In [None]:
pip install openpyxl