In [28]:
from sklearn import datasets
import numpy as np
from sklearn.model_selection import train_test_split
def load_data():
    iris = datasets.load_iris() #加载 iris 数据集
    iris_feature = iris.data #特征数据
    iris_target = iris.target #分类数据
    labels = ['sepal length', 'sepal width', 'petal length', 'petal width'] # 定义标签
    
    # 划分训练集和测试集
    feature_train, feature_test, target_train, target_test = train_test_split(iris_feature, iris_target, test_size=0.33, random_state=42)
    
    # 对训练集和测试集做简单处理，对连续float值转化为int处理，离散化连续变量
    traindata = np.rint( np.column_stack((feature_train,target_train)) )
    testdata = np.rint( np.column_stack((feature_test,target_test)) )
    return traindata,testdata

In [31]:
# 计算信息熵
def get_entropy(dataset):
    
    #创建所有类别列表：
    label_tags = list(set(dataset[:, -1]))
    label_length = len(dataset[:, -1])
    tmp_entropy = 0
    
    # 计算信息熵
    for label_tag in label_tags:
        tmp = sum([1 for d in dataset if d[-1]==label_tag])
        prob = tmp/label_length
        tmp_entropy += (prob)*np.math.log(prob, 2)
    entropy = -tmp_entropy
    return entropy

#计算某特征分类后的信息增益率
def get_rela_entropy(dataset, feature:int):
    
    # 生成某特征的取值范围
    feature_tags = list(set(dataset[:, feature]))
    print("feature_tags:",feature_tags)
    sub_entropy = 0.0
    iv = 0.0
    
    # 计算某个特征值的信息增益
    for feature_tag in feature_tags:
        sub_dataset = [d for d in dataset if d[feature]==feature_tag]
        sub_dataset = np.array(sub_dataset)
        
        # 计算某特征值的信息熵
        tmp_entropy = get_entropy(sub_dataset)
        
        # 计算每个结点的类别权重：
        prob = float(len(sub_dataset)/len(dataset))
        print("prob:",prob)
        iv -= float(prob*np.math.log(prob, 2))
        
        
        # 计算分类后带权值的信息熵总和
        sub_entropy += (prob) * tmp_entropy
    print("iv:",iv)
    # 用父节点信息熵减去分类后子结点的权值信息熵总和，得出按某特征分类后的信息增益
    rela_entropy = (get_entropy(dataset) - sub_entropy)/iv
    return rela_entropy

# 选取当前结点集合的最佳分类特征
def select_feature(dataset, features):
    rela_entropys = list()
    
    # 将每个特征的信息增益存储到列表中
    for feature in features:
        feature:int
        rela_entropy = get_rela_entropy(dataset, feature)
        rela_entropys.append(rela_entropy)
        
    # 返回最大信息增益的特征
    return features[rela_entropys.index(max(rela_entropys))]

# 返回当前结点的类别判定（判定为类中个数较多的）
def major_label(labels):
    
    # 生成当前结点的类别集合
    tags = list(set(labels))
    tag_num = [sum([1 for i in labels if i==label]) for label in tags]
    k = tag_num.index(max(tag_num))
    
    # 返回数量最多的类别
    return tags[k]

# 生成决策树，返回字典形式
def build_tree(dataset, features) -> dict:
    
    
    # 将当前结点的所有类别属性存入列表
    labels = dataset[:, -1]
    
    # 第一种停止情况：当前结点属于同一类别则返回类别标记
    if len(set(labels)) == 1:
        return {'label': labels[0]}
    
    # 第二种情况：当前结点划分属性用完了
    if not len(features):
        return {'label': major_label(labels)}
    
    # 第三种情况：当前结点的所有属性取值相等
    for feature in features:
        f_tags = list(set(dataset[:, feature]))
        if len(f_tags) == 1:
            return {'label': major_label(labels)}
        else:
            break
    
    # 选取根据信息增益选取最佳决策属性
    best_feature = select_feature(dataset, features)
    # print("best_feature:",best_feature)
    tree = {'feature': best_feature, 'children': {}}
    
    # 对该特征的每个取值进行决策划分
    feature_tags = list(set(dataset[:, best_feature]))
    for feature_tag in feature_tags:
        sub_dataset = [d for d in dataset if d[best_feature]==feature_tag]
        sub_dataset = np.array(sub_dataset)
        
        # 如果划分后的数据集为空，将其置为父结点类别
        if len(sub_dataset) == 0:
            tree['children'][feature_tag] = {'label_null': major_label(labels)}
        else:
            sub_features = [i for i in features if i != best_feature]
            # print("sub_feature:",sub_features)
            tree['children'][feature_tag] = build_tree(sub_dataset, sub_features)
    return tree

def classify(tree:dict, sample):
    for k, v in tree.items():
        if k != 'feature':
            return tree['label']
        else:
            return classify(tree['children'][sample[tree['feature']]], sample)

        
def classifier(tree:dict, features_data, default):
    predict_vec = list()
    for features_sample in features_data:
        try:
            predict = classify(tree, features_sample)
        except KeyError:
            predict = default
        predict_vec.append(predict)
    return predict_vec

if __name__=="__main__":
    train_data, test_data = load_data()
    tree = build_tree(train_data, list(range(train_data.shape[1]-1)))
    print(tree)
    test_data_labels = test_data[:, -1]
    test_data_features = test_data[:, :-1]
    default = major_label(test_data_labels)
    predict_vec = classifier(tree, test_data_features, default)
    print(predict_vec)
    accuracy = np.mean(np.array(predict_vec==test_data_labels))
    print(accuracy)

feature_tags: [4.0, 5.0, 6.0, 7.0, 8.0]
prob: 0.04
prob: 0.32
prob: 0.45
prob: 0.15
prob: 0.04
iv: 1.8264887070850935
feature_tags: [2.0, 3.0, 4.0]
prob: 0.14
prob: 0.73
prob: 0.13
iv: 1.1111974093459056
feature_tags: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
prob: 0.17
prob: 0.14
prob: 0.03
prob: 0.23
prob: 0.24
prob: 0.16
prob: 0.03
iv: 2.540049817564643
feature_tags: [0.0, 1.0, 2.0]
prob: 0.3
prob: 0.28
prob: 0.42
iv: 1.5609563153489605
feature_tags: [5.0, 6.0, 7.0]
prob: 0.21428571428571427
prob: 0.7142857142857143
prob: 0.07142857142857142
iv: 1.0949143184120975
feature_tags: [2.0, 3.0, 4.0]
prob: 0.32142857142857145
prob: 0.6428571428571429
prob: 0.03571428571428571
iv: 1.107784384952517
feature_tags: [2.0, 3.0, 4.0, 5.0, 6.0]
prob: 0.03571428571428571
prob: 0.10714285714285714
prob: 0.6785714285714286
prob: 0.14285714285714285
prob: 0.03571428571428571
iv: 1.469300984286476
feature_tags: [8.0, 5.0, 6.0, 7.0]
prob: 0.09523809523809523
prob: 0.047619047619047616
prob: 0.5476190476190477

0.0