In [None]:
# Training samples, one row per observation. The first four entries of each
# row line up positionally with the names in `features` below; the last entry
# is the class label ('Yes' / 'No').
data = [
    ['Sunny', 'Hot', 'High', False, 'No'],
    ['Sunny', 'Hot', 'High', True, 'No'],
    ['Overcast', 'Hot', 'High', False, 'Yes'],
    ['Rain', 'Mild', 'High', False, 'Yes'],
    ['Rain', 'Cool', 'Normal', False, 'Yes'],
    ['Rain', 'Cool', 'Normal', True, 'No'],
    ['Overcast', 'Cool', 'Normal', True, 'Yes'],
    ['Sunny', 'Mild', 'High', False, 'No'],
    ['Sunny', 'Cool', 'Normal', False, 'Yes'],
    ['Rain', 'Mild', 'Normal', False, 'Yes'],
    ['Sunny', 'Mild', 'Normal', True, 'Yes'],
    ['Overcast', 'Mild', 'High', True, 'Yes'],
    ['Overcast', 'Hot', 'Normal', False, 'Yes'],
    ['Rain', 'Mild', 'High', True, 'No'],
]

# Names of the feature columns, in the same order as they appear in each row.
features = ['Outlook', 'Temperature', 'Humidity', 'Windy']


import math
from collections import Counter

# Entropy of the class labels
def entropy(dataset):
    """Return the Shannon entropy, in bits, of the class labels in *dataset*.

    The label of every row is its last element. An empty dataset has no
    labels to sum over and therefore yields 0.
    """
    row_count = len(dataset)
    frequencies = Counter(row[-1] for row in dataset)
    probabilities = [freq / row_count for freq in frequencies.values()]
    return -sum(p * math.log2(p) for p in probabilities)

# Partition the dataset on one feature value
def split_dataset(dataset, feature_index, value):
    """Return the rows whose ``feature_index`` column equals *value*.

    The matched column is removed from every returned row, so the remaining
    columns shift left by one. Input rows are not modified.
    """
    reduced_rows = []
    for row in dataset:
        if row[feature_index] == value:
            before = row[:feature_index]
            after = row[feature_index + 1:]
            reduced_rows.append(before + after)
    return reduced_rows

# Pick the feature with the largest information gain
def choose_best_feature(dataset):
    """Return the column index of the feature with the highest information gain.

    Every column except the last (the class label) is a candidate. The gain
    of a column is the entropy of the whole dataset minus the size-weighted
    entropy of the subsets obtained by grouping rows on that column.

    Returns -1 when no column achieves a strictly positive gain (including
    the case where there are no feature columns at all). Ties are resolved in
    favour of the lowest column index.
    """
    total_rows = len(dataset)
    baseline = entropy(dataset)
    feature_count = len(dataset[0]) - 1

    def conditional_entropy(column):
        # Size-weighted entropy of the partitions induced by `column`.
        weighted = 0
        for candidate in set(row[column] for row in dataset):
            matching = [row for row in dataset if row[column] == candidate]
            weighted += (len(matching) / total_rows) * entropy(matching)
        return weighted

    winner, winner_gain = -1, 0
    for column in range(feature_count):
        gain = baseline - conditional_entropy(column)
        # Strict comparison: a later column must beat, not merely match,
        # the current best to replace it.
        if gain > winner_gain:
            winner, winner_gain = column, gain
    return winner

# Recursively build the tree
def build_tree(dataset, features):
    """Recursively build an ID3-style decision tree as nested dicts.

    Args:
        dataset: Non-empty list of rows. Each row holds the feature values
            followed by the class label as its last element.
        features: Feature names, positionally aligned with the feature
            columns of ``dataset``.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        ``{feature_name: {feature_value: subtree, ...}}``.

    Raises:
        IndexError: If ``dataset`` is empty.
    """
    labels = [row[-1] for row in dataset]

    # All rows share one class: this node is a pure leaf.
    if labels.count(labels[0]) == len(labels):
        return labels[0]

    majority_label = Counter(labels).most_common(1)[0][0]

    # Only the label column is left: fall back to the most common class.
    if len(dataset[0]) == 1:
        return majority_label

    best_feat = choose_best_feature(dataset)

    # choose_best_feature returns -1 when no feature has positive information
    # gain (e.g. rows with identical features but different labels). Using -1
    # as an index would silently select the label column and build a bogus
    # split on the label itself, so stop here with a majority-vote leaf.
    if best_feat < 0:
        return majority_label

    best_feat_name = features[best_feat]
    # The remaining feature names are the same for every branch.
    sub_features = features[:best_feat] + features[best_feat + 1:]
    branches = {}

    for value in set(row[best_feat] for row in dataset):
        sub_dataset = split_dataset(dataset, best_feat, value)
        branches[value] = build_tree(sub_dataset, sub_features)

    return {best_feat_name: branches}

# Pretty-print the tree using indentation
def print_tree(tree, indent=""):
    """Print *tree* to stdout, one line per branch or leaf.

    A non-dict node is a leaf and is printed as ``Predict: <label>``. A dict
    node maps a feature name to ``{value: subtree}``; each branch is printed
    as ``<feature> = <value> ->`` followed by its subtree indented two more
    spaces.
    """
    if not isinstance(tree, dict):
        print(f"{indent}Predict: {tree}")
        return

    deeper = indent + "  "
    for feature_name, outcomes in tree.items():
        for outcome, child in outcomes.items():
            print(f"{indent}{feature_name} = {outcome} ->")
            print_tree(child, deeper)

# Build the tree from the sample data, then print a header line for it.
decision_tree = build_tree(data, features)
print("构建的决策树：")

# Bare expression: presumably relies on notebook display of a cell's last
# expression to show the nested-dict tree; it has no effect in a plain script.
decision_tree

{'Rain', 'Overcast', 'Sunny'}


ZeroDivisionError: division by zero