In [1]:
import math

import numpy as np
import sklearn
from sklearn import tree
from sklearn import neural_network
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [2]:
sklearn.tree.DecisionTreeClassifier

sklearn.tree.tree.DecisionTreeClassifier

## 生成训练集和测试集

In [3]:
cancer_data = load_breast_cancer()
print(cancer_data)
x_train, x_test, y_train, y_test = train_test_split(cancer_data['data'], cancer_data['target'], test_size = 0.7)

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
 

## 自编决策树

In [145]:
def cal_shannon_ent(features_data, target_data):
    num_entries = len(features_data)
    label_counts = {}
    for each_label in target_data:
        if each_label not in label_counts:
            label_counts[each_label] = 0
        label_counts[each_label] += 1
    shannaon_ent = 0.0
    for each_label, each_count in label_counts.items():
        prob = each_count / num_entries
        shannaon_ent -= prob * math.log(prob, 2)
    return shannaon_ent

In [146]:
def split_dataset(features_data, feature_index, value):
    return_dataset = []
    for each_feature_data in features_data:
        if each_feature_data[feature_index] == value:
            reduced_feature_data = each_feature_data[:feature_index]
            reduced_feature_data = np.concatenate((reduced_feature_data, each_feature_data[feature_index + 1:]))
            return_dataset.append(np.array(reduced_feature_data))
    return return_dataset

In [147]:
def choose_best_feature_to_split(features_data, target_data):
    num_features = len(features_data[0])
    base_entropy = cal_shannon_ent(features_data, target_data)
    best_info_gain = 0.0
    best_feature = -1
    
    for feature_index in range(num_features):
        each_feature_value_list = [each_feature_data[feature_index] for each_feature_data in features_data]
        each_feature_value_set = set(each_feature_value_list)
        new_entropy = 0.0
        for each_value in each_feature_value_set:
            sub_dataset = split_dataset(features_data, feature_index, each_value)
            prob = len(sub_dataset) / len(features_data)
            new_entropy += prob * cal_shannon_ent(sub_dataset, target_data)
            
        info_gain = base_entropy - new_entropy
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = feature_index
    return best_feature

In [148]:
def create_decision_tree(features_data, target_data, feature_names, target_names):
    best_feature_index = choose_best_feature_to_split(features_data, target_data)
    best_feature_name = feature_names[best_feature_index]
    
    decision_tree = {
        best_feature_name: {}
    }
    feature_names = np.delete(feature_names, np.argwhere(feature_names == best_feature_name))
    feature_values = [each_feature_data[best_feature_index] for each_feature_data in features_data]
    unique_values = set(feature_values)
    for each_value in unique_values:
        print('start', len(feature_names), decision_tree)
        sub_labels = target_data[:]
        decision_tree[best_feature_name][each_value] = create_decision_tree(split_dataset(features_data, best_feature_index, each_value), sub_labels, feature_names, target_names)
        print('end', len(feature_names), decision_tree)
    print(best_feature_name)
    
    return decision_tree

my_decision_tree = create_decision_tree(x_train, y_train, cancer_data.feature_names, cancer_data.target_names)
print(my_decision_tree)

170
start 29 {'mean compactness': {}}
1
start 28 {'worst fractal dimension': {}}
1
start 27 {'worst symmetry': {}}
1
start 26 {'worst concave points': {}}
1
start 25 {'worst concavity': {}}
1
start 24 {'worst compactness': {}}
1
start 23 {'worst smoothness': {}}
1
start 22 {'worst area': {}}
1
start 21 {'worst perimeter': {}}
1
start 20 {'worst texture': {}}
1
start 19 {'worst radius': {}}
1
start 18 {'fractal dimension error': {}}


KeyboardInterrupt: 

## sklearn中决策树

In [23]:
sklearn_decision_tree = tree.DecisionTreeClassifier()
sklearn_decision_tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [25]:
sklearn_decision_tree.score(x_test, y_test)

0.899749373433584

In [40]:
import os
import pydot

files_dir_path = './files'
if not os.path.exists(files_dir_path):
    os.mkdir(files_dir_path)
sklearn_tree_dot_file = 'tree.dot'
sklearn_tree_dot_file_path = os.path.join(files_dir_path, sklearn_tree_dot_file)
if True or not os.path.exists(sklearn_tree_dot_file_path):
    tree.export_graphviz(sklearn_decision_tree, out_file=sklearn_tree_dot_file_path, class_names=['严重','轻微'],feature_names=cancer_data.feature_names,impurity=False,filled=True)
(graph, ) = pydot.graph_from_dot_file(sklearn_tree_dot_file_path)

sklearn_tree_png_file = 'tree.png'
sklearn_tree_png_file_path = os.path.join(files_dir_path, sklearn_tree_png_file)
graph.write_png(sklearn_tree_png_file_path)

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe8 in position 148: ordinal not in range(128)