In [2]:
# Example: classifying the iris dataset with a CART decision tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn import tree
import graphviz

# Load the dataset, then pull out the feature matrix and the class labels
iris = load_iris()
features, labels = iris.data, iris.target

# Hold out 33% of the samples as the test set; the remainder is the training set
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.33,
    random_state=0,
)

# Create a CART classification tree (Gini criterion) and fit it on the training set
clf = DecisionTreeClassifier(criterion="gini")
clf_iris = clf.fit(train_features, train_labels)

# Predict on the held-out samples and score the predictions against the true labels
test_predict = clf_iris.predict(test_features)
score = accuracy_score(test_labels, test_predict)
print("CART 分类树准确率 %.4f" % score)

# Export the fitted tree as DOT source and render it with graphviz;
# the last expression is the cell's displayed result (the rendered file path)
dot_data = tree.export_graphviz(clf_iris, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("iris_classification_tree")

CART 分类树准确率 0.9600


'iris_classification_tree.pdf'

In [14]:
# Example: regression on the Boston housing prices with a CART decision tree
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
# `tree` and `graphviz` are needed by the plotting step below; import them here
# so this cell does not depend on another cell having been run first.
from sklearn import tree
import graphviz

# Load the dataset.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs scikit-learn < 1.2 — confirm the pinned version.
boston = load_boston()

# Explore the data: show the feature names
print(boston.feature_names)

# Feature matrix and target house prices
features = boston.data
prices = boston.target

# Hold out 33% of the samples as the test set; the remainder is the training set.
# A fixed random_state (same value as the classification cells) makes the split
# repeatable across runs.
train_features, test_features, train_price, test_price = train_test_split(
    features, prices, test_size=0.33, random_state=0
)

# Create a CART regression tree
dtr = DecisionTreeRegressor()

# Fit the regression tree on the training set
dtr_boston = dtr.fit(train_features, train_price)

# Predict house prices for the test set
predict_price = dtr_boston.predict(test_features)

# Evaluate the test-set predictions (mean squared error, mean absolute error)
print("回归二乘偏差均值: ", mean_squared_error(test_price, predict_price))
print("回归绝对值偏差均值: ", mean_absolute_error(test_price, predict_price))

# Export the fitted tree as DOT source and render it with graphviz;
# the last expression is the cell's displayed result (the rendered file path)
dot_data = tree.export_graphviz(dtr_boston, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("boston_regression_tree")

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
回归二乘偏差均值:  13.717844311377247
回归绝对值偏差均值:  2.61437125748503


'boston_regression_tree.pdf'

In [16]:
# Exercise: classifying handwritten digits with a CART decision tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_digits
from sklearn import tree
import graphviz

# Load the dataset, then pull out the feature matrix and the class labels
digits = load_digits()
features, labels = digits.data, digits.target

# Hold out 33% of the samples as the test set; the remainder is the training set
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.33,
    random_state=0,
)

# Create a CART classification tree (Gini criterion) and fit it on the training set
clf = DecisionTreeClassifier(criterion="gini")
clf_digits = clf.fit(train_features, train_labels)

# Predict on the held-out samples and score the predictions against the true labels
test_predict = clf_digits.predict(test_features)
score = accuracy_score(test_labels, test_predict)
print("CART 分类树准确率 %.4f" % score)

# Export the fitted tree as DOT source and render it with graphviz;
# the last expression is the cell's displayed result (the rendered file path)
dot_data = tree.export_graphviz(clf_digits, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("digits_classification_tree")

CART 分类树准确率 0.8636


'digits_classification_tree.pdf'