# 机器学习与社会科学应用

# 第三章 经典分类算法

# 第四节 决策树算法

<font face="宋体" >郭峰    
    教授、博士生导师  
上海财经大学公共经济与管理学院  
上海财经大学数实融合与智能治理实验室  
邮箱：guofengsfi@163.com</font> 

<font face="宋体" >本节目录  
4.1. 分类决策树  
4.2. 分类决策树案例  
4.3. 回归决策树</font>

## 4.1. 分类决策树

In [None]:
from sklearn.datasets import load_iris  
from sklearn.model_selection import train_test_split  
from sklearn.tree import DecisionTreeClassifier  
from sklearn.metrics import accuracy_score, classification_report  

# Load the iris dataset and pull out the feature matrix and the label vector.
data = load_iris()
X, y = data.data, data.target

# Keep 20% of the samples aside for testing; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

# Create a decision tree with default settings and fit it on the training split.
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)

# Predict the held-out samples and report the accuracy.
y_pred = dt_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Print the classification report for the same predictions.
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.datasets import load_iris  
from sklearn.model_selection import train_test_split, GridSearchCV  
from sklearn.tree import DecisionTreeClassifier  
from sklearn.metrics import accuracy_score, classification_report  

# Load the iris dataset; X holds the features, y the class labels.
data = load_iris()
X, y = data.data, data.target

# Seeded 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

# Unfitted decision tree with default settings.
dt_classifier = DecisionTreeClassifier()

In [None]:
# Hyperparameter grid: each keyword is a DecisionTreeClassifier parameter and
# each list holds the candidate values to try.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
)

# Set up the grid search (cv=5, n_jobs=-1) around the classifier.
grid_search = GridSearchCV(
    estimator=dt_classifier,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and keep the corresponding model.
print(f"Best Parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# Evaluate the selected model on the held-out test split.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Print the classification report for the same predictions.
print("Classification Report:")
print(classification_report(y_test, y_pred))


## 4.2. 分类决策树案例

案例：泰坦尼克号

###  数据预处理

In [None]:
# 数据预处理
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Directory containing the Titanic CSV files (absolute, machine-specific path).
path = "D:/python/机器学习与社会科学应用/演示数据/03经典分类算法/titanic/data/"
# pd.read_csv opens and closes the file on its own, so no explicit open()
# handle is created here (an extra handle would stay open for the whole session).
data = pd.read_csv(path + "train.csv", encoding='utf8')
# Show the first rows as the cell output.
data.head()

In [None]:
# 数据预处理
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Directory containing the Titanic CSV files (absolute, machine-specific path).
path = "D:/python/机器学习与社会科学应用/演示数据/03经典分类算法/titanic/data/"
# Read the training table and show its first rows as the cell output.
data = pd.read_csv(f"{path}train.csv", encoding="utf8")
data.head()

In [None]:
# Data cleaning.
# Discard the columns that are not needed.
data = data.drop(columns=['Name', 'Ticket', 'Cabin'])
# Encode sex as an integer flag: 1 where the value is 'male', 0 otherwise.
data['Sex'] = data['Sex'].eq('male').astype('int')
# Replace every embarkation port by its position in the list of distinct
# values of the column.
labels = data['Embarked'].unique().tolist()
data['Embarked'] = data['Embarked'].apply(lambda port: labels.index(port))
# Fill all remaining missing values with 0.
data = data.fillna(0)

### 划分数据集

In [None]:
from sklearn.model_selection import train_test_split

# Target vector: the 'Survived' column; feature matrix: all remaining columns.
y = data['Survived'].values
X = data.drop(['Survived'], axis=1).values

# Hold out 20% of the rows for testing.  The split is seeded with the same
# value used for the iris splits in this notebook, so every run trains and
# evaluates on the same partition instead of a fresh random one.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

print('train dataset: {0}; test dataset: {1}'.format(X_train.shape, X_test.shape))

###  训练模型

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit a default decision tree and score it on the training and test splits.
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
train_score, test_score = clf.score(X_train, y_train), clf.score(X_test, y_test)
print(f'train score: {train_score}; test score: {test_score}')

###  调参--选择最优模型

决策树模型里面有一个参数max_depth控制决策树的最大深度，不同深度将会对模型产生不同的影响。
下面通过遍历不同的深度参数来训练决策树，并画出模型的score曲线。

####  参数选择max_depth

In [None]:
# Parameter selection: max_depth
def cv_score(d):
    """Fit a decision tree limited to depth ``d`` and score it on both splits.

    Uses ``X_train``/``y_train`` and ``X_test``/``y_test`` from the enclosing
    notebook scope.  Despite the function name, the second score is computed
    on the held-out test split, not by cross-validation.

    Args:
        d: value passed as ``max_depth`` to ``DecisionTreeClassifier``.

    Returns:
        Tuple ``(training score, test score)`` as returned by ``score``.
    """
    model = DecisionTreeClassifier(max_depth=d)
    model.fit(X_train, y_train)
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    return (train_acc, test_acc)

# Score one tree per depth in range(2, 15), then separate the
# (training, test) pairs into two parallel lists.
depths = range(2, 15)
scores = list(map(cv_score, depths))
tr_scores, cv_scores = (list(column) for column in zip(*scores))

In [None]:
# Pick the depth with the highest test score and report it.
best_score_index = np.argmax(cv_scores)
best_param, best_score = depths[best_score_index], cv_scores[best_score_index]
print(f'best param: {best_param}; best score: {best_score}')

In [None]:
# Plot the test and training scores against the tree depth.
plt.figure(figsize=(10, 6), dpi=144)
plt.plot(depths, cv_scores, '.g-', label='test score')
plt.plot(depths, tr_scores, '.r--', label='training score')
plt.xlabel('max depth of decision tree')
plt.ylabel('score')
plt.grid()
plt.legend()


#### 不同阈值对决策树的影响

决策树的参数min_impurity_decrease用于设定节点分裂的阈值，可以限制树的过度生长。如果某个节点的分裂所带来的不纯度下降程度大于或者等于该阈值，那么这个节点将会被分裂；否则这个节点不再分裂，仍然是一片叶子。

In [None]:
# Train a model for one threshold and compute its scores.
def cv_score(val):
    """Fit a gini tree with ``min_impurity_decrease=val`` and score both splits.

    Uses ``X_train``/``y_train`` and ``X_test``/``y_test`` from the enclosing
    notebook scope.  Despite the function name, the second score is computed
    on the held-out test split, not by cross-validation.

    Args:
        val: value passed as ``min_impurity_decrease`` to
            ``DecisionTreeClassifier``.

    Returns:
        Tuple ``(training score, test score)`` as returned by ``score``.
    """
    model = DecisionTreeClassifier(criterion='gini', min_impurity_decrease=val)
    model.fit(X_train, y_train)
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    return (train_acc, test_acc)

# Sweep 100 evenly spaced thresholds between 0 and 0.01, score a tree for
# each one, then separate the (training, test) pairs into two lists.
values = np.linspace(0, 0.01, 100)
scores = [cv_score(threshold) for threshold in values]
tr_scores, cv_scores = map(list, zip(*scores))

In [None]:
# Pick the threshold with the highest test score and report it.
best_score_index = np.argmax(cv_scores)
best_param, best_score = values[best_score_index], cv_scores[best_score_index]
print(f'best param: {best_param}; best score: {best_score}')

In [None]:
# Plot the test and training scores against the threshold value.
plt.figure(figsize=(10, 6), dpi=144)
plt.plot(values, cv_scores, '.g-', label='test score')
plt.plot(values, tr_scores, '.r--', label='training score')
plt.xlabel('threshold of gini')
plt.ylabel('score')
plt.grid()
plt.legend()


In [None]:
# 使用GridSearchCV类同时寻找最佳的min_impurity_decrease、criterion。
from sklearn.model_selection import GridSearchCV

# Candidate min_impurity_decrease thresholds, one range per split criterion.
entropy_thresholds = np.linspace(0, 1, 50)
gini_thresholds = np.linspace(0, 0.01, 100)

# A list of four separate grids: threshold sweeps for each criterion, a
# max_depth sweep and a min_samples_split sweep.
param_grid = [
    dict(criterion=['entropy'], min_impurity_decrease=entropy_thresholds),
    dict(criterion=['gini'], min_impurity_decrease=gini_thresholds),
    dict(max_depth=range(2, 10)),
    dict(min_samples_split=range(2, 30, 2)),
]

# Search with cv=5 on the full X / y and report the best result.
clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
clf.fit(X, y)
print(f"best param: {clf.best_params_}\nbest score: {clf.best_score_}")

## 4.3. 回归决策树

在sklearn中我们可以用来提高决策树泛化能力的超参数主要有：  
- max_depth:树的最大深度,也就是说当树的深度到达max_depth的时候无论还有多少可以分支的特征,决策树都会停止运算.
- min_samples_split: 节点分裂所需的最小样本数.当节点的样本数量小于该参数时,则不再生成分支.该节点的标签以该节点下数量最多的类别为准 
- min_samples_leaf: 一个叶节点所需要的最少样本数,如果在分支之后,某一个新增叶节点的样本数小于该超参数,则退回,不再进行分支.退回后的叶节点的标签以该叶节点中最多的标签为准
- min_weight_fraction_leaf: 最小的权重系数 
- max_leaf_nodes:最大叶节点数,None时无限制,取整数时,忽略max_depth
- 与分类决策树一样的地方在于,最大深度的增加虽然可以增加对训练集拟合能力的增强,但这也就可能意味着其泛化能力的下降

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

# Build a random dataset: 160 sorted x-values drawn from a seeded generator
# and scaled by 10, with y = sin(x).
rng = np.random.RandomState(1)
X = np.sort(10 * rng.rand(160, 1), axis=0)
y = np.sin(X).ravel()
y[::5] += 2 * (0.5 - rng.rand(32))  # add noise to every fifth point

# Fit three regression trees that differ only in max_depth.
regr_1 = DecisionTreeRegressor(max_depth=2).fit(X, y)
regr_2 = DecisionTreeRegressor(max_depth=5).fit(X, y)
regr_3 = DecisionTreeRegressor(max_depth=8).fit(X, y)

# Predict on an evenly spaced grid of x-values, shaped as a single column.
X_test = np.arange(0.0, 10.0, 0.01).reshape(-1, 1)
y_1, y_2, y_3 = (
    regr_1.predict(X_test),
    regr_2.predict(X_test),
    regr_3.predict(X_test),
)

# Plot the data points together with the three fitted curves.
plt.figure()
plt.scatter(X, y, s=20, edgecolor="black", c="darkorange", label="data")
plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
plt.plot(X_test, y_3, color="r", label="max_depth=8", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()


In [None]:
# 本节结束