## Decision Tree

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
Boston = datasets.load_boston()

X = Boston.data
y = Boston.target

### split training set and testing set

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19)

### Standardzation

In [4]:
from sklearn.preprocessing import StandardScaler

standard = StandardScaler()
standard.fit(X_train)
X_train_standard = standard.transform(X_train)
X_test_standard = standard.transform(X_test)

### 使用 sklearn 中的 decision tree

In [5]:
from sklearn.tree import DecisionTreeRegressor

dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train_standard, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

### 过拟合

In [6]:
# 使用 sklearn 封装好的评估指标
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

def caculate(y_true, y_predict):
    print('MSE: ', mean_squared_error(y_true, y_predict))
    print('RMSE: ', np.sqrt(mean_squared_error(y_true, y_predict)))
    print('MAE: ', mean_absolute_error(y_true, y_predict))
    print('R2: ', r2_score(y_true, y_predict))

In [7]:
# 训练集
y_train_prediction = dt_reg.predict(X_train_standard)
caculate(y_train, y_train_prediction)

MSE:  0.0
RMSE:  0.0
MAE:  0.0
R2:  1.0


In [8]:
# 测试集
y_test_prediction = dt_reg.predict(X_test_standard)
caculate(y_test, y_test_prediction)

MSE:  23.02245098039215
RMSE:  4.798171628901175
MAE:  3.2872549019607846
R2:  0.7799687103908468


## 使用 网格搜索 找到最好的参数

In [9]:
best_score = 0.0
best_depth = 0

for i in range(1, 21):
    dt_reg = DecisionTreeRegressor(max_depth=i)
    dt_reg.fit(X_train_standard, y_train)
    score = dt_reg.score(X_test_standard, y_test)
    if best_score < score:
        best_score = score
        best_depth = i
        
        best_train_predict = dt_reg.predict(X_train_standard)
        best_test_predict = dt_reg.predict(X_test_standard)

In [10]:
best_score

0.8433221596826636

In [11]:
best_depth

6

In [12]:
print('training set:')
caculate(y_train, best_train_predict)

training set:
MSE:  4.1524520233550195
RMSE:  2.0377566153383038
MAE:  1.5414760749116927
R2:  0.9473246368294238


In [13]:
print('testing set:')
caculate(y_test, best_test_predict)

testing set:
MSE:  16.393613403016357
RMSE:  4.048902740622002
MAE:  2.6645430899418274
R2:  0.8433221596826636
