## [作業重點]
目前你應該已經要很清楚資料集中，資料的型態是什麼樣子囉！包含特徵 (features) 與標籤 (labels)。因此要記得未來不管什麼專案，必須要把資料清理成相同的格式，才能送進模型訓練。
今天的作業開始踏入決策樹這個非常重要的模型，請務必確保你理解模型中每個超參數的意思，並試著調整看看，對最終預測結果的影響為何

## 作業

1. 試著調整 DecisionTreeClassifier(...) 中的參數，並觀察是否會改變結果？
2. 改用其他資料集 (boston, wine)，並與回歸模型的結果進行比較

In [22]:
from sklearn import datasets, metrics

# 如果是分類問題，請使用 DecisionTreeClassifier，若為回歸問題，請使用 DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# 讀取鳶尾花資料集
iris = datasets.load_iris()

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=4)

# 建立模型
clf = DecisionTreeClassifier()

# 訓練模型
clf.fit(x_train, y_train)

# 預測測試集
y_pred_test = clf.predict(x_test)
y_pred_train = clf.predict(x_train)
acc_train = metrics.accuracy_score(y_train, y_pred_train)
print("Acuuracy_train: ", acc_train)

acc_test = metrics.accuracy_score(y_test, y_pred_test)
print("Acuuracy_test: ", acc_test)

print(iris.feature_names)
print("Feature importance: ", clf.feature_importances_)


Acuuracy_train:  1.0
Acuuracy_test:  0.9736842105263158
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.01796599 0.         0.05992368 0.92211033]


In [165]:
################# 作業 1 #######################################
### 目前 default train acc : 1, test_acc = 0.9736 , 其實已經相當準確, 看看是否能再提高 test_acc
### a.試著讓 training 不要學過多:
# 1. 如果減少 max_depth 會減少 Model overfitting, 但如果減少到2以下會 underfitting, 目前看來設定為 3 較好  
# 2. 如果增加 min_samples_split, min_samples_leaf, min_impurity_decrease, min_impurity_split 也可以減少 overtting 的情況
# ==> 但僅都降低 train acc, 無法提升 test acc
### b.試著調整class_weight, 以處理類別資料量不平衡的情況
# ==> class_weight 調整為 balanced 以處理類別資料量不平衡的情況 ==> 僅增加 training 準確率 

##################################################################
from sklearn import datasets, metrics

# 如果是分類問題，請使用 DecisionTreeClassifier，若為回歸問題，請使用 DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# 讀取鳶尾花資料集
iris = datasets.load_iris()

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=4)

# 建立模型
################################### 參數設定 ###################################
##### 改變 Criterion #####
criterion = 'gini'

### 關於切割設定, 可加快速度/減少overfitting ###
splitter = 'best'
max_features = 0.75 
min_samples_split = 5
min_samples_leaf = 3
min_impurity_decrease = 0.001
min_impurity_split = 1e-7
min_weight_fraction_leaf = 0
### 關於模型架構 ###
max_depth = 3
max_leaf_nodes = None

### Others ###
class_weight = 'balanced'
presort = False
random_state = 0

##################################################################################
clf = DecisionTreeClassifier(criterion = criterion, min_samples_leaf=min_samples_leaf, max_depth = max_depth, max_features = max_features, min_samples_split = min_samples_split, min_impurity_decrease = min_impurity_decrease, min_impurity_split = min_impurity_split, class_weight =class_weight, presort = presort, random_state = random_state)

# 訓練模型
clf.fit(x_train, y_train)

# 預測測試集
y_pred_test = clf.predict(x_test)
y_pred_train = clf.predict(x_train)
acc_train = metrics.accuracy_score(y_train, y_pred_train)
print("Acuuracy_train: ", acc_train)

acc_test = metrics.accuracy_score(y_test, y_pred_test)
print("Acuuracy_test: ", acc_test)

print(iris.feature_names)
print("Feature importance: ", clf.feature_importances_)

Acuuracy_train:  0.9821428571428571
Acuuracy_test:  0.9736842105263158
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.         0.         0.08469539 0.91530461]




In [196]:
############## 作業 2 ###############################
### 在 Wine 這比資料中 Tree 的 結果明顯比 Logistic Regression 好很多, logistic Regression 仍為 Under Fitting
### tree 的 train和 test acc皆已超過 0.9 
### Logistic Regression 仍需做 Feature selection 或其他參數調整才能再進一步提升準確率

###########################################################
##########   wine     ###################
from sklearn.linear_model import LinearRegression, LogisticRegression
wine = datasets.load_wine()
X = wine.data
y = wine.target
print(X.shape, y.shape)
### split data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

############## Linear Model ######################
#########################################
### find significance features

model1 = LogisticRegression(solver = 'saga', multi_class='multinomial')

model1.fit(X_train, y_train)
# 預測測試集
y_pred_test = model1.predict(X_test)
y_pred_train = model1.predict(X_train)
acc_train = metrics.accuracy_score(y_train, y_pred_train)
print("Acuuracy_train_1: ", acc_train)

acc_test = metrics.accuracy_score(y_test, y_pred_test)
print("Acuuracy_test_1: ", acc_test)

############  Tree #####################################################################
################################### 參數設定 ###################################
##### 改變 Criterion #####
criterion = 'gini'

### 關於切割設定, 可加快速度/減少overfitting ###
splitter = 'best'
max_features = 0.75 
min_samples_split = 5
min_samples_leaf = 3
min_impurity_decrease = 0.001
min_impurity_split = 1e-7
min_weight_fraction_leaf = 0
### 關於模型架構 ###
max_depth = 6
max_leaf_nodes = None

### Others ###
class_weight = 'balanced'
presort = False
random_state = 0

tree1 = DecisionTreeClassifier(criterion = criterion, min_samples_leaf=min_samples_leaf, max_depth = max_depth, max_features = max_features, min_samples_split = min_samples_split, min_impurity_decrease = min_impurity_decrease, min_impurity_split = min_impurity_split, class_weight =class_weight, presort = presort, random_state = random_state)
tree1.fit(X_train, y_train)

# 預測測試集
y_pred_test = tree1.predict(X_test)
y_pred_train = tree1.predict(X_train)
acc_train = metrics.accuracy_score(y_train, y_pred_train)
print("Acuuracy_train: ", acc_train)

acc_test = metrics.accuracy_score(y_test, y_pred_test)
print("Acuuracy_test: ", acc_test)

print(iris.feature_names)
print("Feature importance: ", tree1.feature_importances_)

(178, 13) (178,)
Acuuracy_train_1:  0.6901408450704225
Acuuracy_test_1:  0.75
Acuuracy_train:  0.9788732394366197
Acuuracy_test:  0.9166666666666666
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.         0.00818213 0.         0.         0.         0.
 0.00487901 0.         0.03219561 0.12169062 0.         0.42291308
 0.41013955]




In [426]:
#################### 作業 2 ##########################################
### 在 Boston 這比資料中 2 方法 testing r2 差不多, 但是 tree 很明顯有嚴重 overfitting, 需要進行參數調整以減少 overfitting 現象
### 而 regression model 為 under fitting 看來需要增加有意義(非線性關係) feature 以提升 r2

##########   boston     ###################
#########################################
#####################################
boston = datasets.load_boston()
X = boston.data
y = boston.target
print(X.shape, y.shape)

### split data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2 , random_state = 0)
################# Linear Model ##########################################
model2 = LinearRegression()
model2.fit(X_train, y_train)
y_pred_test = model2.predict(X_test)
y_pred_train = model2.predict(X_train)
print('training R2:', model2.score(X_train, y_train))
print('testing R2:', model2.score(X_test, y_test))

############### Tree ########################
##### 改變 Criterion #####
criterion = 'mse'

### 關於切割設定, 可加快速度/減少overfitting ###
splitter = 'best'
max_features = None 
min_samples_split = 25
min_samples_leaf = 10
min_impurity_decrease = 0
min_impurity_split = 0
min_weight_fraction_leaf = 0
### 關於模型架構 ###
max_depth = 10
max_leaf_nodes = 15

### Others ###
presort = False
random_state = 0
tree2 = DecisionTreeRegressor(criterion = criterion, min_samples_leaf=min_samples_leaf, max_depth = max_depth, max_features = max_features, min_samples_split = min_samples_split, min_impurity_decrease = min_impurity_decrease, min_impurity_split = min_impurity_split, presort = presort, random_state = random_state, max_leaf_nodes = max_leaf_nodes)
tree2.fit(X_train, y_train)

y_pred_test = tree2.predict(X_test)
y_pred_train = tree2.predict(X_train)
print('training R2:', tree2.score(X_train, y_train))
print('testing R2:', tree2.score(X_test, y_test))

(506, 13) (506,)
training R2: 0.7729718726571158
testing R2: 0.5892011519186435
training R2: 0.8592673987932495
testing R2: 0.6175234803465643


