## 作業

1. 試著調整 RandomForestClassifier(...) 中的參數，並觀察是否會改變結果？
2. 改用其他資料集 (boston, wine)，並與回歸模型與決策樹的結果進行比較

In [1]:
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# 讀取鳶尾花資料集
iris = datasets.load_iris()

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=4)

# 建立模型
clf = RandomForestClassifier(n_estimators=3)

# 訓練模型
clf.fit(x_train, y_train)

# 預測測試集
y_pred = clf.predict(x_test)

In [3]:
acc = metrics.accuracy_score(y_test, y_pred)
print("Acuuracy: ", acc)
print(iris.feature_names)
print("Feature importance: ", clf.feature_importances_)

Acuuracy:  0.973684210526
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [ 0.02527851  0.09177927  0.67758762  0.2053546 ]


## n_estimators tuning

In [4]:
n_es = [3,5,10,50,100]
acc={}
for n_es_sel in n_es:
    clf = RandomForestClassifier(n_estimators=n_es_sel,random_state=4)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc[n_es_sel] = metrics.accuracy_score(y_test, y_pred)
acc_pd = pd.DataFrame(data=acc,index=[0])
acc_pd

Unnamed: 0,3,5,10,50,100
0,0.947368,0.973684,0.973684,0.973684,0.973684


## criterion

In [5]:
crit = ['gini','entropy']
acc={}
for crit_sel in crit:
    clf = RandomForestClassifier(n_estimators=3,criterion=crit_sel,random_state=4)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc[crit_sel] = metrics.accuracy_score(y_test, y_pred)
acc_pd = pd.DataFrame(data=acc,index=[0])
acc_pd

Unnamed: 0,entropy,gini
0,0.973684,0.947368


## bootstrap

In [6]:
crit = [True,False]
acc={}
for crit_sel in crit:
    clf = RandomForestClassifier(n_estimators=3,bootstrap=crit_sel,random_state=4)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc[crit_sel] = metrics.accuracy_score(y_test, y_pred)
acc_pd = pd.DataFrame(data=acc,index=[0])
acc_pd

Unnamed: 0,False,True
0,0.973684,0.947368


## max_depth

In [7]:
crit = [2,3,4,5,6,8,10]
acc={}
for crit_sel in crit:
    clf = RandomForestClassifier(n_estimators=5,max_depth=crit_sel,random_state=4)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc[crit_sel] = metrics.accuracy_score(y_test, y_pred)
acc_pd = pd.DataFrame(data=acc,index=[0])
acc_pd

Unnamed: 0,2,3,4,5,6,8,10
0,0.868421,0.921053,0.973684,0.973684,0.973684,0.973684,0.973684


## 作業2

In [8]:
bc = datasets.load_breast_cancer()
bc_pd = pd.DataFrame(data=bc.data,columns=bc.feature_names)
bc_pd2 = (bc_pd - bc_pd.mean())/bc_pd.std()
bc_pd2.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,1.0961,-2.071512,1.268817,0.98351,1.567087,3.280628,2.650542,2.530249,2.215566,2.253764,...,1.885031,-1.358098,2.301575,1.999478,1.306537,2.614365,2.107672,2.294058,2.748204,1.935312
1,1.828212,-0.353322,1.684473,1.90703,-0.826235,-0.486643,-0.023825,0.547662,0.001391,-0.867889,...,1.80434,-0.368879,1.533776,1.888827,-0.375282,-0.430066,-0.14662,1.086129,-0.243675,0.280943
2,1.578499,0.455786,1.565126,1.557513,0.941382,1.052,1.36228,2.03544,0.938859,-0.397658,...,1.510541,-0.023953,1.346291,1.455004,0.526944,1.08198,0.854222,1.953282,1.151242,0.201214
3,-0.768233,0.253509,-0.592166,-0.763792,3.280667,3.399917,1.914213,1.450431,2.864862,4.906602,...,-0.281217,0.133866,-0.24972,-0.549538,3.391291,3.889975,1.987839,2.173873,6.040726,4.930672
4,1.748758,-1.150804,1.775011,1.824624,0.280125,0.538866,1.369806,1.427237,-0.009552,-0.561956,...,1.297434,-1.465481,1.337363,1.219651,0.220362,-0.313119,0.61264,0.728618,-0.86759,-0.396751


In [9]:
X = bc_pd2.values
y = bc.target
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)

# 建立模型
clf = RandomForestClassifier()

# 訓練模型
clf.fit(X_train, y_train)

# 預測測試集
y_pred = clf.predict(X_test)

In [10]:
acc = metrics.accuracy_score(y_test, y_pred)
print("Acuuracy: ", acc)

Acuuracy:  0.937062937063
