# Decision Tree

In [1]:
import pandas
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

## Making Data and Split Train/Test Data

In [3]:
automobile = pandas.read_csv('../1015_Numpy&Pandas/automobile.csv')
variables = ['bore', 'city_mpg', 'compression_ratio', 'curb_weight', 'engine_size',
             'horsepower', 'peak_rpm', 'city_mpg', 'price']
X = automobile[variables]
y = automobile['doors']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [4]:
tree = DecisionTreeClassifier()

 - max_features : 질문할 수 있는 갯수를 제한할 수 있다.
 - max_depth : 깊이제한.
 - max_leaf_nodes : leaf Node의 갯수를 제한할 수있다. 

In [5]:
tree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [6]:
y_tree = tree.predict(X_test)

## 모형 평가

In [7]:
metrics.confusion_matrix(y_test,y_tree)

array([[26, 13],
       [10, 15]])

In [8]:
metrics.accuracy_score(y_test, y_tree)

0.640625

In [9]:
metrics.precision_score(y_test, y_tree, pos_label='four')

0.72222222222222221

In [10]:
metrics.recall_score(y_test, y_tree, pos_label='four')

0.66666666666666663

In [11]:
metrics.f1_score(y_test, y_tree, pos_label='four')

0.69333333333333336

 - 입력된 변수의 순거에 따라 중요도 표시 

In [12]:
tree.feature_importances_

array([ 0.05872253,  0.06238553,  0.        ,  0.43066428,  0.12313238,
        0.04349817,  0.04956155,  0.        ,  0.23203556])

# 앙상블 (Ensemble)
## Random Forest (랜덤포레스트)
 - 하나의 모델일 경우 Under Fitting or Overfitting 가능성이 너무 높다 
 - 주식 시장의 포트폴리오와 비슷한 개념이다. ( 재태크의 기본 )

In [13]:
from sklearn.ensemble import RandomForestClassifier

 - n_estimators : Tree의 수 , random_state 수업시간에 결과가 같아야 하므로 고정.

In [14]:
rf = RandomForestClassifier(n_estimators=10, random_state=0)

In [15]:
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [16]:
y_rf = rf.predict(X_test)

# 모델평가

In [17]:
metrics.confusion_matrix(y_test,y_rf)

array([[29, 10],
       [11, 14]])

In [18]:
metrics.accuracy_score(y_test,y_rf)

0.671875

In [19]:
metrics.precision_score(y_test, y_rf, pos_label='four')

0.72499999999999998

In [20]:
metrics.recall_score(y_test, y_rf, pos_label='four')

0.74358974358974361

In [21]:
metrics.f1_score(y_test, y_rf, pos_label='four')

0.73417721518987333

# Gradient Boosting Tree
 - <a href="https://xgboost.readthedocs.io/en/latest/">XGBoost</a>가 가장 좋은 library
 - 설치가 어렵다 ( 해보자 )

In [22]:
from sklearn.ensemble import GradientBoostingClassifier

In [23]:
gb = GradientBoostingClassifier(n_estimators=10, random_state=0)

In [24]:
gb.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=10, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False)

In [25]:
y_gb = gb.predict(X_test)

## 모형 평가 

In [26]:
metrics.confusion_matrix(y_test,y_gb)

array([[30,  9],
       [11, 14]])

In [27]:
metrics.accuracy_score(y_test,y_gb)

0.6875

In [28]:
metrics.precision_score(y_test,y_gb,pos_label="four")

0.73170731707317072

In [29]:
metrics.recall_score(y_test,y_gb,pos_label="four")

0.76923076923076927

In [30]:
metrics.f1_score(y_test,y_gb,pos_label="four")

0.74999999999999989

# 이산변수 / 범주형 변수 
 - dummy Coding : One hot encoding 
 - $Ex)$ 연로 : 가솔린 - 0, 디젤 - 1 , a b c => a : [1, 0, 0] b: [0,1,0] c: [ 0,0,1 ]

In [32]:
discrete = []
for v in automobile.columns:
    if automobile[v].dtype == object and v != 'doors':
        discrete.append(v)

In [33]:
discrete

['maker',
 'fuel',
 'aspiration',
 'body',
 'wheels',
 'engine_location',
 'engine_type',
 'cylinders',
 'fuel_system']

 - 명목형 변수 

In [34]:
automobile[discrete].head()

Unnamed: 0,maker,fuel,aspiration,body,wheels,engine_location,engine_type,cylinders,fuel_system
0,audi,gas,std,sedan,fwd,front,ohc,four,mpfi
1,audi,gas,std,sedan,4wd,front,ohc,five,mpfi
2,audi,gas,std,sedan,fwd,front,ohc,five,mpfi
3,audi,gas,turbo,sedan,fwd,front,ohc,five,mpfi
4,bmw,gas,std,sedan,rwd,front,ohc,four,mpfi


In [36]:
dummy = pandas.get_dummies(automobile[discrete])

In [37]:
dummy.head()

Unnamed: 0,maker_audi,maker_bmw,maker_chevrolet,maker_dodge,maker_honda,maker_jaguar,maker_mazda,maker_mercedes-benz,maker_mitsubishi,maker_nissan,...,cylinders_five,cylinders_four,cylinders_six,cylinders_three,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


 - X : 이산형 변수만 포함한 Data

In [39]:
X_all = pandas.concat([X,dummy], axis=1)

In [40]:
X_all.head(3)

Unnamed: 0,bore,city_mpg,compression_ratio,curb_weight,engine_size,horsepower,peak_rpm,city_mpg.1,price,maker_audi,...,cylinders_five,cylinders_four,cylinders_six,cylinders_three,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi
0,3.19,24,10.0,2337,109,102,5500,24,13950,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3.19,18,8.0,2824,136,115,5500,18,17450,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3.19,19,8.5,2844,136,110,5500,19,17710,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [41]:
X_all_train, X_all_test, y_train, y_test = train_test_split(X_all, y, test_size=0.4, random_state=0)

## 모든 변수로 훈련 

In [42]:
from sklearn.svm import SVC

# SVC

In [43]:
model = SVC(kernel='rbf')
model.fit(X_all_train, y_train)
y_pred = model.predict(X_all_test)

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, pos_label='four'))
print(metrics.recall_score(y_test, y_pred, pos_label='four'))
print(metrics.f1_score(y_test, y_pred, pos_label='four'))

[[39  0]
 [24  1]]
0.625
0.619047619048
1.0
0.764705882353


#  DecisionTreeClassifier

In [44]:
model = DecisionTreeClassifier(random_state=0)
model.fit(X_all_train, y_train)
y_pred = model.predict(X_all_test)

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, pos_label='four'))
print(metrics.recall_score(y_test, y_pred, pos_label='four'))
print(metrics.f1_score(y_test, y_pred, pos_label='four'))

[[31  8]
 [10 15]]
0.71875
0.756097560976
0.794871794872
0.775


# RandomForestClassifier

In [45]:
model = RandomForestClassifier(random_state=0)
model.fit(X_all_train, y_train)
y_pred = model.predict(X_all_test)

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, pos_label='four'))
print(metrics.recall_score(y_test, y_pred, pos_label='four'))
print(metrics.f1_score(y_test, y_pred, pos_label='four'))

[[34  5]
 [ 8 17]]
0.796875
0.809523809524
0.871794871795
0.83950617284


# GradientBoostingClassifier

In [46]:
model = GradientBoostingClassifier(random_state=0)
model.fit(X_all_train, y_train)
y_pred = model.predict(X_all_test)

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, pos_label='four'))
print(metrics.recall_score(y_test, y_pred, pos_label='four'))
print(metrics.f1_score(y_test, y_pred, pos_label='four'))

[[33  6]
 [ 6 19]]
0.8125
0.846153846154
0.846153846154
0.846153846154
