# Decision Tree

In [3]:
import pandas
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [2]:
automobile = pandas.read_csv('../1015_Numpy&Pandas/automobile.csv')
variables = ['bore', 'city_mpg', 'compression_ratio', 'curb_weight', 'engine_size',
             'horsepower', 'peak_rpm', 'city_mpg', 'price']
X = automobile[variables]
y = automobile['doors']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [4]:
tree = DecisionTreeClassifier()

 - max_features : 질문할 수 있는 갯수를 제한할 수 있다.
 - max_depth : 깊이제한.
 - max_leaf_nodes : leaf Node의 갯수를 제한할 수있다. 

In [5]:
tree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [8]:
y_tree = tree.predict(X_test)

## 모형 평가

In [10]:
metrics.confusion_matrix(y_test,y_tree)

array([[25, 14],
       [ 9, 16]])

In [11]:
metrics.accuracy_score(y_test, y_tree)

0.640625

In [12]:
metrics.precision_score(y_test, y_tree, pos_label='four')

0.73529411764705888

In [13]:
metrics.recall_score(y_test, y_tree, pos_label='four')

0.64102564102564108

In [14]:
metrics.f1_score(y_test, y_tree, pos_label='four')

0.68493150684931514

In [15]:
tree.feature_importances_

array([ 0.05872253,  0.03338675,  0.        ,  0.44516367,  0.12313238,
        0.        ,  0.04956155,  0.02899878,  0.26103434])

# 앙상블 (Ensemble)
## Random Forest (랜덤포레스트)
 - 하나의 모델일 경우 Under Fitting or Overfitting 가능성이 너무 높다 
 - 주식 시장의 포트폴리오와 비슷한 개념이다. ( 재태크의 기본 )

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
rf = RandomForestClassifier(n_estimators=10, random_state=0) # n_estimators : Tree의 수 , random_state 수업시간에 결과가 같아야 하므로 고정.

In [18]:
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [19]:
y_rf = rf.predict(X_test)

# 모델평가

In [20]:
metrics.confusion_matrix(y_test,y_rf)

array([[29, 10],
       [11, 14]])

In [21]:
metrics.accuracy_score(y_test,y_rf)

0.671875

In [23]:
metrics.precision_score(y_test, y_rf, pos_label='four')

0.72499999999999998

In [27]:
metrics.recall_score(y_test, y_rf, pos_label='four')

0.74358974358974361

In [28]:
metrics.f1_score(y_test, y_rf, pos_label='four')

0.73417721518987333

# Gradient Boosting Tree
 - <a href="https://xgboost.readthedocs.io/en/latest/">XGBoost</a>가 가장 좋은 library
 - 설치가 어렵다 ( 해보자 )

In [29]:
from sklearn.ensemble import GradientBoostingClassifier

In [30]:
gb = GradientBoostingClassifier(n_estimators=10, random_state=0)

In [31]:
gb.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=10, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False)

In [32]:
y_gb = gb.predict(X_test)

## 모형 평가 

In [33]:
metrics.confusion_matrix(y_test,y_gb)

array([[30,  9],
       [11, 14]])

In [34]:
metrics.accuracy_score(y_test,y_gb)

0.6875

In [37]:
metrics.precision_score(y_test,y_gb,pos_label="four")

0.73170731707317072

In [38]:
metrics.recall_score(y_test,y_gb,pos_label="four")

0.76923076923076927

In [40]:
metrics.f1_score(y_test,y_gb,pos_label="four")

0.74999999999999989

# 이산변수 / 범주형 변수 
 - dummy Coding : One hot encoding 
 - $Ex)$ 연로 : 가솔린 - 0, 디젤 - 1 , a b c => a : [1, 0, 0] b: [0,1,0] c: [ 0,0,1 ]

In [41]:
discrete = []
for v in automobile.columns:
    if automobile[v].dtype == object and v != 'doors':
        discrete.append(v)

In [42]:
discrete

['maker',
 'fuel',
 'aspiration',
 'body',
 'wheels',
 'engine_location',
 'engine_type',
 'cylinders',
 'fuel_system']

In [43]:
dummy = pandas.get_dummies(automobile[discrete])

In [50]:
dummy.head()

Unnamed: 0,maker_audi,maker_bmw,maker_chevrolet,maker_dodge,maker_honda,maker_jaguar,maker_mazda,maker_mercedes-benz,maker_mitsubishi,maker_nissan,...,cylinders_five,cylinders_four,cylinders_six,cylinders_three,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi
0,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [44]:
X_all = pandas.concat([X,dummy], axis=1)

In [45]:
X_all_train, X_all_test, y_train, y_test = train_test_split(X_all, y, test_size=0.4, random_state=0)

## 모든 변수로 훈련 

In [46]:
from sklearn.svm import SVC

# SVC

In [47]:
model = SVC(kernel='rbf')
model.fit(X_all_train, y_train)
y_pred = model.predict(X_all_test)

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, pos_label='four'))
print(metrics.recall_score(y_test, y_pred, pos_label='four'))
print(metrics.f1_score(y_test, y_pred, pos_label='four'))

[[39  0]
 [24  1]]
0.625
0.619047619048
1.0
0.764705882353


#  DecisionTreeClassifier

In [48]:
model = DecisionTreeClassifier(random_state=0)
model.fit(X_all_train, y_train)
y_pred = model.predict(X_all_test)

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, pos_label='four'))
print(metrics.recall_score(y_test, y_pred, pos_label='four'))
print(metrics.f1_score(y_test, y_pred, pos_label='four'))

[[31  8]
 [10 15]]
0.71875
0.756097560976
0.794871794872
0.775


# RandomForestClassifier

In [51]:
model = RandomForestClassifier(random_state=0)
model.fit(X_all_train, y_train)
y_pred = model.predict(X_all_test)

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, pos_label='four'))
print(metrics.recall_score(y_test, y_pred, pos_label='four'))
print(metrics.f1_score(y_test, y_pred, pos_label='four'))

[[34  5]
 [ 8 17]]
0.796875
0.809523809524
0.871794871795
0.83950617284


# GradientBoostingClassifier

In [52]:
model = GradientBoostingClassifier(random_state=0)
model.fit(X_all_train, y_train)
y_pred = model.predict(X_all_test)

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, pos_label='four'))
print(metrics.recall_score(y_test, y_pred, pos_label='four'))
print(metrics.f1_score(y_test, y_pred, pos_label='four'))

[[33  6]
 [ 6 19]]
0.8125
0.846153846154
0.846153846154
0.846153846154
