구해야 하는 요소
- accuracy
- precision
- recall
- f1

In [75]:
import numpy as np
import pandas as pd

In [76]:
# Load the thoracic-surgery data. header=None tells pandas the file has no
# header row, so the columns get integer labels (0..17 in the preview below).
df = pd.read_csv('ThoraricSurgery.csv', header=None)
df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,293,1,3.8,2.8,0,0,0,0,0,0,12,0,0,0,1,0,62,0
1,1,2,2.88,2.16,1,0,0,0,1,1,14,0,0,0,1,0,60,0
2,8,2,3.19,2.5,1,0,0,0,1,0,11,0,0,1,1,0,66,1


In [77]:
# Count missing values per column (axis=0 sums down the rows of each column).
df.isna().sum(axis=0)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
dtype: int64

In [78]:
# Split the frame into features and target by position: the last column is the
# label and every column before it is a feature. Selecting both positionally
# keeps the two lines consistent and avoids hard-coding the label 17.
# NOTE(review): column 0 holds values such as 293, 1, 8 in the preview above,
# which looks like a record id rather than a measurement — confirm that it
# should be used as a feature.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()
X.shape, y.shape

((470, 17), (470,))

In [79]:
# Class balance of the target column (label 17): the output shows 400 rows of
# class 0 against 70 rows of class 1.
df.loc[:, 17].value_counts()

0    400
1     70
Name: 17, dtype: int64

In [80]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2021,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((376, 17), (94, 17), (376,), (94,))

In [81]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest: every hyper-parameter is left at the library default,
# only the seed is pinned. The full parameter set is displayed for reference.
rfc = RandomForestClassifier(random_state=2021)
rfc.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2021,
 'verbose': 0,
 'warm_start': False}

In [82]:
# Train the baseline forest on the training split.
rfc.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=2021)

In [83]:
# Score the baseline forest on the held-out rows.
# The value shown (0.851...) equals 80/94, the share of class-0 rows in the test
# split, so compare it with the always-0 baseline further down before reading
# it as a good result.
rfc.score(X=X_test, y=y_test)

0.851063829787234

In [84]:
# First search grid: tree depths 3..7 crossed with min_samples_split 2..4.
params = dict(
    max_depth=[3, 4, 5, 6, 7],
    min_samples_split=[2, 3, 4],
)

In [85]:
from sklearn.model_selection import GridSearchCV

# 3-fold cross-validated grid search over `params`, ranking candidates by accuracy.
# NOTE(review): the labels are heavily imbalanced (400 vs 70), so accuracy
# rewards predicting the majority class; a metric such as 'f1' or 'recall' may
# match the goal better — confirm intent before changing.
grid_rf = GridSearchCV(estimator=rfc, param_grid=params, cv=3, scoring='accuracy')
grid_rf.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=2021),
             param_grid={'max_depth': [3, 4, 5, 6, 7],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [86]:
# Winning combination from the first search. The output shows max_depth=3, the
# smallest depth offered; the following cell's grid also tries depth 2.
grid_rf.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [87]:
# Second search grid: shallower trees (depth 2..4), same min_samples_split values.
params = dict(
    max_depth=[2, 3, 4],
    min_samples_split=[2, 3, 4],
)

In [88]:
# Repeat the 3-fold grid search with the current `params` grid, again ranking
# candidates by accuracy; this replaces the previously fitted `grid_rf`.
grid_rf = GridSearchCV(estimator=rfc, param_grid=params, cv=3, scoring='accuracy')
grid_rf.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [89]:
# Winning combination from the second search; the output again shows the
# smallest depth offered (max_depth=2).
grid_rf.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [90]:
# Score the tuned search object on the held-out rows. The value shown is
# identical to the untuned forest's score above (0.851...).
grid_rf.score(X=X_test, y=y_test)

0.851063829787234

In [91]:
# Inspect one held-out sample (row 80 of the test split): its true label and its
# 17 feature values.
y_test[80], X_test[80]

(0,
 array([106.  ,   5.  ,   4.95,   4.12,   1.  ,   0.  ,   0.  ,   0.  ,
          0.  ,   1.  ,  11.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
         57.  ]))

In [92]:
# X_test[80] (shown in the previous cell) is a 1-D array, but predict() expects
# a 2-D array of shape (n_samples, n_features). The single row is therefore
# reshaped into a one-row matrix — e.g. [2, 3, 4, 5] with shape (4,) becomes
# [[2, 3, 4, 5]] with shape (1, 4).
# Passing -1 lets NumPy infer the column count instead of hard-coding 17.
grid_rf.predict(X_test[80].reshape(1, -1))

array([0], dtype=int64)

In [93]:
from sklearn.base import BaseEstimator

class MyFakeClassifier(BaseEstimator):
    """Dummy classifier that ignores its training data and always predicts 0.

    It serves as a baseline: whatever score it reaches is obtainable without
    learning anything from the features.
    """

    def fit(self, X, y):
        """Learn nothing; exists only so the object can be used like an estimator.

        Parameters
        ----------
        X : array-like
            Training features (unused).
        y : array-like
            Training labels (unused).

        Returns
        -------
        self : MyFakeClassifier
            The estimator itself, following the scikit-learn convention so that
            calls such as ``clf.fit(X, y).predict(X)`` can be chained.
        """
        return self

    def predict(self, X):
        """Predict class 0 for every sample in ``X``.

        Parameters
        ----------
        X : array-like
            Samples to predict; only the length of its first axis is used.

        Returns
        -------
        numpy.ndarray
            1-D integer array of zeros with one entry per sample.
        """
        # np.shape also accepts plain nested lists, which have no .shape attribute.
        n_samples = np.shape(X)[0]
        return np.zeros(n_samples, dtype=int)

In [94]:
# Class counts in the training split; the output shows 320 zeros and 56 ones.
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([320,  56], dtype=int64))

In [95]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default settings (seed pinned): fit on the training split,
# then predict the held-out rows. fit() returns the estimator, so the two calls
# are chained and `dtc` remains the fitted tree.
dtc = DecisionTreeClassifier(random_state=2021)
pred_dt = dtc.fit(X_train, y_train).predict(X_test)

In [96]:
from sklearn.metrics import accuracy_score
# Accuracy of the decision tree on the held-out rows.
accuracy_score(y_true=y_test, y_pred=pred_dt)

0.6808510638297872

In [97]:
# Run the always-0 baseline through the same fit/predict steps as a real model.
# fit() and predict() are kept as separate statements on purpose, so this cell
# does not depend on what fit() returns.
myc = MyFakeClassifier()
myc.fit(X=X_train, y=y_train)
pred_my = myc.predict(X=X_test)

In [98]:
# Accuracy of the always-0 baseline. The value shown (0.851...) is higher than
# the decision tree's 0.681, which illustrates how misleading accuracy is on
# imbalanced labels.
accuracy_score(y_true=y_test, y_pred=pred_my)

0.851063829787234

In [99]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the decision tree: rows are true classes, columns are
# predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred_dt)

array([[61, 19],
       [11,  3]], dtype=int64)

In [100]:
# Confusion matrix for the always-0 baseline: the second column (predicted 1)
# is all zeros in the output, i.e. none of the 14 positive rows is detected.
confusion_matrix(y_true=y_test, y_pred=pred_my)

array([[80,  0],
       [14,  0]], dtype=int64)

In [101]:
from sklearn.metrics import precision_score, recall_score
# Precision and recall of the decision tree for the positive class. From the
# confusion matrix above: precision = 3 / (3 + 19), recall = 3 / (3 + 11).
precision_score(y_true=y_test, y_pred=pred_dt), recall_score(y_true=y_test, y_pred=pred_dt)

(0.13636363636363635, 0.21428571428571427)

In [102]:
from sklearn.metrics import f1_score
# F1 (harmonic mean of precision and recall) of the decision tree, printed to
# four decimal places.
f1 = f1_score(y_true=y_test, y_pred=pred_dt)
print(f'F1 스코어: {f1:.4f}')

F1 스코어: 0.1667
