In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [5]:
# Load the Titanic example dataset through seaborn and preview the first rows.
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### 1. 데이터 전처리

* Feature selection

In [6]:
# Keep only the target and the candidate feature columns. The explicit .copy()
# makes df an independent frame, so the column assignments performed later do
# not raise SettingWithCopyWarning or depend on view-vs-copy semantics.
df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'deck']].copy()
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


* 결측치 처리

In [7]:
# Count missing values per column
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
deck        688
dtype: int64

In [8]:
# Replace missing values in the age column with the column mean.
# The result is assigned back explicitly: df.age.fillna(..., inplace=True) is a
# chained in-place assignment that only modifies a temporary Series under
# pandas Copy-on-Write, leaving df unchanged.
df['age'] = df['age'].fillna(df['age'].mean())
df['age'].isna().sum()

0

In [10]:
# embarked will be filled with its most frequent value; inspect the counts first

df.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [11]:
# Fill missing embarked values with the most frequent port, computed from the
# data rather than hard-coded. The result is assigned back explicitly because
# fillna(inplace=True) on df.embarked is a chained in-place assignment that may
# only modify a temporary Series.
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])
df['embarked'].isna().sum()

0

In [13]:
# Drop the deck column. Rebinding df (instead of inplace=True) avoids mutating
# the frame object in place, and ending the cell with df displays the result;
# drop(..., inplace=True) returns None and would display nothing.
df = df.drop(columns=['deck'])
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.000000,1,0,7.2500,S
1,1,1,female,38.000000,1,0,71.2833,C
2,1,3,female,26.000000,0,0,7.9250,S
3,1,1,female,35.000000,1,0,53.1000,S
4,0,3,male,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S
887,1,1,female,19.000000,0,0,30.0000,S
888,0,3,female,29.699118,1,2,23.4500,S
889,1,1,male,26.000000,0,0,30.0000,C


In [15]:
# Total number of missing values over the whole frame
df.isna().sum().sum()

0

* 카테고리 값인 sex, embarked 컬럼은 숫자로 변환

In [16]:
# Convert the categorical columns to integers with LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [17]:
# Encode each categorical column with its own LabelEncoder. Refitting a single
# encoder on the second column would discard the mapping learned for the first,
# making it impossible to decode the sex codes later.
label_encoders = {}
for column in ('sex', 'embarked'):
    label_encoders[column] = LabelEncoder().fit(df[column])
    df[column] = label_encoders[column].transform(df[column])

# Keep `le` pointing at the encoder fitted on embarked, as it was before.
le = label_encoders['embarked']
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


### 2. Train/Test dataset으로 분리

In [38]:
# Build X and y as NumPy arrays (without the conversion they remain pandas objects)

X = df.iloc[:, 1:].to_numpy()
y = df['survived'].to_numpy()
X.shape, y.shape

((891, 7), (891,))

In [39]:
# Distribution of the target values
df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [40]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; stratify=y is passed so the split
# follows the class labels, and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2021
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 7), (179, 7), (712,), (179,))

In [41]:
# Class labels and their counts in the training split
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([439, 273], dtype=int64))

In [42]:
# Cross-multiplication check of the stratified split: the class counts are
# 549/342 in the full data and 439/273 in the training split, so the two
# products are close when the class ratio is preserved.
print(342*439, 549*273)

150138 149877


### 3. RandomForest 모델로 학습

In [43]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters; only the seed is fixed.
rfc = RandomForestClassifier(random_state=2021)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2021,
 'verbose': 0,
 'warm_start': False}

In [44]:
# Train the random forest on the training split
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=2021)

### 4. 모델 예측 및 평가

In [45]:
# Accuracy of the default forest on the held-out test split
rfc.score(X_test, y_test)

0.8100558659217877

### 5. 3, 4 대신에 GridSearchCV 수행

In [49]:
# Coarse hyper-parameter grid: 4 depths x 3 split thresholds = 12 combinations
params = {
    'max_depth' : [2, 4, 6, 8],
    'min_samples_split' : [2, 4, 6]
}

In [50]:
from sklearn.model_selection import GridSearchCV
# Search the grid in params with 5-fold cross-validation, scored by accuracy
grid_rf = GridSearchCV(rfc, param_grid = params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=2021),
             param_grid={'max_depth': [2, 4, 6, 8],
                         'min_samples_split': [2, 4, 6]},
             scoring='accuracy')

In [51]:
# Best parameter combination found by the search
grid_rf.best_params_

{'max_depth': 8, 'min_samples_split': 4}

In [53]:
# Second, finer grid; note that this rebinds both params and grid_rf.
params = {
    'max_depth' : [7, 8, 9, 10],
    'min_samples_split' : [3, 4, 5]
}

grid_rf = GridSearchCV(rfc, param_grid = params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)
grid_rf.best_params_

{'max_depth': 8, 'min_samples_split': 4}

In [54]:
# Take the best estimator found by the search and score it on the test split
best_rf = grid_rf.best_estimator_
best_rf.score(X_test, y_test)

0.8100558659217877

### 6. 테스트 데이터에 적용

In [55]:
# Inspect one test sample and its true label
X_test[27], y_test[27]

(array([ 3.    ,  1.    , 32.    ,  0.    ,  0.    , 56.4958,  2.    ]), 1)

In [57]:
# grid_rf.predict(X_test[27]) does not work: predict needs 2-D input but X_test[27] is 1-D,
# so the single sample is reshaped into one row first

grid_rf.predict(X_test[27].reshape(1, -1))

array([0], dtype=int64)

In [59]:
# Predict another single sample and show its true label next to the prediction
grid_rf.predict(X_test[12].reshape(1, -1)), y_test[12]

(array([0], dtype=int64), 0)

In [60]:
# Feature values of that sample
X_test[12]

array([ 2. ,  1. , 16. ,  0. ,  0. , 10.5,  2. ])

### 7. 엉터리 분류기
- 여성이면 생존이라 예측, 그 외의 경우는 사망으로 예측

In [63]:
# Mean of survived (i.e. survival rate) per encoded sex value
df.groupby('sex')['survived'].mean()

sex
0    0.742038
1    0.188908
Name: survived, dtype: float64

In [66]:
# Shapes of the test split and of the full feature matrix
X_test.shape, y_test.shape, X.shape

((179, 7), (179,), (891, 7))

In [87]:
# Number of test rows (the length predict() has to return)
X_test.shape[0]

179

In [79]:
from sklearn.base import BaseEstimator

class MyClassifier(BaseEstimator):
    """Rule-based baseline: predict 1 (survived) for women and 0 otherwise.

    Only fit() and predict() are overridden and nothing is learned from the
    data. The rule reads column 1 of X, which must hold the encoded sex with
    0 meaning female.
    """

    def fit(self, X, y=None):
        """Learn nothing and return self.

        Returning self instead of None follows the scikit-learn estimator
        convention and keeps ``MyClassifier().fit(X, y).predict(X)`` working.
        """
        return self

    def predict(self, X):
        """Return a float array holding 1.0 where X[:, 1] == 0 and 0.0 elsewhere."""
        X = np.asarray(X)
        # One vectorized comparison instead of a Python loop over the rows;
        # the float dtype (0.0 / 1.0) is kept so downstream output is unchanged.
        return (X[:, 1] == 0).astype(float)

In [88]:
# Scratch check: np.zeros returns a float array filled with zeros
np.zeros(5)

array([0., 0., 0., 0., 0.])

In [80]:
# Instantiate the rule-based classifier, call fit, and predict on the test split
my_clf = MyClassifier()
my_clf.fit(X_train, y_train)
pred_my = my_clf.predict(X_test)

In [81]:
# Column 1 (encoded sex) of the first five test rows next to their predictions
X_test[:5, 1], pred_my[:5]

(array([0., 1., 0., 1., 0.]), array([1., 0., 1., 0., 1.]))

In [82]:
from sklearn.metrics import accuracy_score
# Accuracy of the rule-based classifier on the test split
accuracy_score(y_test, pred_my)

0.7877094972067039

In [74]:
# Predictions and accuracy of the tuned random forest, for comparison
pred_rf = best_rf.predict(X_test)
accuracy_score(y_test, pred_rf)

0.8100558659217877

In [83]:
# Side-by-side table of the true labels and both models' predictions
sdf = pd.DataFrame({'y_test' : y_test, 'RF':pred_rf, 'My':pred_my})
sdf.head()

Unnamed: 0,y_test,RF,My
0,1,0,1.0
1,1,0,0.0
2,1,1,1.0
3,0,0,0.0
4,0,1,1.0


- 모델의 성능을 평가할 때 무조건적으로 정확도를 사용하는 것은 지양

In [76]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score
# Confusion matrix of the random forest predictions
confusion_matrix(y_test, pred_rf)

array([[98, 12],
       [22, 47]], dtype=int64)

In [84]:
# Confusion matrix of the rule-based predictions
confusion_matrix(y_test, pred_my)

array([[95, 15],
       [23, 46]], dtype=int64)

In [85]:
# Precision and recall of the random forest
precision_score(y_test, pred_rf), recall_score(y_test, pred_rf)

(0.7966101694915254, 0.6811594202898551)

In [86]:
# Precision and recall of the rule-based classifier
precision_score(y_test, pred_my), recall_score(y_test, pred_my)

(0.7540983606557377, 0.6666666666666666)

# 평가

### 1. 손글씨 숫자(scikit-learn digits 데이터셋) - Is it seven?

In [89]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits

In [90]:
# Load scikit-learn's bundled digits dataset and check the feature matrix shape
digits = load_digits()
digits.data.shape

(1797, 64)

In [93]:
# Feature vector of the first sample (64 values)
digits.data[0]

array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
       15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
       12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
        0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
       10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.])

In [95]:
# Binary task: is the digit a 7?
X = digits.data
y = (digits.target == 7).astype(int)   # the comparison yields True / False;
                                       # astype(int) converts them to 1 / 0
X.shape, y.shape

((1797, 64), (1797,))

In [100]:
# Class counts of the binary target
np.unique(y, return_counts=True)

(array([0, 1]), array([1618,  179], dtype=int64))

In [101]:
# First five target values
y[:5]

array([0, 0, 0, 0, 0])

In [99]:
from sklearn.model_selection import train_test_split
# Split using the class labels for stratification; no test_size is given, so
# the library default is used. This rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=2021
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1347, 64), (450, 64), (1347,), (450,))

* SVM으로 학습 및 평가

In [103]:
from sklearn.svm import SVC
# Train a support vector classifier with default settings, keep its test
# predictions for the metrics below, and report its test accuracy.
svc = SVC(random_state=2021)
svc.fit(X_train, y_train)
pred_sv = svc.predict(X_test)
svc.score(X_test, y_test)

0.9977777777777778

- 엉터리 분류기로 학습/예측/평가

In [104]:
from sklearn.base import BaseEstimator

class MyClassifier(BaseEstimator):
    """Dummy baseline that always predicts class 0, ignoring the features.

    NOTE: this reuses the name MyClassifier and therefore shadows the
    rule-based class of the same name defined earlier in the notebook.
    """

    def fit(self, X, y=None):
        """Learn nothing; return self per the scikit-learn estimator convention."""
        return self

    def predict(self, X):
        """Return an all-zero float array with one entry per row of X."""
        return np.zeros(np.shape(X)[0])

In [105]:
# Run the always-zero classifier on the test split
my_clf = MyClassifier()
my_clf.fit(X_train, y_train)    # note: fit does not learn anything
pred_my = my_clf.predict(X_test)

In [106]:
from sklearn.metrics import accuracy_score
# Accuracy of the always-zero classifier
accuracy_score(y_test, pred_my)

0.9

In [107]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score
# Confusion matrix of the SVC predictions
confusion_matrix(y_test, pred_sv)

array([[404,   1],
       [  0,  45]], dtype=int64)

In [109]:
# Precision and recall of the SVC
precision_score(y_test, pred_sv), recall_score(y_test, pred_sv)

(0.9782608695652174, 1.0)

In [111]:
# Confusion matrix of the always-zero classifier
confusion_matrix(y_test, pred_my)

array([[405,   0],
       [ 45,   0]], dtype=int64)

In [110]:
# Precision and recall of the always-zero classifier. It never predicts the
# positive class, so precision has no defined value; expect a library warning.
precision_score(y_test, pred_my), recall_score(y_test, pred_my)

  _warn_prf(average, modifier, msg_start, len(result))


(0.0, 0.0)

In [112]:
from sklearn.metrics import f1_score
# F1 score of the SVC predictions
f1_score(y_test, pred_sv)

0.989010989010989