In [1]:
import numpy as np
from sklearn.base import BaseEstimator

class MyDummyClassifier(BaseEstimator):
    """Rule-based baseline that predicts from the 'Sex' column alone.

    Nothing is learned from the training data: rows whose 'Sex' value equals
    1 are predicted as 0, every other row is predicted as 1.
    (Presumably 1 encodes male passengers after preprocessing -- confirm
    against the feature-encoding step.)
    """

    def fit(self, X, y=None):
        """Do nothing; this classifier has no parameters to learn.

        Parameters
        ----------
        X, y : ignored
            Accepted only for API compatibility.

        Returns
        -------
        self : MyDummyClassifier
            Returned (instead of None) so calls can be chained, following the
            scikit-learn estimator convention.
        """
        return self

    def predict(self, X):
        """Predict 0 where ``X['Sex'] == 1`` and 1 everywhere else.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a 'Sex' column.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(X.shape[0], 1)`` holding 0.0 or 1.0.
        """
        # Vectorized comparison replaces the per-row .iloc loop.
        sex_is_one = (X['Sex'] == 1).to_numpy()
        # reshape keeps the original column-vector output shape (n_samples, 1).
        return np.where(sex_is_one, 0.0, 1.0).reshape(-1, 1)
    

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from handling_titanic import transform_features

# Load the Titanic training data and split it into features and the
# 'Survived' target column.
titanic_df = pd.read_csv('../data/titanic/train.csv')
X = titanic_df.drop('Survived', axis=1)
y = titanic_df['Survived']

# Preprocess the features with the project helper from handling_titanic.
# NOTE(review): MyDummyClassifier.predict compares X['Sex'] with 1, so this
# step presumably encodes 'Sex' numerically -- confirm in handling_titanic.
X = transform_features(X)

# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# fit() learns nothing; it is called only to follow the estimator workflow.
myclf = MyDummyClassifier()
myclf.fit(X_train, y_train)

# Accuracy of the sex-only rule on the held-out rows, rounded to 4 decimals.
pred_my = myclf.predict(X_test)
round(accuracy_score(y_test, pred_my), 4)

0.7877

- 성별만을 고려한 분류기가 78.77% 정확도를 보임

In [5]:
from sklearn.datasets import load_digits

# create dummy classifier
class MyFakeClassifier(BaseEstimator):
    """Baseline that ignores its input and predicts False for every sample."""

    def fit(self, X, y=None):
        """Do nothing; this classifier has no parameters to learn.

        Parameters
        ----------
        X, y : ignored
            Accepted only for API compatibility.

        Returns
        -------
        self : MyFakeClassifier
            Returned (instead of None) so calls can be chained, following the
            scikit-learn estimator convention.
        """
        return self

    def predict(self, X):
        """Return an all-False prediction for each row of X.

        Parameters
        ----------
        X : array-like exposing ``.shape``
            Only ``X.shape[0]`` (the number of samples) is used.

        Returns
        -------
        numpy.ndarray
            Boolean array of shape ``(X.shape[0], 1)`` filled with False.
        """
        return np.zeros((X.shape[0], 1), dtype=bool)
    
# Load scikit-learn's bundled digits dataset.
digits = load_digits()

# Collapse the multi-class target into a binary one: 1 where the digit is 7,
# 0 otherwise.
# NOTE: from here on X, y and the split variables below no longer hold the
# Titanic data assigned in the earlier cell.
y = (digits['target'] == 7).astype(int)
X = digits['data']

# No test_size is given, so train_test_split uses its default split ratio;
# the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

In [6]:
# fit() learns nothing; it is called only to follow the estimator workflow.
fake_clf = MyFakeClassifier()
fake_clf.fit(X_train, y_train)

# Every prediction is False, regardless of the input features.
pred_fake = fake_clf.predict(X_test)

# Accuracy here equals the share of y_test samples labelled 0 (not a 7),
# because only those samples match the constant prediction.
accuracy_score(y_test, pred_fake)

0.9

- 사이킷런의 손글씨 숫자(digits) 데이터를 '7인가 아닌가'를 분류하는 이진 분류 문제로 바꾸면, 예측 결과를 모두 0(7이 아님)으로 내놓는 분류기도 90% 정확도를 보임 (테스트 샘플 450개 중 405개가 7이 아니기 때문)

- 정확도를 평가 지표로 활용하는 것이 성능을 정확히 반영하지 못할 가능성을 고려해야 함

In [7]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true labels (y_test) against the constant predictions
# to see how the accuracy above breaks down by class.
confusion_matrix(y_test, pred_fake)

array([[405,   0],
       [ 45,   0]], dtype=int64)