## 간단한 다중 분류 예시

In [4]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [16]:
# The detailed data is not public.
# Four-class classification task.
import pandas as pd

X = pd.read_csv('X_train.csv')

# The label file is read as a DataFrame; its single column is named 'label'.
y = pd.read_csv('y_train.csv')
y.columns = ['label']
label_counts = y['label'].value_counts()
print(label_counts)

# Base classifier reused by the cross-validation cells that follow.
clf = DecisionTreeClassifier(random_state=0)

label
3    122
1     27
2     22
0     18
Name: count, dtype: int64


In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 10-fold stratified, shuffled split with a fixed seed; `cv` is reused below.
cv = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
# Out-of-fold hard-label predictions for every sample.
cross_val_pred = cross_val_predict(clf, X, y, cv=cv)

# The first three metrics share the same call shape (weighted averaging),
# so they are evaluated and printed in order through one loop.
weighted_metrics = (
    ('Precision : ', precision_score),
    ('Recall : ', recall_score),
    ('F1 score : ', f1_score),
)
for caption, metric in weighted_metrics:
    print(caption, metric(y, cross_val_pred, average='weighted'))
print('acc : ', accuracy_score(y, cross_val_pred))

Precision :  0.8000222881175263
Recall :  0.7936507936507936
F1 score :  0.7960933884164737
acc :  0.7936507936507936


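위 지표들은 문제 없이 출력되는 것을 확인할 수 있다. 이제 AUROC 계산을 위해 `method='predict_proba'`로 클래스별 확률을 얻으려 하지만, IndexError가 발생하여 출력되지 않는다. 이는 y가 열이 하나인 DataFrame(2차원)으로 전달되어 cross_val_predict가 클래스 개수를 y의 열 개수(1)로 잡기 때문으로 보인다.<br>
(사실 cross_val_score에 scoring='roc_auc_ovr'을 지정하면 AUROC를 바로 계산할 수 있겠지만, 여기서는 확률값 추출에 집중하기로 함)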

In [20]:
from sklearn.metrics import roc_auc_score
# NOTE(review): y is a single-column DataFrame here (2-D). The IndexError
# caught below appears to come from cross_val_predict taking the number of
# classes from y.shape[1] when method='predict_proba' is used with a 2-D y;
# passing 1-D labels (e.g. y['label']) is expected to avoid it -- TODO confirm.
try:
    cross_val_pred = cross_val_predict(clf, X, y, cv=cv, method='predict_proba')
    auc_ovr = roc_auc_score(y, cross_val_pred, multi_class='ovr')
    print('AUC : ', auc_ovr)
except IndexError as err:
    # Report the failure instead of aborting the notebook run.
    print('Error', err, sep='\n')

Error
index 1 is out of bounds for axis 1 with size 1




`predict`가 `predict_proba`의 결과를 반환하도록 감싼, 다음과 같은 custom classifier를 만들어 해결

In [25]:
from sklearn.base import BaseEstimator, ClassifierMixin

class temp(BaseEstimator, ClassifierMixin):
    """Wrap a classifier so that ``predict`` returns class probabilities.

    ``cross_val_predict`` calls ``predict`` unless told otherwise, so wrapping
    an estimator in this class makes it collect the wrapped estimator's
    ``predict_proba`` output for each fold instead of hard labels.

    Parameters
    ----------
    clf : estimator
        Object providing ``fit(X, y)`` and ``predict_proba(X)``.

    Notes
    -----
    Because ``predict`` does not return labels, helpers that expect labels
    from it (for example ``ClassifierMixin.score``) are not meaningful on
    this wrapper.
    """

    def __init__(self, clf):
        self.clf = clf

    def fit(self, X, y):
        """Fit the wrapped classifier on ``(X, y)`` and return this wrapper.

        Returning ``self`` rather than the inner estimator follows the
        scikit-learn estimator contract, so a chained
        ``temp(clf).fit(X, y).predict(X)`` goes through this wrapper's
        ``predict`` instead of the inner classifier's.
        """
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        """Return ``self.clf.predict_proba(X)`` (probabilities, not labels)."""
        return self.clf.predict_proba(X)
# Out-of-fold predictions obtained through the wrapper's predict(), i.e. the
# wrapped classifier's predict_proba output.
cross_val_proba = cross_val_predict(temp(clf), X, y, cv=cv)
# Column index of the largest value in each row, used as the hard prediction.
hard_pred = cross_val_proba.argmax(axis=1)

for caption, metric in (
    ('Precision : ', precision_score),
    ('Recall : ', recall_score),
    ('F1 score : ', f1_score),
):
    print(caption, metric(y, hard_pred, average='weighted'))
# AUC is the only metric computed from the probabilities themselves.
print('AUC : ', roc_auc_score(y, cross_val_proba, multi_class='ovr'))
print('acc : ', accuracy_score(y, hard_pred))

Precision :  0.8000222881175263
Recall :  0.7936507936507936
F1 score :  0.7960933884164737
AUC :  0.8262029888362413
acc :  0.7936507936507936


AUC 외 나머지 metric들은 위에서 계산한 것과 동일함을 확인할 수 있음

조금 더 복잡한 pipeline에도 같은 방법을 적용할 수 있음

In [27]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import Pipeline, make_pipeline
import xgboost as xgb

# Resampling step: SMOTETomek whose Tomek-links cleaner uses
# sampling_strategy='majority'; seeded for repeatability.
resampler = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'), random_state=0)
# NOTE(review): tree_method='gpu_hist' / gpu_id are reported as deprecated in
# XGBoost 2.x in favour of device='cuda' -- confirm against the installed version.
booster = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=0)
SMOTETomek_pipeline = make_pipeline(resampler, booster)

# Same wrapper trick as before: predict() yields the pipeline's predict_proba.
cross_val_proba = cross_val_predict(temp(SMOTETomek_pipeline), X, y, cv=cv)
# Column index of the largest value in each row, used as the hard prediction.
hard_pred = cross_val_proba.argmax(axis=1)

for caption, metric in (
    ('Precision : ', precision_score),
    ('Recall : ', recall_score),
    ('F1 score : ', f1_score),
):
    print(caption, metric(y, hard_pred, average='weighted'))
print('AUC : ', roc_auc_score(y, cross_val_proba, multi_class='ovr'))
print('acc : ', accuracy_score(y, hard_pred))

Precision :  0.8631411347675263
Recall :  0.8624338624338624
F1 score :  0.8603823285730638
AUC :  0.9548796202481853
acc :  0.8624338624338624
