API  : https://pypi.org/project/pytorch-tabnet/<br>
document : https://dreamquark-ai.github.io/tabnet/

In [None]:
!pip install pytorch_tabnet

In [1]:
from pytorch_tabnet.tab_model import TabNetClassifier

In [2]:
import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from warnings import filterwarnings
filterwarnings('ignore')

In [6]:
# 새롭게 불러들임
X_train = pd.read_csv('./X_train.csv')
X_val = pd.read_csv('./X_test.csv')
y_train = pd.read_csv('./y_train.csv')
y_val = pd.read_csv('./y_test.csv')

X = pd.concat([X_train, X_val])
y = pd.concat([y_train, y_val])
print('shape : ', X.shape, y.shape)
print('type : ', type(X), type(y))

shape :  (283, 76) (283, 1)
type :  <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [8]:
# 원래 결측치를 -1로 채웠었는데, 이를 np.nan으로 바꿔줌
X = X.applymap(lambda x: np.nan if x == -1 else x)

In [17]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import class_weight

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

y_test_all = np.array([])
y_test_proba_all = np.empty((0, 4))
for train_idx, test_idx in tqdm(cv.split(X, y)):
    # train-test split
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]
    classes_weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)

    # imputation
    X_train_mean = X_train.mean()
    X_train = X_train.fillna(X_train_mean)
    X_test = X_test.fillna(X_train_mean)

    # over sampling <- tabnet은 데이터셋이 너무 작으면 동작하지 않음 
    # (경험적으로 1000~1500개 이상)
    # tabnet API에는 pandas DataFrame이 아닌 numpy array를 넣어야 함
    X_train_repeat = np.repeat(X_train.values, 20, axis=0)
    y_train_repeat = np.repeat(y_train.values, 20, axis=0)

    # train
    # early stop, 
    clf = TabNetClassifier(device_name = 'cuda:0', verbose=0)
    clf.weight_updater(weights=classes_weights)
    clf.fit(X_train_repeat, y_train_repeat.ravel())

    # predict
    y_proba = clf.predict_proba(X_test.values)

    y_test_all = np.concatenate([y_test_all, y_test.values.ravel()], axis=None)
    y_test_proba_all = np.concatenate([y_test_proba_all, y_proba], axis=0)

0it [00:00, ?it/s]

In [20]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
print('Precision : ',precision_score(y_test_all, y_test_proba_all.argmax(axis=1), average='weighted'))
print('Recall : ',recall_score(y_test_all, y_test_proba_all.argmax(axis=1), average='weighted'))
print('F1 score : ',f1_score(y_test_all, y_test_proba_all.argmax(axis=1), average='weighted'))
print('auc_ovr : ',roc_auc_score(y_test_all, y_test_proba_all, multi_class='ovr', average='weighted'))
print('acc : ',accuracy_score(y_test_all, y_test_proba_all.argmax(axis=1)))


Precision :  0.8042787178978943
Recall :  0.8091872791519434
F1 score :  0.8010000992333148
auc_ovr :  0.8817006727158294
acc :  0.8091872791519434
