In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, RocCurveDisplay
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import BayesSearchCV
from imblearn.pipeline import Pipeline

In [4]:
df_clean = pd.read_csv('cleaned_data.csv')

In [5]:
df_clean.drop(columns=['Unnamed: 0', 'year'], inplace=True)

In [6]:
df_clean['ADDEPEV3'] = df_clean['ADDEPEV3'].replace(2.0, 0.0)

In [7]:
X = df_clean.drop('ADDEPEV3', axis=1)
y = df_clean['ADDEPEV3']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# pre-trained 

In [9]:
from pytorch_tabnet.tab_model import TabNetClassifier

In [25]:
X_train_np = X_train.values.astype(np.float32)
X_test_np = X_test.values.astype(np.float32)


cat_idxs = []
cat_dims = []

for i, col in enumerate(X_train.columns):
    n_unique = X_train[col].nunique() - 1
    
    # Threshold bisa kamu sesuaikan, misalnya <= 10 dianggap kategorikal
    if n_unique <= 10:  
        cat_idxs.append(i)
        cat_dims.append(n_unique)
        
print("Categorical indices:", cat_idxs)
print("Categorical dimensions:", cat_dims)


Categorical indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
Categorical dimensions: [1, 4, 3, 2, 5, 5, 1, 3, 4, 4, 1, 6, 1, 3, 1, 1, 6, 1, 1, 4, 3, 1, 5, 5, 1, 3]


In [27]:
cat_dims = []

for i in cat_idxs:
    col = X_train.columns[i]
    max_val = X_train[col].max()
    cat_dims.append(int(max_val) + 1)

In [36]:
clf_model = TabNetClassifier(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=3,
    n_d=32, n_a=32, n_steps=5,
    optimizer_params=dict(lr=2e-2),
    mask_type='sparsemax'
)

clf_model.fit(
    X_train=X_train_np, y_train=y_train.values,
    eval_set=[(X_test_np, y_test.values)],
    eval_name=["val"],
    eval_metric=["auc", "accuracy"],
    max_epochs=50,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)

preds = clf_model.predict(X_test_np)
print("Accuracy:", accuracy_score(y_test, preds))




epoch 0  | loss: 0.52681 | val_auc: 0.74495 | val_accuracy: 0.76371 |  0:00:39s
epoch 1  | loss: 0.47511 | val_auc: 0.77743 | val_accuracy: 0.78173 |  0:01:11s
epoch 2  | loss: 0.4651  | val_auc: 0.78674 | val_accuracy: 0.78554 |  0:01:45s
epoch 3  | loss: 0.46247 | val_auc: 0.78719 | val_accuracy: 0.78803 |  0:02:18s
epoch 4  | loss: 0.45711 | val_auc: 0.7948  | val_accuracy: 0.79241 |  0:02:50s
epoch 5  | loss: 0.45477 | val_auc: 0.79598 | val_accuracy: 0.79293 |  0:03:26s
epoch 6  | loss: 0.45265 | val_auc: 0.79771 | val_accuracy: 0.79388 |  0:03:54s
epoch 7  | loss: 0.45061 | val_auc: 0.79931 | val_accuracy: 0.79405 |  0:04:23s
epoch 8  | loss: 0.44788 | val_auc: 0.80077 | val_accuracy: 0.79601 |  0:05:07s
epoch 9  | loss: 0.44469 | val_auc: 0.8073  | val_accuracy: 0.7967  |  0:06:22s
epoch 10 | loss: 0.44042 | val_auc: 0.81018 | val_accuracy: 0.79826 |  0:07:37s
epoch 11 | loss: 0.43852 | val_auc: 0.81181 | val_accuracy: 0.79931 |  0:08:51s
epoch 12 | loss: 0.43945 | val_auc: 0.81



Accuracy: 0.8028507063604017


In [None]:
explain_matrix, masks = clf_model.explain(X_test_np)
print("Feature importance shape:", explain_matrix.shape)
print("Feature importance for first 5 samples:")

plt.figure(figsize=(10, 6))
sns.barplot(x=np.mean(explain_matrix, axis=0), y=X_train.columns)
plt.title("Feature Importance")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()

Feature importance shape: (47497, 26)
Feature importance for first 5 samples:


In [34]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         0.0       0.83      0.93      0.88     35593
         1.0       0.67      0.42      0.52     11904

    accuracy                           0.80     47497
   macro avg       0.75      0.68      0.70     47497
weighted avg       0.79      0.80      0.79     47497

