In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
X = pd.read_csv("../data/processed/X_all.csv")
y = pd.read_csv("../data/processed/y_all.csv")["TARGET"]
print(f"Shape: X = {X.shape}, y = {y.shape}")


Shape: X = (307507, 251), y = (307507,)


In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")


Train size: (230630, 251), Test size: (76877, 251)


In [4]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Encode target if needed
if y_train.dtype == 'object' or y_train.dtype.name == 'category':
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

# Convert to NumPy
X_train_np = X_train.values
X_test_np = X_test.values
y_train_np = y_train
y_test_np = y_test


In [5]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score
import torch

tabnet = TabNetClassifier(
    
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":4, "gamma":0.4},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='entmax'  # 'sparsemax' also works
)

tabnet.fit(
    X_train=X_train_np, y_train=y_train_np,
    eval_set=[(X_test_np, y_test_np)],
    eval_name=['valid'],
    eval_metric=['auc'],
    max_epochs=10,
    patience=20,
    batch_size=1024, virtual_batch_size=128,
    num_workers=1,
    drop_last=False
)




epoch 0  | loss: 0.29333 | valid_auc: 0.70966 |  0:00:32s
epoch 1  | loss: 0.25829 | valid_auc: 0.73103 |  0:01:03s
epoch 2  | loss: 0.25519 | valid_auc: 0.73821 |  0:01:34s
epoch 3  | loss: 0.25348 | valid_auc: 0.74273 |  0:02:06s
epoch 4  | loss: 0.25151 | valid_auc: 0.74499 |  0:02:38s
epoch 5  | loss: 0.25083 | valid_auc: 0.74797 |  0:03:11s
epoch 6  | loss: 0.24983 | valid_auc: 0.74966 |  0:03:45s
epoch 7  | loss: 0.24931 | valid_auc: 0.74994 |  0:04:20s
epoch 8  | loss: 0.24828 | valid_auc: 0.75193 |  0:04:59s
epoch 9  | loss: 0.24799 | valid_auc: 0.75195 |  0:05:38s
Stop training because you reached max_epochs = 10 with best_epoch = 9 and best_valid_auc = 0.75195




TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found object

In [None]:
# Predict probabilities
preds = tabnet.predict_proba(X_test_np)[:, 1]
auc = roc_auc_score(y_test_np, preds)
print(f"TabNet AUC: {auc:.4f}")
