# Tabular Transformer benchmark on Medical data


In [20]:
import yaml
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from tabpfn import TabPFNClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, balanced_accuracy_score
from sklearn.utils import shuffle
from collections import Counter

def get_dataset_info(name: str, cfg=None):
    """Return the dataset entry whose ``"name"`` field equals *name*.

    Args:
        name: Value matched against the ``"name"`` key of every entry.
        cfg: Mapping holding a ``"datasets"`` list of dict entries. When left
            as ``None`` the module-level ``config`` object is used, which
            keeps existing single-argument calls working.

    Returns:
        The first entry of ``cfg["datasets"]`` with a matching name.

    Raises:
        ValueError: If no entry carries the requested name.
    """
    datasets = (config if cfg is None else cfg)["datasets"]
    for entry in datasets:
        if entry["name"] == name:
            return entry
    raise ValueError(f"Dataset '{name}' not found.")

def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted classifier on a held-out split.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        X_test: Feature matrix of the evaluation split.
        y_test: True labels of the evaluation split.
        model_name: Label stored under the ``"model"`` key of the result.

    Returns:
        A dict with the keys ``"model"``, ``"Accuracy"``, ``"BalancedAcc"``,
        ``"F1"`` and ``"AUC"``. ``"AUC"`` is ``nan`` when ``y_test`` contains
        fewer than two distinct classes.
    """
    y_pred = model.predict(X_test)

    # Use the second probability column as the ranking score when available;
    # otherwise fall back to the hard predictions.
    # NOTE(review): assumes a binary problem whose positive class sits in
    # column 1 of predict_proba -- confirm for every model passed in.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = y_pred

    # ROC AUC is undefined when only one class is present in y_test, which can
    # happen with very small, imbalanced folds. Report NaN instead of letting
    # a single degenerate fold abort the whole evaluation run.
    if len(np.unique(y_test)) < 2:
        auc = np.nan
    else:
        auc = roc_auc_score(y_test, y_score)

    return {
        "model": model_name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "BalancedAcc": balanced_accuracy_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "AUC": auc,
    }

In [13]:
def group_classes(y_list, map_classes):
    """Relabel every element of *y_list* according to *map_classes*.

    Args:
        y_list: Iterable of class labels.
        map_classes: Dict mapping an original label to its replacement.

    Returns:
        A new list of the same length as *y_list*. Labels that appear as keys
        of *map_classes* are replaced by the mapped value; all other labels
        are kept unchanged.
    """
    # dict.get(label, label) performs the "replace if mapped, else keep" step
    # in a single lookup.
    return [map_classes.get(label, label) for label in y_list]

In [27]:
# Load the dataset registry (one entry per dataset) from the YAML file.
# NOTE: the path is absolute and machine-specific; adjust it when running
# the notebook elsewhere.
config_path = "/Users/giovannanicora/PycharmProjects/tabular-fm-medical-benchmark/data/dataset_info.yaml"
with open(config_path, "r") as config_file:
    config = yaml.safe_load(config_file)
config



{'datasets': [{'name': 'myocardial_infarction',
   'path': '/Users/giovannanicora/Documents/progetti_in_corso/tabular_fm_medical_benchmark/dataset/myocardial_infarction_complications.csv',
   'target': 'class',
   'group_class': '1,2,3,4,5,6,7',
   'notes': 'UCI Machine Learning repo dataset'},
  {'name': 'cdc_diabetes',
   'path': '/Users/giovannanicora/Documents/progetti_in_corso/tabular_fm_medical_benchmark/dataset/CDC_diabetes.csv',
   'target': 'class',
   'notes': 'UCI Machine Learning repo dataset'}]}

In [28]:
# Select the dataset to benchmark by its `name` entry in the config
# (here: the CDC diabetes dataset) and fetch its metadata.
dataset_name = "cdc_diabetes"
dataset_info = get_dataset_info(dataset_name)
dataset_info


{'name': 'cdc_diabetes',
 'path': '/Users/giovannanicora/Documents/progetti_in_corso/tabular_fm_medical_benchmark/dataset/CDC_diabetes.csv',
 'target': 'class',
 'notes': 'UCI Machine Learning repo dataset'}

In [29]:
# Load the selected dataset and split it into features (df) and labels (y).
target_col = dataset_info['target']
df = pd.read_csv(dataset_info['path'])

# Detach the target column from the feature table and keep it as a list.
y = df.pop(target_col).tolist()

# Some multiclass problems are turned into binary ones: every class code
# listed (comma-separated) under 'group_class' is mapped to label 1.
if 'group_class' in dataset_info:
    map_dict = {
        int(code.strip()): 1
        for code in dataset_info['group_class'].split(',')
    }
    y = group_classes(y, map_dict)

# Summary statistics; the last line prints the class-1 share as a fraction.
class_counts = Counter(y)
n_patients, n_features = df.shape
print("Number of patients:", n_patients)
print("Number of features:", n_features)
print("Class #", class_counts)
print("Class 1 %, ", class_counts[1]/len(y))

Number of patients: 253680
Number of features: 22
Class # Counter({0: 218334, 1: 35346})
Class 1 %,  0.13933301797540207


In [30]:
# Sanity check: there must be exactly one label per row of the feature table.
len(y) == df.shape[0]

True

## Preprocessing

## 5-fold cross-validation

In [31]:
# Baseline models and their hyper-parameter grids. Grid keys carry the
# "clf__" prefix because each estimator is the Pipeline step named "clf".
rf = RandomForestClassifier(random_state=42, class_weight="balanced")
rf_grid = {
    "clf__n_estimators": [100, 300],
    "clf__max_depth": [None, 5, 10],
    "clf__min_samples_leaf": [1, 3, 5],
}

lasso = LogisticRegression(
    solver="saga",
    penalty="l1",
    max_iter=10000,
    random_state=42,
    class_weight="balanced"
)

lasso_grid = {
    "clf__C": np.logspace(-3, 1, 5),  # inverse of regularization strength
}

# One dict of metrics is appended per (outer fold, model) pair.
results = []

# Outer CV estimates performance; the 3-fold inner CV is used by the grid
# searches to pick hyper-parameters. Both are seeded, so the inner splitter
# can be built once and shared by every fold and every search.
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)


np.random.seed(1)

# Draw a random subset of n_context patients (without replacement); the whole
# experiment below runs on this subset only.
# NOTE: n_context is currently fixed -- evaluating at increasing sample sizes
# is not implemented in this cell.
n_context = 50
i_context = np.random.choice(df.shape[0], size=n_context, replace=False)

df_subset = df.iloc[i_context, :]
y_subset = np.array(y)[i_context]

for i, (train_index, test_index) in enumerate(kf.split(df_subset, y_subset)):

    # y_subset is already a NumPy array, so it can be indexed directly.
    X_train = df_subset.iloc[train_index, :]
    y_train = y_subset[train_index]

    X_test = df_subset.iloc[test_index, :]
    y_test = y_subset[test_index]

    # Preprocessing: impute missing values with the most frequent value.
    # The imputer is fitted on the training fold only and then applied to
    # the test fold.
    imp_mode = SimpleImputer(strategy="most_frequent")
    X_train = imp_mode.fit_transform(X_train)
    X_test = imp_mode.transform(X_test)

    print("#######")
    print(i)
    print("Train size:", X_train.shape[0], "- Test size:", X_test.shape[0])

    # ---- Random Forest ----
    rf_pipe = Pipeline([("clf", rf)])
    rf_search = GridSearchCV(
        rf_pipe, rf_grid, cv=inner_cv,
        scoring="balanced_accuracy", n_jobs=-1
    )
    rf_search.fit(X_train, y_train)
    results.append(evaluate_model(rf_search.best_estimator_, X_test, y_test, "RandomForest"))

    # ---- LASSO Logistic Regression ----
    # Scaling is a pipeline step, so it is re-fitted inside every inner fold.
    lasso_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", lasso)
    ])
    lasso_search = GridSearchCV(
        lasso_pipe, lasso_grid, cv=inner_cv,
        scoring="balanced_accuracy", n_jobs=-1
    )
    lasso_search.fit(X_train, y_train)
    results.append(evaluate_model(lasso_search.best_estimator_, X_test, y_test, "LASSO_Logistic"))

    # ---- TabPFN (default settings, no tuning) ----
    tabpfn_class = TabPFNClassifier()
    tabpfn_class.fit(X_train, y_train)
    results.append(evaluate_model(tabpfn_class, X_test, y_test, "TabPFN_raw"))

    # TODO: 4 - TabPFN with additional preprocessing (not implemented yet)


#######
0
Train size: 40 - Test size: 10
#######
1
Train size: 40 - Test size: 10
#######
2
Train size: 40 - Test size: 10
#######
3
Train size: 40 - Test size: 10
#######
4
Train size: 40 - Test size: 10


In [32]:
# Per-fold metrics: one dict per (fold, model) pair, in evaluation order.
results

[{'model': 'RandomForest',
  'Accuracy': 0.8,
  'BalancedAcc': 0.8888888888888888,
  'F1': 0.5,
  'AUC': 1.0},
 {'model': 'LASSO_Logistic',
  'Accuracy': 0.7,
  'BalancedAcc': 0.3888888888888889,
  'F1': 0.0,
  'AUC': 0.6666666666666667},
 {'model': 'TabPFN_raw',
  'Accuracy': 0.6,
  'BalancedAcc': 0.7777777777777778,
  'F1': 0.3333333333333333,
  'AUC': 0.6666666666666667},
 {'model': 'RandomForest',
  'Accuracy': 0.9,
  'BalancedAcc': 0.75,
  'F1': 0.6666666666666666,
  'AUC': 1.0},
 {'model': 'LASSO_Logistic',
  'Accuracy': 0.9,
  'BalancedAcc': 0.75,
  'F1': 0.6666666666666666,
  'AUC': 0.9375},
 {'model': 'TabPFN_raw',
  'Accuracy': 0.5,
  'BalancedAcc': 0.5,
  'F1': 0.2857142857142857,
  'AUC': 0.25},
 {'model': 'RandomForest',
  'Accuracy': 0.8,
  'BalancedAcc': 0.6875,
  'F1': 0.5,
  'AUC': 0.5625},
 {'model': 'LASSO_Logistic',
  'Accuracy': 0.8,
  'BalancedAcc': 0.6875,
  'F1': 0.5,
  'AUC': 0.625},
 {'model': 'TabPFN_raw',
  'Accuracy': 0.7,
  'BalancedAcc': 0.8125,
  'F1': 0

In [33]:
# Aggregate the per-fold metrics: mean and standard deviation of every metric
# for each model, rounded to three decimals.
results_df = pd.DataFrame(results)
summary = (
    results_df
    .groupby("model")
    .agg(["mean", "std"])
    .round(3)
)
print(summary)

               Accuracy        BalancedAcc            F1           AUC       
                   mean    std        mean    std   mean    std   mean    std
model                                                                        
LASSO_Logistic     0.74  0.114       0.565  0.173  0.313  0.301  0.658  0.180
RandomForest       0.74  0.152       0.615  0.235  0.333  0.312  0.750  0.276
TabPFN_raw         0.52  0.148       0.556  0.253  0.295  0.203  0.408  0.307
