In [3]:
import os

# Setup Imports
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    root_mean_squared_error,
    r2_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.inspection import DecisionBoundaryDisplay

from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder
from IPython.display import display, Markdown, Latex

# Baseline Imports
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

import torch

from tabpfn import TabPFNClassifier, TabPFNRegressor
from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNClassifier, AutoTabPFNRegressor

if not torch.mps.is_available():
    raise SystemError('GPU device not found. For fast training, please enable GPU. See section above for instructions.')

In [4]:
df = pd.read_csv(r'/Users/jannispoltier/Documents/Studium/3. Semester/tabpfn_credit_codebase/data/pd/02 taiwan creditcard/taiwan_creditcard.csv')

In [5]:
def _preprocess_02_taiwan_creditcard(_data):

    # Drop ID and useless columns
    _data = _data.drop('ID', axis=1)

    # Transform
    _data['SEX'] = _data['SEX'].replace({'2': 1, '1': 0})

    # Split into covariates, labels
    y = _data['default.payment.next.month'].values.astype(int)
    x = _data.drop('default.payment.next.month', axis=1).values

    cols = list(_data.drop('default.payment.next.month', axis=1).columns)

    cols_cat = []
    cols_num = cols

    cols_cat_idx = [cols.index(col) for col in cols_cat if col in cols]
    cols_num_idx = [cols.index(col) for col in cols_num if col in cols]

    print("02_taiwan_creditcard preprocessed")
    print("x shape: ", x.shape)
    print("y shape: ", y.shape)

    return x, y, cols, cols_cat, cols_num, cols_cat_idx, cols_num_idx

X, y, cols, cols_cat, cols_num, cols_cat_idx, cols_num_idx = _preprocess_02_taiwan_creditcard(df)

02_taiwan_creditcard preprocessed
x shape:  (30000, 23)
y shape:  (30000,)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
from tabpfn_extensions.rf_pfn import (
    RandomForestTabPFNClassifier,
    RandomForestTabPFNRegressor,
)

from tabpfn_extensions import TabPFNClassifier, TabPFNRegressor

# ----------------------------
# Classification - Strategy 1: Random Forest Preprocessing
# ----------------------------

clf_base = TabPFNClassifier(
    ignore_pretraining_limits=True,
    inference_config = {"SUBSAMPLE_SAMPLES": 10000} # Needs to be set low so that not OOM on fitting intermediate nodes
)

tabpfn_tree_clf = RandomForestTabPFNClassifier(
    tabpfn=clf_base,
    verbose=1,
    max_predict_time=60, # Will fit for one minute
    fit_nodes=True, # Wheather or not to fit intermediate nodes
    adaptive_tree=True, # Whather or not to validate if adding a leaf helps or not
  )

In [8]:
# ----------------------------
# Classification - Strategy 2: Subsampled Ensemble using TabPFNClassifier
# ----------------------------
print("\n--- Classification: Strategy 1 (Subsampled Ensemble) ---")
tabpfn_subsample_clf = TabPFNClassifier(
    ignore_pretraining_limits=True,  # (bool) Allows the use of datasets larger than pretraining limits.
    n_estimators=32,                 # (int) Number of estimators for ensembling; improves accuracy with higher values.
    inference_config={
        "SUBSAMPLE_SAMPLES": 10000  # (int) Maximum number of samples per inference step to manage memory usage.
    },
)


--- Classification: Strategy 1 (Subsampled Ensemble) ---


In [9]:
# Compare different machine learning models by training each one multiple times
# on different parts of the data and averaging their performance scores for a
# more reliable performance estimate

assert len(np.unique(y)) <= 10 # Is classification?

# Define models
models_class = [
    ('TabPFN RF', tabpfn_tree_clf),
    ('TabPFN Subsample', tabpfn_subsample_clf),
    ('XGBoost', XGBClassifier()),
    ('CatBoost', CatBoostClassifier(random_state=42, verbose=0)),
    ('RandomForest', RandomForestClassifier(random_state=42)),
]

# Calculate scores
cv = KFold(random_state=42, n_splits=3, shuffle=True)
scoring = 'roc_auc_ovr' if len(np.unique(y)) > 2 else 'roc_auc'
scores_raw_class = {name: cross_val_score(model, X, y, cv=cv, scoring=scoring, verbose=1)
          for name, model in models_class}
scores_class = {name: scores_raw_class[name].mean()
          for name, model in models_class}

  X, y, feature_names_in, n_features_in = validate_Xy_fit(


KeyboardInterrupt: 

In [11]:
scoring

'roc_auc'

In [10]:
scores_raw_class

NameError: name 'scores_raw_class' is not defined

In [None]:
# Plot results
df = pd.DataFrame(list(scores_class.items()), columns=['Model', 'ROC AUC'])
colors = ['tab:blue' if 'RF' in name else ('tab:red' if 'sample' in name else 'tab:gray') for (name, _) in models_class]
ax = df.plot(x='Model', y='ROC AUC', kind='bar', figsize=(10, 6), color=colors)
ax.set_ylim(df['ROC AUC'].min() * 0.995, min(1.0, df['ROC AUC'].max() * 1.005))
ax.set_title('Model Comparison - 5-fold Cross-validation')