In [None]:
# Installing TabPFN
!pip install tabpfn 

In [None]:
!pip install tabpfn-extensions

## TabPFN

Machine learning without training a model yourself. For modeling we will use TabPFN, a pre-trained model for tabular data. We will use its official implementation, the `tabpfn` package, with the weights provided by the authors of the original paper. In this section we will:
* use TabPFN as a regressor and classifier,
* show basic explanations using the interpretability tools from the `tabpfn-extensions` package,
* scale the model beyond 10,000 observations, which is the limit of the original implementation.

## Basic usage

In [None]:
import numpy as np

from sklearn.datasets import load_breast_cancer, fetch_openml
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier, TabPFNRegressor
from tabpfn_extensions import interpretability
from tabpfn_extensions.rf_pfn import (
    RandomForestTabPFNClassifier,
    RandomForestTabPFNRegressor,
)
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the breast-cancer data as a feature matrix and a target vector
X, y = load_breast_cancer(return_X_y=True)

# Hold out half of the rows for evaluation; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=1,
)

# TabPFNClassifier exposes a scikit-learn compatible interface: construct, then fit
clf = TabPFNClassifier()
clf.fit(X_train, y_train)

# Evaluate on the held-out rows, scoring with the probabilities in column 1
prediction_probabilities = clf.predict_proba(X_test)
positive_class_scores = prediction_probabilities[:, 1]
print("ROC AUC:", roc_auc_score(y_true=y_test, y_score=positive_class_scores))

In [None]:
# Download the regression dataset (OpenML data_id=531) as pandas objects
df = fetch_openml(data_id=531, as_frame=True)
X = df.data
y = df.target.astype(float)  # Ensure target is float for regression

# Train-test split: 70% of the rows are held out for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=1)

# Create and fit the model
regressor = TabPFNRegressor()
regressor.fit(X_train, y_train)

# Evaluate on the held-out rows
predictions = regressor.predict(X_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print("Mean Squared Error (MSE):", mse)
# "\u00b2" is the superscript-two character; the escape keeps the label intact
# even if the notebook file is re-encoded (the label was previously mojibake).
print("R\u00b2 Score:", r2)

## Interpretability

In [None]:
# Load the breast-cancer dataset; the full bunch is kept so that the feature
# names are available for labelling the explanation
data = load_breast_cancer()
X, y = data.data, data.target
feature_names = data.feature_names

# Hold out half of the rows (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=1,
)

# Fit the classifier that will be explained
clf = TabPFNClassifier()
clf.fit(X_train, y_train)

# Compute SHAP values for the first `n_samples` rows of the test set only
n_samples = 1
rows_to_explain = X_test[:n_samples]

shap_values = interpretability.shap.get_shap_values(
    estimator=clf,
    test_x=rows_to_explain,
    attribute_names=feature_names,
    algorithm="permutation",
)

# Plot the computed SHAP values
fig = interpretability.shap.plot_shap(shap_values)

## Random Forest TabPFN: an approach to scaling TabPFN to larger datasets

In [None]:
# Fetch version 1 of the "electricity" dataset from OpenML as pandas objects
df = fetch_openml(name="electricity", version=1, as_frame=True)
X = df.data

# Encode the target labels as integers; `le` is kept so the mapping can be inspected
le = LabelEncoder()
y = le.fit_transform(df.target)

# Hold out 33% of the rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [None]:
X_train.shape # (rows, columns) of the training split; the row count is expected to exceed 10_000

In [None]:
# Fitting on a dataset that is too big leads to an error unless
# `ignore_pretraining_limits=True` is passed. The expected error is caught and
# its message displayed, so "Restart & Run All" can continue past this
# demonstration instead of stopping here.
# NOTE(review): the size check is assumed to raise ValueError -- confirm against
# the installed tabpfn version.
clf = TabPFNClassifier()
try:
    clf.fit(X_train, y_train)
except ValueError as err:
    print(f"TabPFN refused to fit on {X_train.shape[0]} rows: {err}")
else:
    print("No error was raised: the installed TabPFN version accepts this many rows.")

In [None]:
# Baseline for large data: plain subsampling of the training rows (suboptimal).
# The pre-training limit check is switched off and sampling is configured
# through `inference_config`.
clf = TabPFNClassifier(
    ignore_pretraining_limits=True,
    inference_config={"SUBSAMPLE_SAMPLES": 10000},
    random_state=1,
)
clf.fit(X_train, y_train)

# Evaluate: the predicted class is the column with the highest probability
prediction_probabilities = clf.predict_proba(X_test)
predictions = prediction_probabilities.argmax(axis=1)
roc_auc = roc_auc_score(y_test, prediction_probabilities[:, 1])
accuracy = accuracy_score(y_test, predictions)
print(f"ROC AUC: {roc_auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")

In [None]:
# Random Forest TabPFN: a random forest that uses TabPFN in each leaf.
# First build the TabPFN estimator that the forest will use.
base = TabPFNClassifier(
    ignore_pretraining_limits=True,
    inference_config=dict(SUBSAMPLE_SAMPLES=10000),
    random_state=1,
)

# Wrap the base estimator in the Random Forest model and fit it
tabpfn_tree = RandomForestTabPFNClassifier(tabpfn=base, verbose=1, random_state=1)
tabpfn_tree.fit(X_train, y_train)

# Evaluate: the predicted class is the column with the highest probability
prediction_probabilities = tabpfn_tree.predict_proba(X_test)
predictions = np.argmax(prediction_probabilities, axis=1)
roc_auc = roc_auc_score(y_test, prediction_probabilities[:, 1])
accuracy = accuracy_score(y_test, predictions)
print(f"ROC AUC: {roc_auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")