In [11]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from tabpfn import TabPFNClassifier

In [12]:
# Fixed random seed; it is passed as `random_state` when the classifier is created.
seed = 1_314_537

In [13]:
# Read the training table from disk; the path is relative to the working directory.
train_df = pd.read_csv(filepath_or_buffer='data_train.csv')

In [14]:
# Positional split of the training table: column 1 holds the target and
# columns 2 onward hold the features (column 0 is skipped; presumably the ID).
y_train = train_df.iloc[:, 1]
X_train = train_df.iloc[:, 2:]

In [15]:
# Build a seeded TabPFN classifier and fit it on the training features and labels.
# The fit call is left as the cell's last expression so its return value is still displayed.
clf = TabPFNClassifier(random_state=seed)
clf.fit(X=X_train, y=y_train)




In [16]:
# Class probabilities for the training rows; only column index 1 is kept
# (presumably the positive class -- confirm against clf.classes_).
proba_train = clf.predict_proba(X_train)
pro_train = proba_train[:, 1]



In [17]:
# Score the model on the same rows it was fitted on.
# NOTE(review): these are in-sample numbers, so they do not measure generalisation.
pred_train = clf.predict(X_train)
accuracy_train = accuracy_score(y_train, pred_train)
roc_auc_train = roc_auc_score(y_train, pro_train)

print(f'Training Set - AUC: {roc_auc_train:.3f}, Accuracy: {accuracy_train:.3f}')



Training Set - AUC: 1.000, Accuracy: 1.000


In [18]:
# Persist per-row training predictions: the ID column, the true label and the
# column-1 class probability, written to CSV without the DataFrame index.
train_columns = {
    'ID': train_df['ID'],
    'True': y_train,
    'Pre': pro_train,
}
df_train = pd.DataFrame(train_columns)
df_train.to_csv('TabPFN_train.csv', index=False)

In [19]:
# Test files to score with the already-fitted classifier.
test_files = [
    'data_test1.csv', 'data_test2.csv', 'data_test3.csv', 'data_test4.csv',
    'data_test5.csv', 'data_test6.csv', 'data_test7.csv', 'data_test8.csv',
    'data_test9.csv', 'data_test10.csv', 'data_test11.csv', 'data_test12.csv',
]

# NOTE(review): the recorded run shows data_test1.csv at AUC 0.248 / accuracy 0.295,
# i.e. worse than chance, while every other file scores ~1.0. That pattern suggests the
# label coding or column layout of that file differs from the training table -- verify
# before trusting or reporting the number.
for test_file in test_files:
    test_df = pd.read_csv(test_file)

    # Same positional layout as the training table is assumed:
    # column 1 is the label and columns 2 onward are the features.
    X_test = test_df.iloc[:, 2:]
    y_test = test_df.iloc[:, 1]

    # Probability of the class in column index 1 for every test row.
    pro_test = clf.predict_proba(X_test)[:, 1]

    # AUC is computed from the probabilities, accuracy from the hard predictions.
    roc_auc_test = roc_auc_score(y_test, pro_test)
    accuracy_test = accuracy_score(y_test, clf.predict(X_test))

    print(f'{test_file} - AUC: {roc_auc_test:.3f}, Accuracy: {accuracy_test:.3f}')

    # Per-row predictions for this test file.
    df_test = pd.DataFrame({
        'ID': test_df['ID'],
        'True': y_test,
        'Pre': pro_test
    })

    # Path.stem drops any directory part and only the final extension, so the output
    # name is derived correctly on every OS and for file names that contain dots.
    df_test.to_csv(f'TabPFN_{Path(test_file).stem}_predictions.csv', index=False)



data_test1.csv - AUC: 0.248, Accuracy: 0.295




data_test2.csv - AUC: 1.000, Accuracy: 1.000




data_test3.csv - AUC: 1.000, Accuracy: 0.850




data_test4.csv - AUC: 1.000, Accuracy: 1.000




data_test5.csv - AUC: 1.000, Accuracy: 1.000




data_test6.csv - AUC: 1.000, Accuracy: 1.000




data_test7.csv - AUC: 1.000, Accuracy: 0.975




data_test8.csv - AUC: 1.000, Accuracy: 1.000




data_test9.csv - AUC: 1.000, Accuracy: 1.000




data_test10.csv - AUC: 1.000, Accuracy: 1.000




data_test11.csv - AUC: 1.000, Accuracy: 1.000




data_test12.csv - AUC: 1.000, Accuracy: 1.000


In [20]:
# Persist the fitted classifier with joblib.
# joblib.dump does not create missing parent directories, so make sure the target
# folder exists first; otherwise a fresh checkout fails here with FileNotFoundError.
# NOTE: the saved file is a pickle -- only load it back from a trusted location.
Path('saved_model').mkdir(parents=True, exist_ok=True)
joblib.dump(clf, 'saved_model/TabPFN.pkl')

['saved_model/TabPFN.pkl']