In [23]:
from sklearn.metrics import log_loss, roc_auc_score , accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from tabpfn import TabPFNClassifier
from tqdm import tqdm
import openml
import torch as th
import time
import pandas as pd

In [2]:
def extend_datasets(datasets, filtering = False):
    extended_datasets = {}
    i = 0
    for d in tqdm(datasets):
        if ((not 'NumberOfFeatures' in datasets[d])
                or (not 'NumberOfClasses' in datasets[d])
                or (not 'NumberOfInstances' in datasets[d])
                # or datasets[d]['NumberOfFeatures'] >= num_feats
                or datasets[d]['NumberOfClasses'] <= 0):
            print(datasets[d])
            continue
        ds = openml.datasets.get_dataset(d, download_data=False)
        if filtering and (datasets[d]['NumberOfInstances'] < 150
                          or datasets[d]['NumberOfInstances'] > 2000
                         or datasets[d]['NumberOfFeatures'] > 100
                         or datasets[d]['NumberOfClasses'] > 10):
            continue
        extended_datasets[d] = datasets[d]
        extended_datasets[d].update(ds.qualities)
    
    return extended_datasets

# All datasets
openml_list = openml.datasets.list_datasets()
openml_list = pd.DataFrame.from_dict(openml_list, orient="index")

# Select only classification
openml_list = openml_list[~openml_list['MajorityClassSize'].isna()]

# Remove duplicated datasets
duplicated = openml_list.duplicated(subset=['MajorityClassSize', 'MaxNominalAttDistinctValues', 'MinorityClassSize',
       'NumberOfClasses', 'NumberOfFeatures', 'NumberOfInstances',
       'NumberOfInstancesWithMissingValues', 'NumberOfMissingValues',
       'NumberOfNumericFeatures', 'NumberOfSymbolicFeatures'], keep='first')
openml_list = openml_list[~duplicated]

duplicated = openml_list.duplicated(subset=['name'], keep='first')
openml_list = openml_list[~duplicated]

# Filter out datasets that don't have meta information or Don't fulfill other criteria
openml_list = openml_list.to_dict(orient='index')
openml_list = pd.DataFrame.from_dict(extend_datasets(openml_list, filtering=True), orient="index")

# Filter out datasets in Open CC
# openml_list = openml_list[~openml_list.name.apply(lambda x: x in test_datasets_multiclass_df.name.values)]
# openml_list['CFI'] = openml_list.apply(lambda x: str(x.NumberOfClasses) + '_' + str(x.NumberOfFeatures) + '_' + str(x.NumberOfInstances), axis = 1)
# test_datasets_multiclass_df['CFI'] = test_datasets_multiclass_df.apply(lambda x: str(x.NumberOfClasses) + '_' + str(x.NumberOfFeatures) + '_' + str(x.NumberOfInstances), axis = 1)
# openml_list = openml_list[~openml_list.CFI.apply(lambda x: x in test_datasets_multiclass_df.CFI.values)]

# Remove time series and artificial data
openml_list = openml_list[~openml_list.name.apply(lambda x: 'autoUniv' in x)]
openml_list = openml_list[~openml_list.name.apply(lambda x: 'fri_' in x)]
openml_list = openml_list[~openml_list.name.apply(lambda x: 'FOREX' in x)]

# Remove datasets that overlapped with Open CC closely by name
openml_list = openml_list[~openml_list.name.apply(lambda x: 'ilpd' in x)]
openml_list = openml_list[~openml_list.name.apply(lambda x: 'car' in x)]
openml_list = openml_list[~openml_list.name.apply(lambda x: 'pc1' in x)]

# Remove datasets that didn't load
openml_list = openml_list[~openml_list.did.apply(lambda x: x in {1065, 40589, 41496, 770, 43097, 43148, 43255, 43595, 43786, 41701})]

# Remove class skew
openml_list = openml_list[(openml_list.MinorityClassSize / openml_list.MajorityClassSize) > 0.05]
openml_list = openml_list[openml_list.AutoCorrelation != 1]

# Remove too easy
openml_list = openml_list[openml_list.CfsSubsetEval_DecisionStumpAUC != 1]


  openml_list = openml.datasets.list_datasets()
  ds = openml.datasets.get_dataset(d, download_data=False)
100%|██████████| 1461/1461 [41:32<00:00,  1.71s/it] 


In [7]:
renamer = {'name': 'Name', 'NumberOfFeatures': '# Features', 'NumberOfSymbolicFeatures': '# Categorical Features', 'NumberOfInstances': '# Instances', 'NumberOfMissingValues': '# NaNs', 'NumberOfClasses': '# Classes', 'MinorityClassSize': 'Minority Class Size'}
print_table = openml_list
print_table = print_table[['name', 'NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].copy()
print_table['id'] = print_table.index
print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']] = print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].astype(int)
print_table = print_table.rename(columns=renamer)
print_table
# print(print_table.to_latex(index=False))

Unnamed: 0,Name,# Features,# Categorical Features,# Instances,# Classes,# NaNs,Minority Class Size,id
11,balance-scale,5,1,625,3,0,49,11
13,breast-cancer,10,10,286,2,9,85,13
14,mfeat-fourier,77,1,2000,10,0,200,14
15,breast-w,10,1,699,2,16,241,15
16,mfeat-karhunen,65,1,2000,10,0,200,16
...,...,...,...,...,...,...,...,...
45545,Tour-and-Travels-Customer-Churn-Prediction,7,5,954,2,60,224,45545
45557,Mammographic-Mass-Data-Set,5,3,961,2,160,445,45557
45604,dummy,7,1,1000,2,0,273,45604
45711,doa_bwin,14,3,530,2,0,176,45711


In [38]:
len(openml_list)
openml_list
# get index as list
openml_list_dids = list(openml_list.index)


In [8]:
# get the datasets from openml
datasets = {}
for did in openml_list.index:
    datasets[did] = openml.datasets.get_dataset(did)
    

  datasets[did] = openml.datasets.get_dataset(did)


In [48]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import numpy as np
import torch

def get_openml_classification(did, max_samples, multiclass=True, shuffled=True):
    dataset = openml.datasets.get_dataset(did)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="array", target=dataset.default_target_attribute
    )

    if not multiclass:
        X = X[y < 2]
        y = y[y < 2]

    if multiclass and not shuffled:
        raise NotImplementedError("This combination of multiclass and shuffling isn't implemented")

    if not isinstance(X, np.ndarray) or not isinstance(y, np.ndarray):
        print('Not a NP Array, skipping')
        return None, None, None, None

    if not shuffled:
        sort = np.argsort(y) if y.mean() < 0.5 else np.argsort(-y)
        pos = int(y.sum()) if y.mean() < 0.5 else int((1 - y).sum())
        X, y = X[sort][-pos * 2:], y[sort][-pos * 2:]
        y = torch.tensor(y).reshape(2, -1).transpose(0, 1).reshape(-1).flip([0]).float()
        X = torch.tensor(X).reshape(2, -1, X.shape[1]).transpose(0, 1).reshape(-1, X.shape[1]).flip([0]).float()
    else:
        order = np.arange(y.shape[0])
        np.random.seed(13)
        np.random.shuffle(order)
        X, y = torch.tensor(X[order]), torch.tensor(y[order])
    if max_samples:
        X, y = X[:max_samples], y[:max_samples]

    return X, y, list(np.where(categorical_indicator)[0]), attribute_names

def preprocess_impute(x, y, test_x, test_y, impute, one_hot, standardize, cat_features=[]):
    import warnings
    def warn(*args, **kwargs):
        pass

    warnings.warn = warn

    # x, y, test_x, test_y = x.cpu().numpy(), y.cpu().long().numpy(), test_x.cpu().numpy(), test_y.cpu().long().numpy()

    if impute:
        imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp_mean.fit(x)
        x, test_x = imp_mean.transform(x), imp_mean.transform(test_x)

    if one_hot:
        def make_pd_from_np(x):
            data = pd.DataFrame(x)
            for c in cat_features:
                data.iloc[:, c] = data.iloc[:, c].astype('int')
            return data
        x, test_x = make_pd_from_np(x),  make_pd_from_np(test_x)
        transformer = ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), cat_features)], remainder="passthrough")
        transformer.fit(x)
        x, test_x = transformer.transform(x), transformer.transform(test_x)

    if standardize:
        scaler = MinMaxScaler()
        scaler.fit(x)
        x, test_x = scaler.transform(x), scaler.transform(test_x)

    return x, y, test_x, test_y

In [50]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

#tab_pfn_clf = SklearnClassifier.from_torch(tab_pfn)
classifier = TabPFNClassifier(device='cuda', N_ensemble_configurations=4)

scores = {}

for did in tqdm(openml_list.index):
    entry = openml_list.loc[did]
    
    try:
        X, y, categorical_feats, attribute_names = get_openml_classification(int(entry.did), max_samples = 10000,multiclass=False, shuffled=True)
    except:
        continue
    
    with th.no_grad():
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, test_size=0.5)
        # preprocess_impute 
        X_train, y_train, X_test, y_test = preprocess_impute(X_train, y_train, X_test, y_test, impute=True,one_hot=True, standardize=False, cat_features=categorical_feats)
        try:
            start = time.time()
            classifier.fit(X_train, y_train)
            y_eval = classifier.predict(X_test)
            y_prob = classifier.predict_proba(X_test)
            pred_time =  time.time() - start
        except ValueError as ve:
            print(ve)
            print("ve",did)
            continue
        except TypeError as te:
            print(te)
            print("te",did)
            continue
        # print(y_test.shape, y_prob.shape)
        if y_prob.shape[1]==2:
            y_prob = y_prob[:,1]
            
        roc_auc = roc_auc_score(y_test, y_prob, multi_class="ovr")
        cross_entropy = log_loss(y_test, y_prob)
        accuracy = accuracy_score(y_test, y_eval)
        
        scores[entry.Name] = {
            "roc": roc_auc,
            "pred_time": pred_time,
            "cross_entropy": cross_entropy,
            "accuracy": accuracy
        }

  4%|▎         | 9/251 [00:00<00:17, 13.67it/s]

('The number of features for this classifier is restricted to ', 100)
ve 25


  y = y.astype(target_dtype) if isinstance(y, np.ndarray) else y
 32%|███▏      | 80/251 [00:05<00:08, 20.41it/s]

('The number of features for this classifier is restricted to ', 100)
ve 802


 33%|███▎      | 83/251 [00:05<00:08, 20.03it/s]

('The number of features for this classifier is restricted to ', 100)
ve 825


 43%|████▎     | 108/251 [00:06<00:08, 16.90it/s]

('The number of features for this classifier is restricted to ', 100)
ve 940


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
 61%|██████    | 153/251 [00:09<00:05, 17.76it/s]

('The number of features for this classifier is restricted to ', 100)
ve 6332
('The number of features for this classifier is restricted to ', 100)
ve 23381


 63%|██████▎   | 158/251 [00:10<00:06, 14.65it/s]

('The number of features for this classifier is restricted to ', 100)
ve 40663


 80%|████████  | 202/251 [00:15<00:05,  8.71it/s]

('The number of features for this classifier is restricted to ', 100)
ve 44528
('The number of features for this classifier is restricted to ', 100)
ve 44538


 81%|████████▏ | 204/251 [00:16<00:18,  2.54it/s]

('The number of features for this classifier is restricted to ', 100)
ve 44539


 82%|████████▏ | 205/251 [00:17<00:21,  2.14it/s]

('The number of features for this classifier is restricted to ', 100)
ve 44540


 82%|████████▏ | 206/251 [00:18<00:23,  1.92it/s]

('The number of features for this classifier is restricted to ', 100)
ve 44541


 82%|████████▏ | 207/251 [00:19<00:25,  1.75it/s]

('The number of features for this classifier is restricted to ', 100)
ve 44542


 85%|████████▍ | 213/251 [00:19<00:07,  5.36it/s]

('The number of features for this classifier is restricted to ', 100)
ve 44578
('The number of features for this classifier is restricted to ', 100)
ve 44579
('The number of features for this classifier is restricted to ', 100)
ve 44580
('The number of features for this classifier is restricted to ', 100)
ve 44581


 86%|████████▌ | 215/251 [00:20<00:05,  6.63it/s]

('The number of features for this classifier is restricted to ', 100)
ve 44582


 91%|█████████ | 229/251 [00:21<00:02,  8.66it/s]

('The number of features for this classifier is restricted to ', 100)
ve 44703
('The number of features for this classifier is restricted to ', 100)
ve 44708


 94%|█████████▍| 237/251 [00:22<00:01,  9.23it/s]

('The number of features for this classifier is restricted to ', 100)
ve 44758
('The number of features for this classifier is restricted to ', 100)
ve 44759
('The number of features for this classifier is restricted to ', 100)
ve 44760
('The number of features for this classifier is restricted to ', 100)
ve 44761
('The number of features for this classifier is restricted to ', 100)
ve 44762


 97%|█████████▋| 243/251 [00:23<00:00, 14.23it/s]

('The number of features for this classifier is restricted to ', 100)
ve 44763
('The number of features for this classifier is restricted to ', 100)
ve 44768
('The number of features for this classifier is restricted to ', 100)
ve 44769
('The number of features for this classifier is restricted to ', 100)
ve 44770
('The number of features for this classifier is restricted to ', 100)
ve 44771


 98%|█████████▊| 246/251 [00:23<00:00, 17.15it/s]

('The number of features for this classifier is restricted to ', 100)
ve 44772
('The number of features for this classifier is restricted to ', 100)
ve 44773


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
100%|██████████| 251/251 [00:23<00:00, 10.63it/s]


In [51]:
for n, score in scores.items():
    print(n, score)

11 {'roc': 0.9873353596757852, 'pred_time': 0.23940324783325195, 'cross_entropy': 0.09797866393602128, 'accuracy': 0.9704142011834319}
13 {'roc': 0.6612446958981613, 'pred_time': 0.05823373794555664, 'cross_entropy': 0.5955043248832226, 'accuracy': 0.7482517482517482}
14 {'roc': 1.0, 'pred_time': 0.07751774787902832, 'cross_entropy': 0.003318711922590225, 'accuracy': 1.0}
15 {'roc': 0.9953083835576888, 'pred_time': 0.05169534683227539, 'cross_entropy': 0.09616327990233133, 'accuracy': 0.9657142857142857}
16 {'roc': 1.0, 'pred_time': 0.05933642387390137, 'cross_entropy': 0.022156645096046645, 'accuracy': 0.99}
18 {'roc': 1.0, 'pred_time': 0.0337979793548584, 'cross_entropy': 0.004131560065284248, 'accuracy': 1.0}
22 {'roc': 1.0, 'pred_time': 0.049997568130493164, 'cross_entropy': 0.010629824524090168, 'accuracy': 0.99}
23 {'roc': 0.7980378497619878, 'pred_time': 0.07345318794250488, 'cross_entropy': 0.5103311359431215, 'accuracy': 0.7588357588357588}
29 {'roc': 0.9415676478530921, 'pred

In [52]:
cross_entropy = sum(s["cross_entropy"] for _, s in scores.items()) / len(scores)
accuracy = sum(s["accuracy"] for _, s in scores.items()) / len(scores)
print(cross_entropy)
print(accuracy)

nan
0.8302906388750743
