In [1]:
import collections
import itertools

import numpy as np
import openml
import pandas as pd
pd.set_option("display.max_rows", None)
pd.set_option("display.width", 999)

import tabular_data_experiments.utils.data_utils
import tabular_data_experiments.utils.suites

In [2]:
# Fetch the OpenML dataset and task listings. Later cells index `all_datasets`
# by dataset ID (to read the dataset name) and `all_tasks` by task ID (to read
# the task's dataset ID and feature/class statistics).
all_datasets = openml.datasets.list_datasets()
all_tasks = openml.tasks.list_tasks()

In [3]:
# Load the four bookkeeping sheets that were exported as CSV files.
paper_to_datasets, dataset_list_used, dataset_list_unused, dataset_age = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./paper_to_dataset.csv",
        "./dataset_list.csv",
        "./dataset_list_unused.csv",
        "./dataset_age.csv",
    )
)

# Cast the ID/year columns to the nullable integer dtype so that they stay
# integers even where a cell is empty (e.g. "Dataset ID" is missing in several
# rows of the paper-to-dataset sheet).
nullable_int_columns = (
    (dataset_age, ("Dataset ID", "Year")),
    (paper_to_datasets, ("Dataset ID", "Dataset Mapping")),
    (dataset_list_used, ("Dataset ID",)),
)
for frame, columns in nullable_int_columns:
    for column in columns:
        frame[column] = frame[column].astype("Int64")

In [4]:
# Preview the first rows of the paper-to-dataset sheet.
paper_to_datasets.head()

Unnamed: 0,Paper Key,Dataset ID,Dataset Name,Dataset Mapping,Unnamed: 4,Source,Task Type,Invalid,Note,Note to ourselves / copy to dataset sheet,reference,samples,feat,cat
0,agarwal-neurips21a,,MIMIC2,,True,,Classification,INVALID,,requires a signed agreementr,,,,
1,agarwal-neurips21a,,Credit Fraud,42397.0,True,,Classification,VALID,,,,,,
2,agarwal-neurips21a,,California Housing,,True,,Regression,VALID,,,,,,
3,agarwal-neurips21a,,FICO,,True,,Regression,INVALID,,Not sure why they treat this as a regression d...,,,,
4,arik-aaai20a,,Covertype,1596.0,True,UCI,Classification,VALID,,https://archive-beta.ics.uci.edu/dataset/31/co...,,,,


In [5]:
# Each distinct paper key is treated as one suite; the sheet is expected to
# contain exactly 31 of them.
suite_names = paper_to_datasets["Paper Key"].unique()
n_suites = len(suite_names)
assert n_suites == 31, n_suites

In [6]:
def lookup_name(dataset_id, dataset_name):
    """Return the lower-cased name of a dataset.

    The name given in the sheet takes precedence. Only when it is missing
    (NaN/None) is the name read from the OpenML listing ``all_datasets`` using
    ``dataset_id`` as key.
    """
    if pd.isna(dataset_name):
        return all_datasets[dataset_id]["name"].lower()
    return dataset_name.lower()

# Tally how often each classification dataset is used across all papers.
#
# `dataset_counts` and `dataset_names` share their keys: an OpenML dataset ID
# (int) for usable datasets and the raw sheet name (str) for rows marked
# INVALID. The "available" column further down relies on exactly this
# int-versus-str distinction.
dataset_counts = {}
openml_suite_datasets = set()
dataset_names = {}
regression_counts = {}
for _, row in paper_to_datasets.iterrows():
    if row["Task Type"] == "Regression":
        # Regression datasets are only counted by name.
        dataset_name = row["Dataset Name"]
        regression_counts[dataset_name] = regression_counts.get(dataset_name, 0) + 1
    elif row["Paper Key"] in ("gijsbers-arxiv22a", "bischl-neuripsdbt21a"):
        # Datasets of these two papers are collected separately and do not
        # contribute to the usage counts.
        openml_suite_datasets.add(row["Dataset ID"])
    elif row["Invalid"] == "INVALID":
        name = row["Dataset Name"]
        dataset_counts[name] = dataset_counts.get(name, 0) + 1
        dataset_names.setdefault(name, {name})
    else:
        dataset_id = row["Dataset ID"]
        dataset_mapping = row["Dataset Mapping"]
        # Exactly one of the two ID columns must be filled in.
        if pd.isna(dataset_id) and pd.isna(dataset_mapping):
            raise ValueError("Both dataset ID fields are empty", row)
        elif not pd.isna(dataset_id) and not pd.isna(dataset_mapping):
            raise ValueError("Both dataset ID fields are populated", row)
        elif pd.isna(dataset_id):
            did = int(dataset_mapping)
        else:
            did = int(dataset_id)
        name = lookup_name(did, row["Dataset Name"])
        dataset_counts[did] = dataset_counts.get(did, 0) + 1
        # Collect every alias under which the papers refer to this dataset.
        dataset_names.setdefault(did, set()).add(name)

dataset_stats = pd.DataFrame([dataset_counts, dataset_names]).transpose()
dataset_stats.columns = ["Count", "Name"]
# Integer keys are OpenML IDs, i.e. datasets that could be resolved.
dataset_stats["available"] = {idx: isinstance(idx, int) for idx in dataset_stats.index}
# Look up the creation year(s) recorded for each resolved dataset ...
dataset_stats["Creation"] = {
    idx: dataset_age[dataset_age["Dataset ID"] == idx]["Year"].tolist() if isinstance(idx, int) else []
    for idx in dataset_stats.index
}
# ... and keep the first one; datasets without an entry get NaN.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy >= 2.0.
dataset_stats["Creation"] = dataset_stats["Creation"].apply(
    lambda entry: entry[0] if len(entry) > 0 else np.nan
)

In [7]:
# Cross-check the sheets against each other and print a summary. Warnings are
# only printed when an inconsistency is found.
listed_ids = set(dataset_list_used["Dataset ID"].to_list())
print("Datasets used according to Google Doc sheet 'Dataset list':", len(listed_ids))

# Integer index entries of `dataset_stats` are resolved OpenML dataset IDs.
dataset_ids_used = [idx for idx in dataset_stats.index if isinstance(idx, int)]
counted_ids = set(dataset_ids_used)

missing_in_list = counted_ids - listed_ids
if missing_in_list:
    print("Datasets that are missing in 'Dataset list':", missing_in_list)

missing_in_papers = listed_ids - counted_ids - set(openml_suite_datasets)
if missing_in_papers:
    print("Datasets that are missing in 'Paper to Dataset':", missing_in_papers)

if openml_suite_datasets - listed_ids:
    print("Datasets from the CC18 and the AutoML benchmark that are not in the datasets list", set(openml_suite_datasets) - listed_ids)

# Only the row counts are compared here; the printed sets show which IDs differ.
if len(dataset_list_used) != len(dataset_age):
    used_id_column = set(dataset_list_used["Dataset ID"])
    age_id_column = set(dataset_age["Dataset ID"])
    print(
        "Datasets in dataset list and dataset age list are disjoint",
        "dataset age list misses", used_id_column - age_id_column,
        "dataset list misses", age_id_column - used_id_column
    )

unavailable = ~dataset_stats["available"]
n_unavailable = unavailable.sum()
only_openml = openml_suite_datasets - set(dataset_stats[dataset_stats["available"]].index)
print("Number of datasets that are only in the CC18 and AutoML benchmark", len(only_openml))
print("Number of Classification datasets (excluding CC18 and AMLB)", len(dataset_stats))
print("Number of available Classification Datasets used", dataset_stats["available"].sum())
print("Number of unavailable datasets", n_unavailable)
print("Number of available and unavailable and OpenML datasets", len(dataset_stats) + len(only_openml))

if n_unavailable != len(dataset_list_unused):
    print("Number of unavailable datasets is different for different lists", n_unavailable, len(dataset_list_unused))

# NOTE: we do not count invalid regression datasets yet!
# Unavailable datasets carry a single-element name set, so the first element
# is the dataset name.
dataset_names_unavailable = set([list(entry)[0] for entry in dataset_stats[unavailable]["Name"]])
unused_sheet_names = set(dataset_list_unused["Dataset Name"])
if dataset_names_unavailable != unused_sheet_names:
    print(
       "Unavailable datasets are different in different tables. The following ones are not in both Tables:", 
       dataset_names_unavailable.symmetric_difference(unused_sheet_names)
    )

print("Number of Regression Datasets", len(regression_counts))

Datasets used according to Google Doc sheet 'Dataset list': 187
Number of datasets that are only in the CC18 and AutoML benchmark 20
Number of Classification datasets (excluding CC18 and AMLB) 191
Number of available Classification Datasets used 167
Number of unavailable datasets 24
Number of available and unavailable and OpenML datasets 211
Number of Regression Datasets 54


In [8]:
# Most frequently used datasets first; the creation year becomes a nullable
# integer so that missing years do not force the column to float.
sorted_stats = (
    dataset_stats
    .sort_values(by="Count", ascending=False)
    .astype({"Creation": "Int64"})
)
sorted_stats.head()

Unnamed: 0,Count,Name,available,Creation
1596,12,{covertype},True,1998
1590,10,"{adult income, adult roc, adult}",True,1996
45575,6,{epsilon},True,2008
45570,6,{higgs},True,2014
4538,5,"{gesture phase, gesture phase prediction, gest...",True,2014


In [9]:
# Render the 20 most frequently used datasets as a Markdown table.
print(sorted_stats.iloc[:20].to_markdown())

|        |   Count | Name                                                                               | available   | Creation   |
|:-------|--------:|:-----------------------------------------------------------------------------------|:------------|:-----------|
| 1596   |      12 | {'covertype'}                                                                      | True        | 1998       |
| 1590   |      10 | {'adult income', 'adult roc', 'adult'}                                             | True        | 1996       |
| 45575  |       6 | {'epsilon'}                                                                        | True        | 2008       |
| 45570  |       6 | {'higgs'}                                                                          | True        | 2014       |
| 4538   |       5 | {'gesture phase', 'gesture phase prediction', 'gesturephasesegmentationprocessed'} | True        | 2014       |
| 45062  |       5 | {'shrutime', 'churn modelling'}                 

In [10]:
# Render the same 20 most frequently used datasets as a LaTeX table.
print(sorted_stats.iloc[:20].to_latex())

\begin{tabular}{lllrr}
\toprule
 & Count & Name & available & Creation \\
\midrule
1596 & 12 & {'covertype'} & True & 1998 \\
1590 & 10 & {'adult income', 'adult roc', 'adult'} & True & 1996 \\
45575 & 6 & {'epsilon'} & True & 2008 \\
45570 & 6 & {'higgs'} & True & 2014 \\
4538 & 5 & {'gesture phase', 'gesture phase prediction', 'gesturephasesegmentationprocessed'} & True & 2014 \\
45062 & 5 & {'shrutime', 'churn modelling'} & True & 2019 \\
23512 & 5 & {'higgs small'} & True & 2014 \\
31 & 5 & {'german credit', 'credit-g'} & True & 1994 \\
1464 & 4 & {'blood-transfusion-service-center', 'blood-transfusion'} & True & 2008 \\
42397 & 4 & {'credit', 'c.c.fraudd', 'credit fraud'} & True & 2015 \\
1494 & 4 & {'qsar-biodeg'} & True & 2013 \\
37 & 4 & {'diabetes'} & True & 1988 \\
Click & 4 & {'Click'} & False & NaN \\
45554 & 4 & {'fico'} & True & 2018 \\
40975 & 4 & {'car'} & True & 1988 \\
1461 & 3 & {'bank-marketing'} & True & 2011 \\
MIMIC2 & 3 & {'MIMIC2'} & False & NaN \\
6332 & 3 & {

In [11]:
# Build the LaTeX table of all available datasets, ordered by OpenML dataset ID.
sorted_stats_all = sorted_stats.loc[sorted_stats["available"]].sort_index().copy()
# Turn every dataset ID into a hyperlink to its OpenML page. Raw strings keep
# the LaTeX backslashes literal ("\h" and "\_" are invalid escape sequences in
# ordinary string literals and trigger a SyntaxWarning on recent Python versions).
sorted_stats_all.index = [
    r"\href{https://openml.org/d/%d}{%d}" % (dataset_id, dataset_id)
    for dataset_id in sorted_stats_all.index
]
# A dataset may be known under several aliases. Set iteration order is not
# stable across interpreter runs, so take the alphabetically first alias to
# make the printed table reproducible.
sorted_stats_all["Name"] = [min(names) for names in sorted_stats_all["Name"]]
print(
    sorted_stats_all
    .drop("available", axis=1)
    .to_latex()
    .replace("_", r"\_")
)

\begin{tabular}{lllr}
\toprule
 & Count & Name & Creation \\
\midrule
\href{https://openml.org/d/2}{2} & 1 & anneal & 1990 \\
\href{https://openml.org/d/3}{3} & 2 & kr-vs-kp & 1983 \\
\href{https://openml.org/d/5}{5} & 1 & arrhythmia & 1998 \\
\href{https://openml.org/d/6}{6} & 1 & letter & 1991 \\
\href{https://openml.org/d/11}{11} & 2 & balance-scale & 1976 \\
\href{https://openml.org/d/12}{12} & 2 & mfeat-factors & 1998 \\
\href{https://openml.org/d/13}{13} & 3 & breast-cancer & 1988 \\
\href{https://openml.org/d/14}{14} & 2 & mfeat-fourier & 1998 \\
\href{https://openml.org/d/15}{15} & 2 & breast-w & 1990 \\
\href{https://openml.org/d/16}{16} & 2 & mfeat-karhunen & 1998 \\
\href{https://openml.org/d/18}{18} & 2 & mfeat-morphological & 1998 \\
\href{https://openml.org/d/22}{22} & 2 & mfeat-zernike & 1998 \\
\href{https://openml.org/d/23}{23} & 2 & cmc & 1987 \\
\href{https://openml.org/d/25}{25} & 1 & horse-colic  & 1989 \\
\href{https://openml.org/d/28}{28} & 2 & optdigits & 1995 \

In [12]:
def lookup_name(dataset_id):
    """Format a dataset identifier for display.

    Identifiers that can be converted to ``int`` and are present in the OpenML
    listing ``all_datasets`` are rendered as ``"<lower-cased name> (<id>)"``.
    Everything else is returned unchanged: identifiers that are not numeric
    (unavailable datasets are tracked by their plain name) and numeric IDs
    that are missing from the listing.

    Returning the argument unchanged in the second case keeps a string input a
    string, so the result can still be passed to ``str.join``. Errors other
    than the expected conversion/lookup failures are no longer swallowed.
    """
    try:
        numeric_id = int(dataset_id)
    except (TypeError, ValueError):
        # Not an OpenML ID.
        return dataset_id
    try:
        return "%s (%s)" % (all_datasets[numeric_id]["name"].lower(), numeric_id)
    except KeyError:
        # Unknown ID (or listing entry without a name): fall back to the input.
        return dataset_id

# Count how many combinations of i datasets are shared by at least j papers.
# filtered_stuff[i][j] holds that number.
filtered_stuff = collections.defaultdict(lambda: collections.defaultdict(int))
# This should maximally be set to 6, unless one has more than 64GB of RAM
max_combinations = 6

# The datasets used by a paper do not depend on the combination size, so they
# are resolved once up front instead of once per value of i.
datasets_per_paper = {}
for key, value in paper_to_datasets.groupby("Paper Key"):
    # Adding these back in takes an incredible amount of memory
    if key in ("gijsbers-arxiv22a", "bischl-neuripsdbt21a"):
        continue
    dataset_ids = []
    for _, row in value.iterrows():
        if row["Invalid"] == "INVALID":
            # Unavailable datasets are identified by their name.
            dataset_ids.append(str(row["Dataset Name"]))
        elif row["Task Type"] == "Regression":
            continue
        else:
            dataset_id = row["Dataset ID"]
            dataset_mapping = row["Dataset Mapping"]
            # Exactly one of the two ID columns must be filled in.
            if pd.isna(dataset_id) and pd.isna(dataset_mapping):
                raise ValueError("Both dataset ID fields are empty", row)
            elif not pd.isna(dataset_id) and not pd.isna(dataset_mapping):
                raise ValueError("Both dataset ID fields are populated", row)
            elif pd.isna(dataset_id):
                did = int(dataset_mapping)
            else:
                did = int(dataset_id)
            dataset_ids.append(str(did))
    datasets_per_paper[key] = dataset_ids

for i in range(2, max_combinations):
    n_grams = collections.defaultdict(int)
    # Papers in which each combination occurs (not read again in this cell).
    used_where = collections.defaultdict(list)
    for key, dataset_ids in datasets_per_paper.items():
        # Iterate lazily: materialising all combinations in a list first only
        # costs memory.
        for n_gram in itertools.combinations(dataset_ids, i):
            # Sort so that the same combination always maps to the same key.
            n_gram = tuple(sorted(n_gram))
            n_grams[n_gram] += 1
            used_where[n_gram].append(key)

    # now check how many dataset combinations we have per min appearance
    for j in range(2, max_combinations + 1):
        filtered = {combo: count for combo, count in n_grams.items() if count >= j}
        filtered_list = sorted(list(filtered.items()), key=lambda t: t[1], reverse=True)
        if len(filtered_list) > 0:
            highest_value = filtered_list[0][1]
            # Show the most frequent combinations (ties only, from the first 20 entries).
            print(i, j, len(filtered_list), ", ".join(["(" + ", ".join([lookup_name(d) for d in fl[0]]) + ")" for fl in filtered_list[:20] if fl[1] == highest_value]))
        filtered_stuff[i][j] += len(filtered_list)

print(pd.DataFrame(filtered_stuff).to_latex())


2 2 687 (covertype (1596), higgs (45570)), (covertype (1596), epsilon (45575)), (adult (1590), higgs (23512))
2 3 38 (covertype (1596), higgs (45570)), (covertype (1596), epsilon (45575)), (adult (1590), higgs (23512))
2 4 8 (covertype (1596), higgs (45570)), (covertype (1596), epsilon (45575)), (adult (1590), higgs (23512))
2 5 3 (covertype (1596), higgs (45570)), (covertype (1596), epsilon (45575)), (adult (1590), higgs (23512))
3 2 4925 (blood-transfusion-service-center (1464), credit-approval (29), cylinder-bands (6332)), (blood-transfusion-service-center (1464), credit-g (31), vehicle (54)), (credit-g (31), car (40975), vehicle (54)), (qsar-biodeg (1494), eucalyptus (188), credit-g (31)), (eucalyptus (188), credit-g (31), car (40975)), (blood-transfusion-service-center (1464), credit-g (31), car (40975)), (qsar-biodeg (1494), credit-g (31), car (40975)), (adult (1590), credit-g (31), car (40975)), (blood-transfusion-service-center (1464), car (40975), vehicle (54)), (qsar-biodeg (

In [13]:
# Numbers from the paper
# Each entry maps a paper key to a 5-tuple:
#   (number of classification datasets, number of regression datasets,
#    number of other datasets, paper type, method name or None).
# The first two numbers are checked against the paper-to-dataset sheet below;
# the third is reported as "Other (according to paper)". The paper type is one
# of "New", "Comp", "Suite" or "Data" ("Data" marks the papers introducing data
# generators; presumably "New" = new method, "Comp" = comparison study,
# "Suite" = benchmark suite - TODO confirm).
suite_to_num_datasets = {
    # Agarwal et al. use Compas as a fifth dataset to demonstrate the intelligibility of the method.
    # Also, the paper contains further datasets for multi-task learning, which we do not consider.
    # Last but not least, it incorrectly treats the FICO dataset as a regression dataset.
    "agarwal-neurips21a":  (2, 2, 0, "New", "NAM"),
    # Uses 6 synthetic datasets, 2 UCI datasets for interpretability as other, also does SSL on one of the main datasets.
    # Uses 1 forecasting dataset which we count as regression in this list
    "arik-aaai20a": (3, 2, 8, "New", "TabNet"),
    # Uses 69 datasets from the CC18, drops MNIST, FASHION-MNIST and CIFAR10
    "bahri-iclr22a": (69, 0, 0, "New", "SCARF"),
    "bischl-neuripsdbt21a": (72, 0, 0, "Suite", None),
    "borisov-iclr23a": (0, 0, 6, "Data", "GReaT"),
    "borisov-tnnls22a": (4, 1, 0, "Comp", None),
    # Uses the adult dataset for interpretation and classification. Only counting it for classification.
    "buturovic-biorxiv20a": (1, 0, 0, "New", "TAC"),
    "cai-sigmod21a": (5, 0, 0, "New", "ARM-Net"),
    "chen-aaai22a": (4, 3, 0, "New", "DANET"),
    # Uses three image datasets as well
    "dubey-neurips22a": (9, 4, 3, "New", "SPAM"),
    "gijsbers-arxiv22a": (71, 0, 0, "Suite", None),
    # 4 datasets that weren't interesting + 1 synthetic
    "gorishniy-neurips21a": (7, 4, 5, "Comp", "FT-Transformer"),
    # Same synthetic dataset as above
    "gorishniy-neurips22a": (7, 4, 1, "New", "multiple"),
    # Conducts further studies on dataset changes, not sure how relevant these are to mention
    # The paper mentions 18 and 27 as distinct datasets, but the studies themselves
    # have more papers because some datasets are transformed to purely numerical
    "grinsztajn-neuripsdbt22a": (21, 18 + 17, 0, "Comp", None),
    # Conducts generalization studies on 18 datasets, additional classification studies on 2 more datasets from the AutoML benchmark, and 150 validation datasets
    "hollmann-iclr23a": (30, 0, 170, "New", "TabPFN"),
    # One of the datasets is actually a multi-label dataset
    "huang-arxiv20a": (20, 0, 0, "New", "TabTransformer"),
    "joseph-arxiv22a": (3, 2, 0, "New", "GATE"),
    "kadra-neurips21a": (40, 0, 0, "New", "RegCocktail"),
    # That additional data is CIFAR-10
    "kossen-neurips21a": (6, 4, 2, "New", "NPT"),
    "kotelnikov-openreview23a": (0, 0, 16, "Data", None),
    "levin-iclr23a": (1, 0, 2, "New", "multiple"),
    "popov-iclr20a": (3, 3, 0, "New", "NODE"),
    # Uses three image datasets
    "radenovic-neurips22a": (8, 4, 3, "New", "NBM"),
    "rubachev-openreview23a": (6, 5, 1, "New", "multiple"),
    "sarkar-isa22a": (8, 0, 0, "New", "XGBNet"),
    # 16 small and 4 medium
    "schaefl-openreview22a": (16, 0, 4, "New", "Hopular"),
    # Also does ensembling of NNs and XGB, mistakes one regression dataset for a classification dataset
    "shwartz-ziv-if22a": (9, 2, 0, "Comp", None),
    # The additional dataset is MNIST used to interpret the attention mechanism
    "somepalli-openreview22a": (20, 10, 1, "New", "SAINT"),
    "sun-cvprw19a": (4, 0, 0, "New", "SuperTML"),
    # 6 blood cells, 2 clinical data and 3 UCI
    "yoon-neurips20a": (11, 0, 0, "New", "VIME"),
    "zhu-sr21a": (0, 2, 0, "New", "IGDT"),
}
# For every paper, compare the dataset counts stated in `suite_to_num_datasets`
# with the rows of the paper-to-dataset sheet and with the suites defined in
# code, then assemble the per-paper summary table.
table_to_print = {}
for suite in suite_to_num_datasets:
    n_classification_paper, n_regression_paper, n_other_paper, suite_type, method = suite_to_num_datasets[suite]

    dataset_subset = paper_to_datasets[paper_to_datasets["Paper Key"] == suite]
    classification_subset = dataset_subset[dataset_subset["Task Type"] == "Classification"]
    classification_subset_dataset_ids = set()
    for _, row in classification_subset.iterrows():
        has_id = not pd.isna(row["Dataset ID"])
        has_mapping = not pd.isna(row["Dataset Mapping"])
        if has_id and has_mapping:
            raise ValueError(row)
        elif has_id:
            classification_subset_dataset_ids.add(row["Dataset ID"])
        elif has_mapping:
            classification_subset_dataset_ids.add(row["Dataset Mapping"])
        elif row["Invalid"] == "INVALID":
            # Invalid rows legitimately have no ID.
            continue
        else:
            raise ValueError(row)

    # Names of this paper's classification rows that are marked INVALID. This is
    # computed once per paper rather than once per unused dataset name.
    invalid_classification_names = set(
        classification_subset.query("Invalid == 'INVALID'")["Dataset Name"].to_list()
    )
    unused_classification_datasets = [
        dataset_name
        for dataset_name in dataset_list_unused["Dataset Name"]
        if dataset_name in invalid_classification_names
    ]
    regression_subset = dataset_subset[dataset_subset["Task Type"] == "Regression"]
    # NOTE(review): the sheet appears to mark rows as "INVALID"/"VALID" (upper
    # case), so this mixed-case comparison presumably never matches. The result
    # is not used below - confirm whether it can be dropped or should be fixed.
    other_subset = dataset_subset[dataset_subset["Invalid"] == "Invalid"]

    # Compare the dataset IDs from the sheet with the suite definition in code.
    if suite in tabular_data_experiments.utils.suites.CUSTOM_SUITES:
        dataset_ids_in_code = set([all_tasks[tid]["did"] for tid in tabular_data_experiments.utils.suites.CUSTOM_SUITES[suite]])
    else:
        dataset_ids_in_code = set()
    dataset_ids_table = set(classification_subset_dataset_ids)
    num_entries_in_code = len(dataset_ids_in_code)
    num_entries_in_table = len(dataset_ids_table)
    if num_entries_in_code != num_entries_in_table:
        print(suite, num_entries_in_code, num_entries_in_table)
        print("Entries in code", dataset_ids_in_code)
        print("Entries in google doc", classification_subset_dataset_ids)
        print("Entries missing from code", dataset_ids_table - dataset_ids_in_code)
        print("Entries missing from google doc", dataset_ids_in_code - dataset_ids_table)

    # Compare the counts from the sheet with the numbers stated in the paper.
    print("###")
    print(suite)
    if suite_type != "Data":
        if len(classification_subset_dataset_ids) + len(unused_classification_datasets) == n_classification_paper:
            print("Classification okay")
        else:
            print(len(classification_subset_dataset_ids), len(unused_classification_datasets), n_classification_paper)
        if regression_subset.shape[0] == n_regression_paper:
            print("Regression okay")
        else:
            print(regression_subset.shape[0], n_regression_paper)

    # Benchmark suites are verified above but do not appear in the table.
    if suite_type == "Suite":
        continue
    elif suite_type not in ("Data", "New", "Comp"):
        raise ValueError(suite_type)

    # "Data" papers are listed with zero classification/regression datasets;
    # only their "other" count from the paper is reported.
    is_data_paper = suite_type == "Data"
    table_to_print[suite] = {
        "Type": suite_type,
        "Method": method if method else "-",
        "Paper": "\\citet{" + suite + "}",
        "Classification": 0 if is_data_paper else len(classification_subset),
        "Regression": 0 if is_data_paper else len(regression_subset),
        "Invalid Classification": 0 if is_data_paper else len(unused_classification_datasets),
        "Other (according to paper)": n_other_paper
    }

table_for_paper = pd.DataFrame(table_to_print).transpose()
count_columns = ("Classification", "Regression", "Invalid Classification", "Other (according to paper)")
# Cast the count columns first, so that the statistics below are computed on
# integer columns instead of the object columns produced by the transpose.
for column in count_columns:
    table_for_paper[column] = table_for_paper[column].astype(int)

# Mean and median exclude the "Data" papers.
table_for_paper_wo_data = table_for_paper[table_for_paper["Type"] != "Data"]
print(table_for_paper.shape, table_for_paper_wo_data.shape)
means = [table_for_paper_wo_data[column].mean() for column in count_columns]
medians = [table_for_paper_wo_data[column].median() for column in count_columns]

table_for_paper.loc["Mean"] = ["Mean", "", ""] + means
table_for_paper.loc["Median"] = ["Median", "", ""] + medians
print(table_for_paper)
print(table_for_paper.to_latex(index=False).replace(".000000", ""))

###
agarwal-neurips21a
Classification okay
Regression okay
###
arik-aaai20a
Classification okay
Regression okay
###
bahri-iclr22a
Classification okay
Regression okay
###
bischl-neuripsdbt21a
Classification okay
Regression okay
###
borisov-iclr23a
###
borisov-tnnls22a
Classification okay
Regression okay
###
buturovic-biorxiv20a
Classification okay
Regression okay
###
cai-sigmod21a
Classification okay
Regression okay
###
chen-aaai22a
Classification okay
Regression okay
###
dubey-neurips22a
Classification okay
Regression okay
###
gijsbers-arxiv22a
Classification okay
Regression okay
###
gorishniy-neurips21a
Classification okay
Regression okay
###
gorishniy-neurips22a
Classification okay
Regression okay
###
grinsztajn-neuripsdbt22a
Classification okay
Regression okay
###
hollmann-iclr23a
Classification okay
Regression okay
###
huang-arxiv20a
Classification okay
Regression okay
###
joseph-arxiv22a
Classification okay
Regression okay
###
kadra-neurips21a
Classification okay
Regression okay
#

In [14]:
# This cell checks whether the papers introducing data generators feature any datasets
# that are not available in any other suite - and indeed, they do...
# NOTE(review): both set operations below use the whole COLLECTION_KEYS container and
# nothing specific to the current `suite`. As soon as one suite reaches the final `else`
# branch, everything added in the first loop is removed again, which matches the recorded
# output `set()`. Presumably a per-suite lookup was intended - confirm against
# tabular_data_experiments.utils.suites.
pure_data_suites = set()
for suite in suite_to_num_datasets:
    if suite_to_num_datasets[suite][3] == "Data":
        pure_data_suites = pure_data_suites | set(tabular_data_experiments.utils.suites.COLLECTION_KEYS)
for suite in tabular_data_experiments.utils.suites.COLLECTION_KEYS:
    # The two data-generator papers themselves are skipped.
    if suite in ("kotelnikov-openreview23a", "borisov-iclr23a"):
        continue
    elif suite not in tabular_data_experiments.utils.suites.CUSTOM_SUITES:
        # Report collection keys that have no custom suite definition.
        print(suite)
        continue
    else:
        pure_data_suites = pure_data_suites - set(tabular_data_experiments.utils.suites.COLLECTION_KEYS)
print(pure_data_suites)

set()


In [15]:
# Describe every custom suite by how many of its datasets are numeric,
# categorical or mixed, how many tasks are binary or multiclass, and by the
# mean majority-class share of its tasks.
suite_statistics = {}
for suite in suite_to_num_datasets:
    n_numeric_datasets = n_mixed_datests = n_categorical_datasets = 0
    n_binary = n_multiclass = 0
    balancedness = []
    # Only suites that are defined in code and are not data-generator papers.
    if suite not in tabular_data_experiments.utils.suites.CUSTOM_SUITES or suite_to_num_datasets[suite][3] == "Data":
        continue
    for task_id in tabular_data_experiments.utils.suites.CUSTOM_SUITES[suite]:
        task = all_tasks[task_id]
        n_symbolic = task['NumberOfSymbolicFeatures']
        n_numeric = task['NumberOfNumericFeatures']
        # A single symbolic feature is counted as a purely numeric dataset
        # (presumably that one feature is the class label - TODO confirm).
        if n_symbolic == 1:
            n_numeric_datasets += 1
        elif n_numeric == 0:
            n_categorical_datasets += 1
        else:
            n_mixed_datests += 1

        if "NumberOfClasses" in task:
            if task["NumberOfClasses"] == 2:
                n_binary += 1
            else:
                n_multiclass += 1
        else:
            # Some task entries lack the class statistics; report and skip them.
            print(suite, task_id, all_tasks[task_id], flush=True)

        # Majority-class share; skipped when either statistic is missing.
        if "MajorityClassSize" in task and "NumberOfInstances" in task:
            balancedness.append(task["MajorityClassSize"] / task["NumberOfInstances"])

    suite_statistics[suite] = {
        'Numeric': n_numeric_datasets,
        "Categorical": n_categorical_datasets,
        "Mixed": n_mixed_datests,
        "Binary": n_binary,
        "Multiclass": n_multiclass,
        "Balancedness": np.mean(balancedness),
    }

# One row per suite, one column per statistic.
suite_statistics = pd.DataFrame(suite_statistics).transpose()
non_numeric_datasets = suite_statistics["Categorical"] + suite_statistics["Mixed"]
suite_statistics["Ratio"] = suite_statistics["Numeric"] / non_numeric_datasets
print(suite_statistics)
print(suite_statistics.mean())
print(suite_statistics.median())

huang-arxiv20a 361679 {'tid': 361679, 'ttid': <TaskType.SUPERVISED_CLASSIFICATION: 1>, 'did': 45565, 'name': 'Adult-Census-Income', 'task_type': 'Supervised Classification', 'status': 'active', 'estimation_procedure': '4-fold Crossvalidation', 'source_data': '45565', 'target_feature': 'income', 'NumberOfFeatures': 15, 'NumberOfInstances': 32561, 'NumberOfInstancesWithMissingValues': 2399, 'NumberOfMissingValues': 4262, 'NumberOfNumericFeatures': 6, 'NumberOfSymbolicFeatures': 9}
kossen-neurips21a 361634 {'tid': 361634, 'ttid': <TaskType.SUPERVISED_CLASSIFICATION: 1>, 'did': 4535, 'name': 'Census-Income', 'task_type': 'Supervised Classification', 'status': 'active', 'estimation_procedure': '4-fold Crossvalidation', 'source_data': '4535', 'target_feature': 'V42', 'MaxNominalAttDistinctValues': 51, 'NumberOfFeatures': 42, 'NumberOfInstances': 299285, 'NumberOfInstancesWithMissingValues': 0, 'NumberOfMissingValues': 0, 'NumberOfNumericFeatures': 13, 'NumberOfSymbolicFeatures': 29}
        

In [16]:
# Print the raw OpenML task metadata of every task in the
# "grinsztajn-neuripsdbt22a_unpreprocessed" custom suite.
for task_id in tabular_data_experiments.utils.suites.CUSTOM_SUITES["grinsztajn-neuripsdbt22a_unpreprocessed"]:
    print(all_tasks[task_id])

{'tid': 361500, 'ttid': <TaskType.SUPERVISED_CLASSIFICATION: 1>, 'did': 151, 'name': 'electricity', 'task_type': 'Supervised Classification', 'status': 'active', 'estimation_procedure': '4-fold Crossvalidation', 'source_data': '151', 'target_feature': 'class', 'MajorityClassSize': 26075, 'MaxNominalAttDistinctValues': 7, 'MinorityClassSize': 19237, 'NumberOfClasses': 2, 'NumberOfFeatures': 9, 'NumberOfInstances': 45312, 'NumberOfInstancesWithMissingValues': 0, 'NumberOfMissingValues': 0, 'NumberOfNumericFeatures': 7, 'NumberOfSymbolicFeatures': 2}
{'tid': 361516, 'ttid': <TaskType.SUPERVISED_CLASSIFICATION: 1>, 'did': 4134, 'name': 'Bioresponse', 'task_type': 'Supervised Classification', 'status': 'active', 'estimation_procedure': '4-fold Crossvalidation', 'source_data': '4134', 'target_feature': 'target', 'MajorityClassSize': 2034, 'MaxNominalAttDistinctValues': 2, 'MinorityClassSize': 1717, 'NumberOfClasses': 2, 'NumberOfFeatures': 1777, 'NumberOfInstances': 3751, 'NumberOfInstancesW