In [3]:
import numpy as np

import openml
import pandas as pd

In [21]:
from tqdm import tqdm

from datasets import load_openml_list, test_dids_classification, valid_large_classification, open_cc_dids, open_cc_valid_dids


In [5]:
%load_ext autoreload

%autoreload 2

### Prepare test datasets

In [6]:
renamer = {'name': 'Name', 'NumberOfFeatures': '# Features', 'NumberOfSymbolicFeatures': '# Categorical Features', 'NumberOfInstances': '# Instances', 'NumberOfMissingValues': '# NaNs', 'NumberOfClasses': '# Classes', 'MinorityClassSize': 'Minority Class Size'}


In [11]:
openml.study.list_suites()

  openml.study.list_suites()


{99: {'id': 99,
  'alias': 'OpenML-CC18',
  'main_entity_type': 'task',
  'name': 'OpenML-CC18 Curated Classification benchmark',
  'status': 'active',
  'creation_date': '2019-02-21 18:47:13',
  'creator': 1},
 225: {'id': 225,
  'alias': 'OpenML-friendly',
  'main_entity_type': 'task',
  'name': 'OpenML100-friendly',
  'status': 'active',
  'creation_date': '2019-09-16 19:41:46',
  'creator': 1},
 353: {'id': 353,
  'alias': '8f0ea660163b436bbd4abd49665c7b1d',
  'main_entity_type': 'task',
  'name': 'OpenML-CTR23 - A curated tabular regression benchmarking suite',
  'status': 'active',
  'creation_date': '2023-05-31 16:39:49',
  'creator': 30127}}

In [4]:
suite = openml.study.get_suite(suite_id=99)
tasks = openml.tasks.list_tasks(output_format="dataframe")

In [6]:
# Retrieve task IDs from the suite
suite_task_ids = suite.tasks

# Filter the tasks DataFrame to only include tasks from the suite
suite_tasks = tasks[tasks['tid'].isin(suite_task_ids)]

print(suite_tasks.head())

print(suite_tasks.info())

print(suite_tasks.describe())

    tid                                ttid  did           name  \
1     3  TaskType.SUPERVISED_CLASSIFICATION    3       kr-vs-kp   
4     6  TaskType.SUPERVISED_CLASSIFICATION    6         letter   
9    11  TaskType.SUPERVISED_CLASSIFICATION   11  balance-scale   
10   12  TaskType.SUPERVISED_CLASSIFICATION   12  mfeat-factors   
12   14  TaskType.SUPERVISED_CLASSIFICATION   14  mfeat-fourier   

                    task_type  status     estimation_procedure  \
1   Supervised Classification  active  10-fold Crossvalidation   
4   Supervised Classification  active  10-fold Crossvalidation   
9   Supervised Classification  active  10-fold Crossvalidation   
10  Supervised Classification  active  10-fold Crossvalidation   
12  Supervised Classification  active  10-fold Crossvalidation   

   evaluation_measures source_data target_feature  ...  \
1                  NaN           3          class  ...   
4                  NaN           6          class  ...   
9                  NaN    

In [None]:


# Loop through the suite task IDs and fetch the dataset for each task
for task_id in suite_task_ids:
    task = openml.tasks.get_task(task_id)
    dataset = task.get_dataset()
    
    # Now you can work with the dataset object, e.g., get the data as a pandas DataFrame
    data, _, _, _ = dataset.get_data(target=dataset.default_target_attribute)
    print(f"Task ID: {task_id}")
    print(data.head())  # Print the first few rows of the dataset



Task ID: 3
  bkblk bknwy bkon8 bkona bkspr bkxbq bkxcr bkxwp blxwp bxqsq  ... skrxp  \
0     f     f     f     f     f     f     f     f     f     f  ...     f   
1     f     f     f     f     t     f     f     f     f     f  ...     f   
2     f     f     f     f     t     f     t     f     f     f  ...     f   
3     f     f     f     f     f     f     f     f     t     f  ...     f   
4     f     f     f     f     f     f     f     f     f     f  ...     f   

  spcop stlmt thrsk wkcti wkna8 wknck wkovl wkpos wtoeg  
0     f     f     f     f     f     f     t     t     n  
1     f     f     f     f     f     f     t     t     n  
2     f     f     f     f     f     f     t     t     n  
3     f     f     f     f     f     f     t     t     n  
4     f     f     f     f     f     f     t     t     n  

[5 rows x 36 columns]




Task ID: 6
   x-box  y-box  width  high  onpix  x-bar  y-bar  x2bar  y2bar  xybar  x2ybr  \
0      2      4      4     3      2      7      8      2      9     11      7   
1      4      7      5     5      5      5      9      6      4      8      7   
2      7     10      8     7      4      8      8      5     10     11      2   
3      4      9      5     7      4      7      7     13      1      7      6   
4      6      7      8     5      4      7      6      3      7     10      7   

   xy2br  x-ege  xegvy  y-ege  yegvx  
0      7      1      8      5      6  
1      9      2      9      7     10  
2      8      2      5      5     10  
3      8      3      8      0      8  
4      9      3      8      3      7  
Task ID: 11
   left-weight  left-distance  right-weight  right-distance
0          1.0            1.0           1.0             1.0
1          1.0            1.0           1.0             2.0
2          1.0            1.0           1.0             3.0
3          1.0  



Task ID: 12
   att1  att2  att3  att4  att5  att6  att7  att8  att9  att10  ...  att207  \
0    98   236   531   673   607   647     2     9     3      6  ...     474   
1   121   193   607   611   585   665     7     9     2      4  ...     520   
2   115   141   590   605   557   627    12     6     3      3  ...     535   
3    90   122   627   692   607   642     0     6     4      5  ...     576   
4   157   167   681   666   587   666     8     6     1      4  ...     594   

   att208  att209  att210  att211  att212  att213  att214  att215  att216  
0     536     628     632      18      36       8      15      12      13  
1     458     570     634      15      32      11      13      15      11  
2     498     572     656      20      35      16      14      13       6  
3     549     628     621      16      35       7      12      15       9  
4     525     568     653      16      35      10      15      13      13  

[5 rows x 216 columns]
Task ID: 14
       att1      att2



Task ID: 15
   Clump_Thickness  Cell_Size_Uniformity  Cell_Shape_Uniformity  \
0              5.0                   1.0                    1.0   
1              5.0                   4.0                    4.0   
2              3.0                   1.0                    1.0   
3              6.0                   8.0                    8.0   
4              4.0                   1.0                    1.0   

   Marginal_Adhesion  Single_Epi_Cell_Size  Bare_Nuclei  Bland_Chromatin  \
0                1.0                   2.0          1.0              3.0   
1                5.0                   7.0         10.0              3.0   
2                1.0                   2.0          2.0              3.0   
3                1.0                   3.0          4.0              3.0   
4                3.0                   2.0          1.0              3.0   

   Normal_Nucleoli  Mitoses  
0              1.0      1.0  
1              2.0      1.0  
2              1.0      1.0  
3       



Task ID: 18
   att1  att2  att3        att4      att5         att6
0   1.0   0.0   0.0  133.150861  1.311693  1620.221779
1   1.0   0.0   0.0  126.724861  1.302745  1609.334822
2   1.0   0.0   0.0  131.173861  1.319031  1568.978435
3   1.0   0.0   0.0  129.478861  1.270878  1695.055281
4   1.0   0.0   0.0  127.262861  1.329637  1647.720235
Task ID: 22
       att1      att2       att3       att4        att5        att6  \
0  0.011033  0.831466  15.351804  75.806559  171.554214  490.156556   
1  0.038271  1.166746  10.526913  42.369276   85.187116  420.360566   
2  0.042698  1.225007   8.273804  31.744786   54.448177  404.103204   
3  0.032418  1.638247  19.205283  51.196682   57.181760  429.052011   
4  0.015866  0.611561   8.627839  37.325052   48.509025  459.909634   

         att7      att8      att9      att10  ...     att38      att39  \
0  206.416027  0.122135  2.601646  11.472709  ...  1.713529  33.810340   
1  253.569574  0.033657  0.390566  11.700830  ...  2.590208  35.400531 



   Wifes_age Wifes_education Husbands_education  Number_of_children_ever_born  \
0         24               2                  3                             3   
1         45               1                  3                            10   
2         43               2                  3                             7   
3         42               3                  2                             9   
4         36               3                  3                             8   

  Wifes_religion Wifes_now_working%3F Husbands_occupation  \
0              1                    1                   2   
1              1                    1                   3   
2              1                    1                   3   
3              1                    1                   3   
4              1                    1                   3   

  Standard-of-living_index Media_exposure  
0                        3              0  
1                        4              0  
2             



Task ID: 28
   input1  input2  input3  input4  input5  input6  input7  input8  input9  \
0     0.0     1.0     6.0    15.0    12.0     1.0     0.0     0.0     0.0   
1     0.0     0.0    10.0    16.0     6.0     0.0     0.0     0.0     0.0   
2     0.0     0.0     8.0    15.0    16.0    13.0     0.0     0.0     0.0   
3     0.0     0.0     0.0     3.0    11.0    16.0     0.0     0.0     0.0   
4     0.0     0.0     5.0    14.0     4.0     0.0     0.0     0.0     0.0   

   input10  ...  input55  input56  input57  input58  input59  input60  \
0      7.0  ...      0.0      0.0      0.0      0.0      6.0     14.0   
1      7.0  ...      3.0      0.0      0.0      0.0     10.0     16.0   
2      1.0  ...      0.0      0.0      0.0      0.0      9.0     14.0   
3      0.0  ...      0.0      0.0      0.0      0.0      0.0      1.0   
4      0.0  ...     12.0      0.0      0.0      0.0      4.0     12.0   

   input61  input62  input63  input64  
0      7.0      1.0      0.0      0.0  
1     



Task ID: 31
  checking_status  duration                  credit_history  \
0              <0         6  critical/other existing credit   
1        0<=X<200        48                   existing paid   
2     no checking        12  critical/other existing credit   
3              <0        42                   existing paid   
4              <0        24              delayed previously   

               purpose  credit_amount    savings_status employment  \
0             radio/tv         1169.0  no known savings        >=7   
1             radio/tv         5951.0              <100     1<=X<4   
2            education         2096.0              <100     4<=X<7   
3  furniture/equipment         7882.0              <100     4<=X<7   
4              new car         4870.0              <100     1<=X<4   

   installment_commitment     personal_status other_parties  residence_since  \
0                       4         male single          none                4   
1                       2  f



Task ID: 32
   input1  input2  input3  input4  input5  input6  input7  input8  input9  \
0    47.0   100.0    27.0    81.0    57.0    37.0    26.0     0.0     0.0   
1     0.0    89.0    27.0   100.0    42.0    75.0    29.0    45.0    15.0   
2     0.0    57.0    31.0    68.0    72.0    90.0   100.0   100.0    76.0   
3     0.0   100.0     7.0    92.0     5.0    68.0    19.0    45.0    86.0   
4     0.0    67.0    49.0    83.0   100.0   100.0    81.0    80.0    60.0   

   input10  input11  input12  input13  input14  input15  input16  
0     23.0     56.0     53.0    100.0     90.0     40.0     98.0  
1     15.0     37.0      0.0     69.0      2.0    100.0      6.0  
2     75.0     50.0     51.0     28.0     25.0     16.0      0.0  
3     34.0    100.0     45.0     74.0     23.0     67.0      0.0  
4     60.0     40.0     40.0     33.0     20.0     47.0      0.0  
Task ID: 37
   preg   plas  pres  skin   insu  mass   pedi   age
0   6.0  148.0  72.0  35.0    0.0  33.6  0.627  50.0
1   1



Task ID: 43
   word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0            0.00               0.64           0.64           0.0   
1            0.21               0.28           0.50           0.0   
2            0.06               0.00           0.71           0.0   
3            0.00               0.00           0.00           0.0   
4            0.00               0.00           0.00           0.0   

   word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
0           0.32            0.00              0.00                0.00   
1           0.14            0.28              0.21                0.07   
2           1.23            0.19              0.19                0.12   
3           0.63            0.00              0.31                0.63   
4           0.63            0.00              0.31                0.63   

   word_freq_order  word_freq_mail  ...  word_freq_conference  char_freq_%3B  \
0             0.00            0.00  ...         



Task ID: 45
  attribute_1 attribute_2 attribute_3 attribute_4 attribute_5 attribute_6  \
0           C           C           A           G           C           T   
1           A           G           A           C           C           C   
2           G           A           G           G           T           G   
3           G           G           G           C           T           G   
4           G           C           T           C           A           G   

  attribute_7 attribute_8 attribute_9 attribute_10  ... attribute_51  \
0           G           C           A            T  ...            A   
1           G           C           C            G  ...            G   
2           A           A           G            G  ...            C   
3           C           G           T            T  ...            G   
4           C           C           C            C  ...            C   

  attribute_52 attribute_53 attribute_54 attribute_55 attribute_56  \
0            G        



Task ID: 53
   COMPACTNESS  CIRCULARITY  DISTANCE_CIRCULARITY  RADIUS_RATIO  \
0           95           48                    83         178.0   
1           91           41                    84         141.0   
2          104           50                   106         209.0   
3           93           41                    82         159.0   
4           85           44                    70         205.0   

   PR.AXIS_ASPECT_RATIO  MAX.LENGTH_ASPECT_RATIO  SCATTER_RATIO  \
0                    72                       10          162.0   
1                    57                        9          149.0   
2                    66                       10          207.0   
3                    63                        9          144.0   
4                   103                       52          149.0   

   ELONGATEDNESS  PR.AXIS_RECTANGULARITY  MAX.LENGTH_RECTANGULARITY  \
0             42                      20                        159   
1             45                      19



Task ID: 219
   date day    period  nswprice  nswdemand  vicprice  vicdemand  transfer
0   0.0   2  0.000000  0.056443   0.439155  0.003467   0.422915  0.414912
1   0.0   2  0.021277  0.051699   0.415055  0.003467   0.422915  0.414912
2   0.0   2  0.042553  0.051489   0.385004  0.003467   0.422915  0.414912
3   0.0   2  0.063830  0.045485   0.314639  0.003467   0.422915  0.414912
4   0.0   2  0.085106  0.042482   0.251116  0.003467   0.422915  0.414912




Task ID: 2074
      Aattr     Battr     Cattr     Dattr     Eattr     Fattr    A1attr  \
0  0.117596  1.241362  1.184036  0.815302 -0.158561  1.256483  1.193546   
1 -1.205362 -1.249654 -0.077532  0.444886 -0.895959 -0.447579 -0.786760   
2  0.779075  0.148811  0.042617 -0.243030  0.800057  0.164136  0.053370   
3  1.146564  0.585831  0.342991  0.021553  0.947536  0.601074  0.353416   
4 -0.764376 -1.162250 -0.137607  0.180303 -0.969698 -1.146681 -0.126658   

     B2attr    C3attr    D4attr  ...   C21attr   D22attr   E23attr   F24attr  \
0  0.818486 -0.141965  0.879481  ...  1.251179  0.807707 -0.069968  1.219160   
1 -0.554203 -0.364672  0.092157  ... -0.614884 -0.192752 -0.736996 -0.969292   
2 -0.448612  0.154978 -0.345245  ... -0.915862 -0.877277  0.671174 -0.006373   
3  0.026550  1.788164  1.010702  ...  0.528832  0.281150  1.412317  1.044084   
4  0.184937 -0.735851 -1.132569  ... -0.795470 -0.192752 -0.885225 -1.231906   

    A25attr   B26attr   C27attr   D28attr   E29attr   



Task ID: 3021
    age sex on_thyroxine query_on_thyroxine on_antithyroid_medication sick  \
0  41.0   F            f                  f                         f    f   
1  23.0   F            f                  f                         f    f   
2  46.0   M            f                  f                         f    f   
3  70.0   F            t                  f                         f    f   
4  70.0   F            f                  f                         f    f   

  pregnant thyroid_surgery I131_treatment query_hypothyroid  ...   T3  \
0        f               f              f                 f  ...  2.5   
1        f               f              f                 f  ...  2.0   
2        f               f              f                 f  ...  NaN   
3        f               f              f                 f  ...  1.9   
4        f               f              f                 f  ...  1.2   

  TT4_measured    TT4 T4U_measured   T4U FTI_measured    FTI  TBG_measured  \




Task ID: 3481
       f1      f2      f3      f4      f5      f6      f7      f8      f9  \
0 -0.4394 -0.0930  0.1718  0.4620  0.6226  0.4704  0.3578  0.0478 -0.1184   
1 -0.4348 -0.1198  0.2474  0.4036  0.5026  0.6328  0.4948  0.0338 -0.0520   
2 -0.2330  0.2124  0.5014  0.5222 -0.3422 -0.5840 -0.7168 -0.6342 -0.8614   
3 -0.3808 -0.0096  0.2602  0.2554 -0.4290 -0.6746 -0.6868 -0.6650 -0.8410   
4 -0.3412  0.0946  0.6082  0.6216 -0.1622 -0.3784 -0.4324 -0.4358 -0.4966   

      f10  ...    f608    f609    f610    f611    f612    f613    f614  \
0 -0.2310  ...  0.3334  0.4102  0.2052  0.3846  0.3590  0.5898  0.3334   
1 -0.1302  ...  0.2272  0.0000  0.2954  0.2046  0.4772  0.0454  0.2046   
2 -0.8318  ...  0.0952 -0.1112 -0.0476 -0.1746  0.0318 -0.0476  0.1112   
3 -0.9614  ...  0.0648 -0.0504 -0.0360 -0.1224  0.1366  0.2950  0.0792   
4 -0.5406  ...  0.2812  0.1562  0.3124  0.2500 -0.0938  0.1562  0.3124   

     f615    f616    f617  
0  0.6410  0.5898 -0.4872  
1  0.4318  0.4546 -0.0



Task ID: 3560
  DMFT.Begin DMFT.End  Gender Ethnic
0          6        3    Male  Black
1          2        1  Female  Black
2          1        0    Male  Black
3          7        2    Male  White
4          3        3  Female  White




Task ID: 3573
   pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  pixel9  \
0       0       0       0       0       0       0       0       0       0   
1       0       0       0       0       0       0       0       0       0   
2       0       0       0       0       0       0       0       0       0   
3       0       0       0       0       0       0       0       0       0   
4       0       0       0       0       0       0       0       0       0   

   pixel10  ...  pixel775  pixel776  pixel777  pixel778  pixel779  pixel780  \
0        0  ...         0         0         0         0         0         0   
1        0  ...         0         0         0         0         0         0   
2        0  ...         0         0         0         0         0         0   
3        0  ...         0         0         0         0         0         0   
4        0  ...         0         0         0         0         0         0   

   pixel781  pixel782  pixel783  pixel784  
0   



Task ID: 3903
   LOC_BLANK  BRANCH_COUNT  CALL_PAIRS  LOC_CODE_AND_COMMENT  LOC_COMMENTS  \
0          2           1.0           0                     0             0   
1          1           1.0           4                     0             0   
2         27          19.0           1                     4            13   
3          2          17.0           2                     0             0   
4          6           1.0           1                     0             2   

   CONDITION_COUNT  CYCLOMATIC_COMPLEXITY  CYCLOMATIC_DENSITY  DECISION_COUNT  \
0              0.0                    1.0                0.10             0.0   
1              0.0                    1.0                0.07             0.0   
2             26.0                   11.0                0.26            12.0   
3             24.0                    9.0                0.47             8.0   
4              0.0                    1.0                0.11             0.0   

   DECISION_DENSITY  ...  MULT



Task ID: 3904
     loc  v(g)  ev(g)  iv(g)      n        v     l      d       i         e  \
0    1.1   1.4    1.4    1.4    1.3     1.30  1.30   1.30    1.30      1.30   
1    1.0   1.0    1.0    1.0    1.0     1.00  1.00   1.00    1.00      1.00   
2   72.0   7.0    1.0    6.0  198.0  1134.13  0.05  20.31   55.85  23029.10   
3  190.0   3.0    1.0    3.0  600.0  4348.76  0.06  17.06  254.87  74202.67   
4   37.0   4.0    1.0    4.0  126.0   599.12  0.06  17.19   34.86  10297.30   

   ...        t  lOCode  lOComment  lOBlank  locCodeAndComment  uniq_Op  \
0  ...     1.30     2.0        2.0      2.0                  2      1.2   
1  ...     1.00     1.0        1.0      1.0                  1      1.0   
2  ...  1279.39    51.0       10.0      8.0                  1     17.0   
3  ...  4122.37   129.0       29.0     28.0                  2     17.0   
4  ...   572.07    28.0        1.0      6.0                  0     11.0   

   uniq_Opnd  total_Op  total_Opnd  branchCount  
0        1



Task ID: 3917
    loc  v(g)  ev(g)  iv(g)      n       v     l      d      i         e  ...  \
0   1.1   1.4    1.4    1.4    1.3    1.30  1.30   1.30   1.30      1.30  ...   
1   1.0   1.0    1.0    1.0    1.0    1.00  1.00   1.00   1.00      1.00  ...   
2  83.0  11.0    1.0   11.0  171.0  927.89  0.04  23.04  40.27  21378.61  ...   
3  46.0   8.0    6.0    8.0  141.0  769.78  0.07  14.86  51.81  11436.73  ...   
4  25.0   3.0    1.0    3.0   58.0  254.75  0.11   9.35  27.25   2381.95  ...   

         t  lOCode  lOComment  lOBlank  locCodeAndComment  uniq_Op  uniq_Opnd  \
0     1.30     2.0          2        2                  2      1.2        1.2   
1     1.00     1.0          1        1                  1      1.0        1.0   
2  1187.70    65.0         10        6                  0     18.0       25.0   
3   635.37    37.0          2        5                  0     16.0       28.0   
4   132.33    21.0          0        2                  0     11.0       10.0   

   total_Op 



In [8]:
# Using ``@`` in `pd.DataFrame.query <
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html>`_
# accesses variables outside of the current dataframe.
tasks = tasks.query("tid in @suite.tasks")

In [9]:
tids = list(tasks[np.logical_and(np.logical_and((tasks.NumberOfInstances <= 2000), (tasks.NumberOfFeatures <= 100))
                                 , (tasks.NumberOfClasses <= 10))].tid)

In [10]:
len(tids)

30

In [11]:
tids = list(tasks[tasks.NumberOfInstances <= 2000].tid)

In [12]:
open_cc_dids = [openml.tasks.get_task(task_id).get_dataset().id for task_id in tids]



In [23]:
open_ml_datasets, open_ml_datasets_df = load_openml_list(test_dids_classification, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = 100000, num_feats=100, return_capped=True)


Number of datasets: 36
Loading kr-vs-kp 3 ..
Loading mfeat-factors 12 ..
Loading credit-g 31 ..
Loading vehicle 54 ..
Loading wine 973 ..
Loading kc1 1067 ..
Loading KDDCup09_appetency 1111 ..
Loading airlines 1169 ..
Loading bank-marketing 1461 ..
Loading blood-transfusion-service-center 1464 ..
Loading cnae-9 1468 ..
Loading nomao 1486 ..
Loading phoneme 1489 ..
Loading adult 1590 ..
Loading covertype 1596 ..
Loading numerai28.6 23517 ..
Loading connect-4 40668 ..
Loading car 40975 ..
Loading Australian 40981 ..
Loading segment 40984 ..
Loading jungle_chess_2pcs_raw_endgame_complete 41027 ..
Loading APSFailure 41138 ..
Loading christine 41142 ..
Loading jasmine 41143 ..
Loading sylvine 41146 ..
Loading albert 41147 ..
Loading MiniBooNE 41150 ..
Loading guillermo 41159 ..
Loading riccardo 41161 ..
Loading dilbert 41163 ..
Loading fabert 41164 ..
Loading robert 41165 ..
Loading volkert 41166 ..
Loading dionis 41167 ..
Loading jannis 41168 ..
Loading helena 41169 ..


In [24]:
open_ml_datasets_df = open_ml_datasets_df[open_ml_datasets_df.NumberOfInstances > 10000]

In [25]:
print_table = open_ml_datasets_df
print_table = print_table[['name', 'NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].copy()
print_table['id'] = print_table.index
print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']] = print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].astype(int)
print_table = print_table.rename(columns=renamer)
print(print_table.to_latex(index=False))

\begin{tabular}{lrrrrrrr}
\toprule
                                  Name &  \# Features &  \# Categorical Features &  \# Instances &  \# Classes &  \# NaNs &  Minority Class Size &    id \\
\midrule
                    KDDCup09\_appetency &         231 &                      39 &        50000 &          2 & 8024152 &                  890 &  1111 \\
                              airlines &           8 &                       5 &       539383 &          2 &       0 &               240264 &  1169 \\
                        bank-marketing &          17 &                      10 &        45211 &          2 &       0 &                 5289 &  1461 \\
                                 nomao &         119 &                      30 &        34465 &          2 &       0 &                 9844 &  1486 \\
                                 adult &          15 &                       9 &        48842 &          2 &    6465 &                11687 &  1590 \\
                             covertype &    

### Prepare Validation datasets

In [26]:
open_cc_datasets, open_cc_datasets_df = load_openml_list(open_cc_dids, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = 2000, num_feats=100, return_capped=True)

def extend_datasets(datasets, filtering = False):
    extended_datasets = {}
    i = 0
    for d in tqdm(datasets):
        if ((not 'NumberOfFeatures' in datasets[d])
                or (not 'NumberOfClasses' in datasets[d])
                or (not 'NumberOfInstances' in datasets[d])
                # or datasets[d]['NumberOfFeatures'] >= num_feats
                or datasets[d]['NumberOfClasses'] <= 0):
            print(datasets[d])
            continue
        ds = openml.datasets.get_dataset(d, download_data=False)
        if filtering and (datasets[d]['NumberOfInstances'] < 150
                          or datasets[d]['NumberOfInstances'] > 2000
                         or datasets[d]['NumberOfFeatures'] > 100
                         or datasets[d]['NumberOfClasses'] > 10):
            continue
        extended_datasets[d] = datasets[d]
        extended_datasets[d].update(ds.qualities)
    
    return extended_datasets

# All datasets
openml_list = openml.datasets.list_datasets()
openml_list = pd.DataFrame.from_dict(openml_list, orient="index")

# Select only classification
openml_list = openml_list[~openml_list['MajorityClassSize'].isna()]

# Remove duplicated datasets
duplicated = openml_list.duplicated(subset=['MajorityClassSize', 'MaxNominalAttDistinctValues', 'MinorityClassSize',
       'NumberOfClasses', 'NumberOfFeatures', 'NumberOfInstances',
       'NumberOfInstancesWithMissingValues', 'NumberOfMissingValues',
       'NumberOfNumericFeatures', 'NumberOfSymbolicFeatures'], keep='first')
openml_list = openml_list[~duplicated]

duplicated = openml_list.duplicated(subset=['name'], keep='first')
openml_list = openml_list[~duplicated]

# Filter out datasets that don't have meta information or Don't fulfill other criteria
openml_list = openml_list.to_dict(orient='index')
openml_list = pd.DataFrame.from_dict(extend_datasets(openml_list, filtering=True), orient="index")

# Filter out datasets in Open CC
openml_list = openml_list[~openml_list.name.apply(lambda x: x in test_datasets_multiclass_df.name.values)]
openml_list['CFI'] = openml_list.apply(lambda x: str(x.NumberOfClasses) + '_' + str(x.NumberOfFeatures) + '_' + str(x.NumberOfInstances), axis = 1)
test_datasets_multiclass_df['CFI'] = test_datasets_multiclass_df.apply(lambda x: str(x.NumberOfClasses) + '_' + str(x.NumberOfFeatures) + '_' + str(x.NumberOfInstances), axis = 1)
openml_list = openml_list[~openml_list.CFI.apply(lambda x: x in test_datasets_multiclass_df.CFI.values)]

# Remove time series and artificial data
openml_list = openml_list[~openml_list.name.apply(lambda x: 'autoUniv' in x)]
openml_list = openml_list[~openml_list.name.apply(lambda x: 'fri_' in x)]
openml_list = openml_list[~openml_list.name.apply(lambda x: 'FOREX' in x)]

# Remove datasets that overlapped with Open CC closely by name
openml_list = openml_list[~openml_list.name.apply(lambda x: 'ilpd' in x)]
openml_list = openml_list[~openml_list.name.apply(lambda x: 'car' in x)]
openml_list = openml_list[~openml_list.name.apply(lambda x: 'pc1' in x)]

# Remove datasets that didn't load
openml_list = openml_list[~openml_list.did.apply(lambda x: x in {1065, 40589, 41496, 770, 43097, 43148, 43255, 43595, 43786, 41701})]

# Remove class skew
openml_list = openml_list[(openml_list.MinorityClassSize / openml_list.MajorityClassSize) > 0.05]
openml_list = openml_list[openml_list.AutoCorrelation != 1]

# Remove too easy
openml_list = openml_list[openml_list.CfsSubsetEval_DecisionStumpAUC != 1]

Number of datasets: 30
Loading balance-scale 11 ..
Loading mfeat-fourier 14 ..
Loading breast-w 15 ..
Loading mfeat-karhunen 16 ..
Loading mfeat-morphological 18 ..
Loading mfeat-zernike 22 ..
Loading cmc 23 ..
Loading credit-approval 29 ..
Loading credit-g 31 ..
Loading diabetes 37 ..
Loading tic-tac-toe 50 ..
Loading vehicle 54 ..
Loading eucalyptus 188 ..
Loading analcatdata_authorship 458 ..
Loading analcatdata_dmft 469 ..
Loading pc4 1049 ..
Loading pc3 1050 ..
Loading kc2 1063 ..
Loading pc1 1068 ..
Loading banknote-authentication 1462 ..
Loading blood-transfusion-service-center 1464 ..
Loading ilpd 1480 ..
Loading qsar-biodeg 1494 ..
Loading wdbc 1510 ..
Loading cylinder-bands 6332 ..
Loading dresses-sales 23381 ..
Loading MiceProtein 40966 ..
Loading car 40975 ..
Loading steel-plates-fault 40982 ..
Loading climate-model-simulation-crashes 40994 ..


100%|██████████| 1459/1459 [00:09<00:00, 147.57it/s]


NameError: name 'test_datasets_multiclass_df' is not defined

In [None]:
print_table = openml_list
print_table = print_table[['name', 'NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].copy()
print_table['id'] = print_table.index
print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']] = print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].astype(int)
print_table = print_table.rename(columns=renamer)
print(print_table.to_latex(index=False))