In [1]:
%matplotlib inline
import openml as oml
import seaborn as sns
import numpy as np
import pandas as pd
import sys
import math
from scipy.stats import norm
from matplotlib import pyplot
import sklearn.tree
import sklearn.ensemble
import sklearn.preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Step 1: Get datasets from OpenML
- Only classification datasets
- Only active (verified) datasets

In [2]:
# Get all OpenML datasets
openml_list = oml.datasets.list_datasets() # Returns a dict
datalist = pd.DataFrame.from_dict(openml_list, orient='index') # Transform to pandas
datalist = datalist[datalist.status == 'active'] # Only use active (verified) datasets
datalist = datalist[datalist.NumberOfClasses>=2] # Only classification
print("{} active classification datasets".format(len(datalist)))

1016 active classification datasets


In [3]:
# Bookkeeping
data_names = {k: v for (k, v) in datalist[['did','name']].values} # dataset names
data_status = {k: 'OK' for k in datalist.index} # dataset status (OK or reason for removal)
datalist_full = datalist.copy()

## Step 1: Apply simple preconditions
- Number of observations larger than 500 (meaningful evaluations)
- Number of observations smaller than 100000 (keep runtime manageable)
- Number of features does not exceed 5000 (keep runtime manageable)
- The ratio of the minority class and the majority class is > 0.05 (severely imbalanced datasets complicate analysis)
- Number of values for categorical features must not exceed 100 (severely slows down some algorithms)
- Sparsely formatted data (requires special data readers)

In [4]:
# Apply preconditions
data_status.update({k: 'Too small' for k in datalist.index[datalist.NumberOfInstances<500]})
data_status.update({k: 'Too large' for k in datalist.index[datalist.NumberOfInstances>100000]})
data_status.update({k: 'High-dimensional' for k in datalist.index[datalist.NumberOfFeatures>5000]})
data_status.update({k: 'Extreme imbalance' for k in datalist.index[datalist.MinorityClassSize / datalist.MajorityClassSize < 0.05]})
data_status.update({k: 'Too many categories' for k in datalist.index[datalist.MaxNominalAttDistinctValues > 100]})
data_status.update({k: 'Sparse format' for k in datalist.index[datalist.format == 'Sparse_ARFF']})


# Filter dataset list
datalist = datalist[pd.Series({k:(v=='OK') for k,v in data_status.items()})] 

# Status update
[[x,list(data_status.values()).count(x)] for x in set(data_status.values())]

[['Too many categories', 22],
 ['Too small', 335],
 ['Too large', 102],
 ['Sparse format', 32],
 ['High-dimensional', 60],
 ['Extreme imbalance', 176],
 ['OK', 289]]

## Step 2: Filter out special datasets
- Artificial datasets (may bias the results)
- Time series dataset (cannot use random sampling for evaluation)
- Text data (contains string features which need additional preprocessing)
- Multilabel data (multiple targets need to be predicted)
- Derived versions of datasets (with additional preprocessing)
- Datasets where the intended classification target is unclear
- Binarized regression problems
- Unknown origin (no description how data was collected and what the problem is)
- Grouped data (instances form groups (blocks) and can't be randomly sampled)

In [5]:
# Get lists of special datasets
artificial_set = set(oml.datasets.list_datasets(tag="artificial").keys()) # Artificial datasets
timeseries_set = set(oml.datasets.list_datasets(tag="time_series").keys()) # Time series datasets
text_set = set(oml.datasets.list_datasets(tag="text_data").keys()) # Text dataset (contains string features)
multilabel_set = set(oml.datasets.list_datasets(tag="multi_label").keys()) # Multi-label datasets
derived_set = set(oml.datasets.list_datasets(tag="derived").keys()) # Derived datasets
unspecified_set = set(oml.datasets.list_datasets(tag="unspecified_target_feature").keys()) # Unspecified target
binarized_set = set(oml.datasets.list_datasets(tag="binarized_regression_problem").keys()) # Binarized data
unknown_set = set(oml.datasets.list_datasets(tag="origin_unknown").keys())
grouped_set = set(oml.datasets.list_datasets(tag="grouped_data").keys())

data_status.update({k: 'Artificial data' for k in artificial_set})
data_status.update({k: 'Time series data' for k in timeseries_set})
data_status.update({k: 'Text data' for k in text_set})
data_status.update({k: 'Multi-label data' for k in multilabel_set})
data_status.update({k: 'Derived (non-original) data' for k in derived_set})
data_status.update({k: 'Unspecified target feature' for k in unspecified_set})
data_status.update({k: 'Binarized regression problem' for k in binarized_set})
data_status.update({k: 'Unknown origin' for k in unknown_set})
data_status.update({k: 'Grouped data' for k in grouped_set})
#data_status.update({k: 'OpenML100' for k in openml100_set})

# Filter dataset list
datalist = datalist[pd.Series({k:(v=='OK') for k,v in data_status.items()})] 

# Status update
[[x,list(data_status.values()).count(x)] for x in set(data_status.values())]

[['Too many categories', 12],
 ['Artificial data', 196],
 ['Too small', 334],
 ['Unknown origin', 7],
 ['Too large', 25],
 ['Unspecified target feature', 6],
 ['Binarized regression problem', 84],
 ['Derived (non-original) data', 17],
 ['Sparse format', 32],
 ['Multi-label data', 5],
 ['Grouped data', 1],
 ['Time series data', 4],
 ['Extreme imbalance', 112],
 ['High-dimensional', 60],
 ['Text data', 1],
 ['OK', 143]]

## Step 3: Remove alternative versions of datasets
- Remove binarized versions of multi-class datasets
- Check other possible duplicates

In [6]:
# Sorting makes things easier
# We need the full list because there may be binarized versions of already removed datasets
datalist_full = datalist_full.sort_values(by=['name','NumberOfClasses'], ascending=[True, False])

data_unique = {}
for index, row in datalist_full.iterrows():
    if row['name'] not in data_unique:
        data_unique[row['name']] = row
    else:
        previous = data_unique[row['name']]
        if previous['NumberOfClasses'] > 2 and row['NumberOfClasses'] == 2:
            data_status[row['did']] = 'Binarized version of multiclass dataset'
        elif data_status[row['did']] == 'OK':
            data_status[row['did']] = 'Possible duplicate'

# Filter dataset list
datalist = datalist[pd.Series({k:(v=='OK') for k,v in data_status.items()})] 
               
# Status update
[[x,list(data_status.values()).count(x)] for x in set(data_status.values())]

[['Too many categories', 11],
 ['Artificial data', 195],
 ['Too small', 294],
 ['Unknown origin', 7],
 ['Unspecified target feature', 6],
 ['Too large', 25],
 ['Binarized regression problem', 82],
 ['Derived (non-original) data', 16],
 ['Sparse format', 31],
 ['Multi-label data', 5],
 ['Binarized version of multiclass dataset', 81],
 ['Possible duplicate', 9],
 ['Extreme imbalance', 111],
 ['Grouped data', 1],
 ['High-dimensional', 60],
 ['Time series data', 4],
 ['Text data', 1],
 ['OK', 100]]

In [7]:
# These need to be checked
[k for k,v in data_status.items() if v=='Possible duplicate']

[40979, 40984, 40994, 948, 1220, 1568, 1590, 40595, 40983]

## Step 3: Remove trivial datasets
- See if a model (e.g. random forest) based on 1 feature can get perfect CV performance

In [11]:
datasets = [k for k,v in data_status.items() if v=='OK']

max_score_per_dataset = {}
for dataset_id in datasets:
    dataset = oml.datasets.get_dataset(dataset_id)
    X, y = dataset.get_data(target=dataset.default_target_attribute)
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    n_features = X.shape[1]
    scores = []
    for feat_idx in range(n_features):
        try:
            X1 = X[:, feat_idx].reshape((-1, 1))
            clf = make_pipeline(sklearn.preprocessing.Imputer(strategy='median'), sklearn.ensemble.RandomForestClassifier())
            scores.append(cross_val_score(clf, X1, y, cv=cv))
        except ValueError:
            continue
    
    max_score_per_dataset[dataset_id] = {
        'score': np.max(scores), 
        'argmax': np.argmax(scores),
        'name': dataset.name
    }
    if max_score_per_dataset[dataset_id]["score"] > 0.99:
        data_status[dataset_id] = 'Too easy'
        print("Dataset ", dataset.name, "is too easy.")
    
results = pd.DataFrame(max_score_per_dataset).transpose()

Dataset  mushroom is too easy.
Dataset  irish is too easy.
Dataset  cjs is too easy.


In [14]:
results.sort_values(by='score')

Unnamed: 0,argmax,name,score
40923,7076,Devnagari-Script,0.0538043
1493,51,one-hundred-plants-texture,0.075
1491,422,one-hundred-plants-margin,0.0875
1492,163,one-hundred-plants-shape,0.1125
40971,95,collins,0.14
6,139,letter,0.1805
40927,21092,CIFAR_10,0.1855
1501,64,semeion,0.2
300,5834,isolet,0.207692
1468,3863,cnae-9,0.222222


### Results
Final list of selected datasets:

In [15]:
final_datasets = [k for k,v in data_status.items() if v=='OK']
print('{} datasets selected'.format(len(final_datasets)))
{k:v for k,v in data_names.items() if k in final_datasets}

97 datasets selected


{3: 'kr-vs-kp',
 6: 'letter',
 11: 'balance-scale',
 12: 'mfeat-factors',
 14: 'mfeat-fourier',
 15: 'breast-w',
 16: 'mfeat-karhunen',
 18: 'mfeat-morphological',
 20: 'mfeat-pixel',
 22: 'mfeat-zernike',
 23: 'cmc',
 28: 'optdigits',
 29: 'credit-approval',
 31: 'credit-g',
 32: 'pendigits',
 36: 'segment',
 37: 'diabetes',
 38: 'sick',
 42: 'soybean',
 44: 'spambase',
 46: 'splice',
 50: 'tic-tac-toe',
 54: 'vehicle',
 60: 'waveform-5000',
 151: 'electricity',
 182: 'satimage',
 188: 'eucalyptus',
 300: 'isolet',
 307: 'vowel',
 375: 'JapaneseVowels',
 377: 'synthetic_control',
 458: 'analcatdata_authorship',
 469: 'analcatdata_dmft',
 470: 'profb',
 554: 'mnist_784',
 1036: 'sylva_agnostic',
 1038: 'gina_agnostic',
 1043: 'ada_agnostic',
 1046: 'mozilla4',
 1049: 'pc4',
 1050: 'pc3',
 1053: 'jm1',
 1063: 'kc2',
 1067: 'kc1',
 1068: 'pc1',
 1120: 'MagicTelescope',
 1176: 'Internet-Advertisements',
 1461: 'bank-marketing',
 1462: 'banknote-authentication',
 1464: 'blood-transfusion-s

Passed all tests, but not in OpenML100:

In [16]:
openml100_set = set(oml.datasets.list_datasets(tag="OpenML100").keys()) # OpenML100

new_datasets = [k for k,v in data_status.items() if v=='OK' and k not in openml100_set]
{k:v for k,v in data_names.items() if k in new_datasets}

{23517: 'numerai28.6',
 40645: 'GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_EDM-1_1',
 40646: 'GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1',
 40647: 'GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1',
 40648: 'GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1',
 40649: 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_50_EDM-2_001',
 40650: 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_75_EDM-2_001',
 40670: 'dna',
 40687: 'solar-flare',
 40701: 'churn',
 40705: 'tokyo1',
 40922: 'Run_or_walk_information',
 40923: 'Devnagari-Script',
 40927: 'CIFAR_10',
 40966: 'MiceProtein',
 40971: 'collins',
 40982: 'steel-plates-fault',
 40996: 'Fashion-MNIST'}

Datasets in OpenML100 that did not pass all tests:

In [None]:
new_datasets = [k for k,v in data_status.items() if k in openml100_set and v!='OK']
{k:v for k,v in data_names.items() if k in new_datasets}