In [35]:
import numpy as np
import pandas as pd
import sklearn.ensemble
import sklearn.model_selection
import sklearn.metrics
import sklearn.pipeline
import sklearn.preprocessing
import openml

In [7]:
# Fetch the OpenML listing of datasets carrying the 'OpenML100' tag.
# The benchmark loop below iterates over this object and passes each
# element to openml.datasets.get_dataset as a dataset id.
datasets = openml.datasets.list_datasets(tag='OpenML100')

In [40]:
# Single-feature baseline: for every dataset in `datasets`, train one random
# forest per feature (using that feature alone) and record the feature whose
# model reaches the highest accuracy on a held-out split.

# Fixed seed so the train/test split and the forests are identical on every
# Restart & Run All; without it the table below changes from run to run.
RANDOM_SEED = 0

max_score_per_dataset = {}
for dataset_id in datasets:
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y = dataset.get_data(target=dataset.default_target_attribute)
    n_features = X.shape[1]
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=RANDOM_SEED)

    scores = []
    # Column index in X of each entry of `scores`. Features can be skipped
    # below, so a position in `scores` is not necessarily a column index.
    scored_features = []
    for feat_idx in range(n_features):
        # Keep the column 2-D: the estimators expect (n_samples, n_features).
        X_train_ = X_train[:, feat_idx].reshape((-1, 1))
        X_test_ = X_test[:, feat_idx].reshape((-1, 1))
        imp = sklearn.preprocessing.Imputer(strategy='median')
        X_train_ = imp.fit_transform(X_train_)
        if X_train_.shape[1] == 0:
            # The imputer returned no column at all (presumably because the
            # feature has no observed value in the training split), so there
            # is nothing to train on.
            continue
        X_test_ = imp.transform(X_test_)
        rf = sklearn.ensemble.RandomForestClassifier(random_state=RANDOM_SEED)
        rf.fit(X_train_, y_train)
        y_hat = rf.predict(X_test_)
        scores.append(sklearn.metrics.accuracy_score(y_test, y_hat))
        scored_features.append(feat_idx)

    if scores:
        best_pos = int(np.argmax(scores))
        best_score = scores[best_pos]
        # Map the position in `scores` back to the real column index.
        best_feature = scored_features[best_pos]
    else:
        # No feature was usable; keep the dataset in the table with a missing
        # score rather than crashing on np.max of an empty list.
        best_score = np.nan
        best_feature = None

    max_score_per_dataset[dataset_id] = {
        'score': best_score,
        'argmax': best_feature,
        'name': dataset.name
    }

# One row per dataset id; columns are 'argmax', 'name' and 'score'.
results = pd.DataFrame(max_score_per_dataset).transpose()

In [43]:
# Display the per-dataset results ordered by best single-feature accuracy,
# lowest score first (sort_values sorts ascending by default).
results.sort_values(by='score')

Unnamed: 0,argmax,name,score
1493,19,one-hundred-plants-texture,0.0475
1491,42,one-hundred-plants-margin,0.065
1492,55,one-hundred-plants-shape,0.085
6,10,letter,0.1668
1468,704,cnae-9,0.181481
1515,694,micro-mass,0.181818
300,583,isolet,0.19641
1501,156,semeion,0.203008
469,3,analcatdata_dmft,0.205
22,5,mfeat-zernike,0.226
