In [1]:
import numpy as np
import pandas as pd
import sklearn.ensemble
import sklearn.model_selection
import sklearn.metrics
import sklearn.pipeline
import sklearn.preprocessing
import openml

In [2]:
# Ask OpenML for the listing of all datasets carrying the 'OpenML-CC18' tag.
# The cell below iterates over this object and treats each item as a dataset
# id (presumably the listing is keyed by id -- confirm for the installed
# openml version).
datasets = openml.datasets.list_datasets(
    tag='OpenML-CC18',
)

In [3]:
# For every listed dataset, fit a random forest on each feature in isolation
# and record the best hold-out accuracy that any single feature achieves.
# Datasets where one feature alone reaches perfect accuracy are tagged on the
# OpenML server as 'single_feature_predictable'.

# Fixed seed so that the train/test split and the forests are reproducible
# under Restart & Run All.
RANDOM_SEED = 0

max_score_per_dataset = {}
for dataset_id in datasets:
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y = dataset.get_data(target=dataset.default_target_attribute)
    n_features = X.shape[1]
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=RANDOM_SEED)

    scores = []
    # Column index behind each entry of `scores`. Needed because skipped
    # features would otherwise shift list positions away from column indices.
    scored_features = []
    for feat_idx in range(n_features):
        X_train_ = X_train[:, feat_idx].reshape((-1, 1))
        X_test_ = X_test[:, feat_idx].reshape((-1, 1))
        # NOTE(review): sklearn.preprocessing.Imputer was removed in
        # scikit-learn 0.22; on newer versions use
        # sklearn.impute.SimpleImputer(strategy='median') instead.
        imp = sklearn.preprocessing.Imputer(strategy='median')
        X_train_ = imp.fit_transform(X_train_)
        X_test_ = imp.transform(X_test_)
        if X_train_.shape[1] == 0:
            # The imputer returned no columns (presumably the feature was
            # entirely missing in the training split), so nothing to fit.
            continue
        rf = sklearn.ensemble.RandomForestClassifier(random_state=RANDOM_SEED)
        rf.fit(X_train_, y_train)
        y_hat = rf.predict(X_test_)
        scores.append(sklearn.metrics.accuracy_score(y_test, y_hat))
        scored_features.append(feat_idx)

    if not scores:
        # No usable feature at all: np.max / np.argmax would raise on an
        # empty list, so leave this dataset out of the results.
        continue

    best_pos = int(np.argmax(scores))
    best_score = scores[best_pos]

    # Tag once per dataset (not once per perfect feature) to avoid sending
    # duplicate tag requests to the server.
    if best_score == 1.00:
        openml.utils._tag_entity('dataset', dataset_id, 'single_feature_predictable')

    max_score_per_dataset[dataset_id] = {
        'score': best_score,
        # Report the real column index of the best feature, not its position
        # in the (possibly shorter) list of scored features.
        'argmax': scored_features[best_pos],
        'name': dataset.name
    }

# One row per dataset id, with columns 'score', 'argmax' and 'name'.
results = pd.DataFrame(max_score_per_dataset).transpose()

In [4]:
# Show the per-dataset results ordered by ascending best single-feature
# accuracy (lowest scores first).
results.sort_values('score')

Unnamed: 0,argmax,name,score
1493,5,one-hundred-plants-texture,0.04
40923,911,Devnagari-Script,0.0513043
1491,12,one-hundred-plants-margin,0.0525
1492,46,one-hundred-plants-shape,0.075
40971,9,collins,0.112
40927,2049,CIFAR_10,0.174867
6,12,letter,0.1756
1468,545,cnae-9,0.192593
22,28,mfeat-zernike,0.196
300,583,isolet,0.196923
