# Going through OpenML Basic Usage
https://openml.github.io/openml-python/stable/usage.html

In [36]:
import openml
import pandas as pd

In [11]:
# Either put your API key below or in the ~/.openml/config file as
# specified on the OpenML Basic Usage page
# apikey = '<your API key>'
# openml.config.apikey = apikey

In [12]:
import pandas as pd
datasets = openml.datasets.list_datasets()

# NOTE: the line below is different from the Basic Usage 
datasets_df = pd.DataFrame.from_dict(datasets, orient='index')

datasets_df.set_index('did', inplace=True)

## Examining and filtering by metadata

In [13]:
print('Number of datasets:', datasets_df.shape[0])
print(datasets_df.columns)

Number of datasets: 19526
Index(['NumberOfMissingValues', 'name', 'NumberOfClasses',
       'NumberOfInstancesWithMissingValues', 'NumberOfInstances',
       'MajorityClassSize', 'NumberOfSymbolicFeatures', 'format',
       'NumberOfFeatures', 'status', 'MinorityClassSize',
       'NumberOfNumericFeatures', 'MaxNominalAttDistinctValues'],
      dtype='object')


In [14]:
print(datasets_df.iloc[0])

NumberOfMissingValues                      0
name                                  anneal
NumberOfClasses                            6
NumberOfInstancesWithMissingValues         0
NumberOfInstances                        898
MajorityClassSize                        684
NumberOfSymbolicFeatures                  33
format                                  ARFF
NumberOfFeatures                          39
status                                active
MinorityClassSize                          0
NumberOfNumericFeatures                    6
MaxNominalAttDistinctValues               10
Name: 1, dtype: object


In [15]:
print('Number of datasets without any missing values:', \
      datasets_df[datasets_df.NumberOfInstancesWithMissingValues == 0]
      .shape[0])

Number of datasets without any missing values: 19327


In [16]:
print('Number of datasets with numeric features:', \
      datasets_df[(datasets_df.NumberOfNumericFeatures > 0) & 
                  (datasets_df.NumberOfInstancesWithMissingValues == 0)]
      .shape[0])

Number of datasets with numeric features: 19210


In [17]:
print('Number of datasets with numeric features and no symbolic features:', \
      datasets_df[(datasets_df.NumberOfSymbolicFeatures == 0) &
                  (datasets_df.NumberOfNumericFeatures > 0) & 
                  (datasets_df.NumberOfInstancesWithMissingValues == 0)]
      .shape[0])

Number of datasets with numeric features and no symbolic features: 283


## Downloading datasets

In [43]:
# NOTE: Not all dataset_ids seem to work. For example, ids 0 and 1 process with error.
dataset_id = 23
dataset = openml.datasets.get_dataset(6357)

BadNominalValue: Data value not found in nominal declaration, at line 50.

In [19]:
# get just the unlabeled data
X = dataset.get_data()
print('X is simply a numpy array:', type(X))
print('Data shape and type:', X.shape, X.dtype)

X is simply a numpy array: <class 'numpy.ndarray'>
Data shape and type: (1473, 10) float32


In [20]:
# also get the attribute names
X, names = dataset.get_data(return_attribute_names=True)
print(names)

['Wifes_age', 'Wifes_education', 'Husbands_education', 'Number_of_children_ever_born', 'Wifes_religion', 'Wifes_now_working%3F', 'Husbands_occupation', 'Standard-of-living_index', 'Media_exposure', 'Contraceptive_method_used']


In [21]:
# also get the data labels and which attributes are categorical
# NOTE: look into other flags for get_data() (can't find in API)
X, y, categorical = dataset.get_data(
    target=dataset.default_target_attribute,
    return_categorical_indicator=True)

## Doing the same stuff with Tasks

In [22]:
# openml doesn't seem to actually support this
tasks = openml.tasks.list_tasks_by_type(1)

AttributeError: 'module' object has no attribute 'list_tasks_by_type'

In [None]:
tasks = openml.tasks.list_tasks()
tasks_df = pd.DataFrame.from_dict(datasets, orient='index')

In [None]:
tasks_df.iloc[0]

In [None]:
# no 'tid' attribute like the website says
tasks_df.set_index('tid', inplace=True)

In [None]:
# However, there is a 'did' attribute like in the datasets
tasks_df.set_index('did', inplace=True)

It seems we can do the same stuff with tasks that we did with datasets, but I don't think tasks are sufficiently supported at this point to use them over datasets.

## Downloading Many Datasets

In [23]:
import numpy as np
np.random.seed(33)

In [24]:
dataset_ids = datasets_df[datasets_df.NumberOfInstancesWithMissingValues == 0].index.values
dataset_ids_sample = np.random.choice(dataset_ids, 20)
dataset_ids_sample[0] = 23
dataset_ids_sample

array([   23, 27003,   823,   128, 31590, 37642,  4020, 39287, 34865,
        3955,  3478,  1160, 23745, 39472, 35886, 39438, 24472, 33527,
         774, 38509])

In [29]:
from openml.exceptions import OpenMLServerError

In [30]:
import pickle

# dataset_ids = dataset_ids_sample
num_datasets = len(dataset_ids)
good_dataset_ids = []
num_failed_datasets = 0

print('Total number of datasets:', num_datasets)
for i, dataset_id in enumerate(dataset_ids):
    print(i, 'of', num_datasets, '...', 'dataset_id:', dataset_id, '...')
    try:
        dataset = openml.datasets.get_dataset(dataset_id)
        good_dataset_ids.append(dataset_id)
        print('Success')
    except OpenMLServerError:
        num_failed_datasets += 1
        print('Failure')
#     if i + 1 % 100 == 0:
#         with open('good_dataset_ids.pickle', 'rb') as handle:
#             past_good_ids = pickle.load(handle)['ids']
            
#         print(len(past_good_ids), 'encountered in the past')
#         print('Additional', len(good_dataset_ids), 'elements being added to file...'
              
#         with open('good_dataset_ids.pickle', 'wb') as handle:
#             pickle.dump(good_dataset_ids, handle)
    
        
# with open('good_dataset_ids.pickle', 'wb') as handle:
#     pickle.dump(good_dataset_ids, handle)        

print('Number of failed datasets:', num_failed_datasets)
print('Number of good datasets:', len(good_dataset_ids))
print('Check partition:', num_failed_datasets + len(good_dataset_ids), '=?', num_datasets)

Total number of datasets: 19327
0 of 19327 ... dataset_id: 1 ...
Failure
1 of 19327 ... dataset_id: 3 ...
Success
2 of 19327 ... dataset_id: 6 ...
Success
3 of 19327 ... dataset_id: 8 ...
Success
4 of 19327 ... dataset_id: 10 ...
Success
5 of 19327 ... dataset_id: 11 ...
Success
6 of 19327 ... dataset_id: 12 ...
Success
7 of 19327 ... dataset_id: 14 ...
Success
8 of 19327 ... dataset_id: 16 ...
Success
9 of 19327 ... dataset_id: 18 ...
Success
10 of 19327 ... dataset_id: 20 ...
Success
11 of 19327 ... dataset_id: 21 ...
Success
12 of 19327 ... dataset_id: 22 ...
Success
13 of 19327 ... dataset_id: 23 ...
Success
14 of 19327 ... dataset_id: 26 ...
Success
15 of 19327 ... dataset_id: 28 ...
Success
16 of 19327 ... dataset_id: 30 ...
Success
17 of 19327 ... dataset_id: 31 ...
Success
18 of 19327 ... dataset_id: 32 ...
Success
19 of 19327 ... dataset_id: 36 ...
Success
20 of 19327 ... dataset_id: 37 ...
Success
21 of 19327 ... dataset_id: 39 ...
Success
22 of 19327 ... dataset_id: 40 ...
S

ChunkedEncodingError: ('Connection broken: IncompleteRead(0 bytes read, 2 more expected)', IncompleteRead(0 bytes read, 2 more expected))

In [None]:
print('Number of failed datasets:', num_failed_datasets)
print('Number of good datasets:', len(good_dataset_ids))

In [None]:
with open('good_dataset_ids.pickle', 'wb') as handle:
    pickle.dump({'i': i, 'ids': good_dataset_ids}, handle)  

In [None]:
with open('good_dataset_ids.pickle', 'rb') as handle:
    test = pickle.load(handle)
print(test)

In [None]:
dataset_metadata = openml.datasets.list_datasets()
metadata_df = pd.DataFrame.from_dict(dataset_metadata, orient='index')
filtered_df = metadata_df[metadata_df.NumberOfInstancesWithMissingValues == 0]

In [31]:
X, y, categorical, names = dataset.get_data(
            target=dataset.default_target_attribute,
            return_categorical_indicator=True,
            return_attribute_names=True)