# Going through OpenML Basic Usage
https://openml.github.io/openml-python/stable/usage.html

In [9]:
import openml

In [None]:
# Either put your API key below or in the ~/.openml/config file as
# specified on the OpenML Basic Usage page
# apikey = '<your API key>'
# openml.config.apikey = apikey

In [10]:
import pandas as pd
datasets = openml.datasets.list_datasets()

# NOTE: the line below is different from the Basic Usage 
datasets_df = pd.DataFrame.from_dict(datasets, orient='index')

datasets_df.set_index('did', inplace=True)

## Examining and filtering by metadata

In [119]:
print('Number of datasets:', datasets_df.shape[0])
print(datasets_df.columns)

Number of datasets: 19525
Index(['NumberOfClasses', 'NumberOfNumericFeatures', 'MinorityClassSize',
       'status', 'MajorityClassSize', 'NumberOfMissingValues',
       'NumberOfInstances', 'NumberOfInstancesWithMissingValues',
       'NumberOfSymbolicFeatures', 'NumberOfFeatures', 'name', 'format',
       'MaxNominalAttDistinctValues'],
      dtype='object')


In [93]:
print(datasets_df.iloc[0])

NumberOfClasses                            6
NumberOfNumericFeatures                    6
MinorityClassSize                          0
status                                active
MajorityClassSize                        684
NumberOfMissingValues                      0
NumberOfInstances                        898
NumberOfInstancesWithMissingValues         0
NumberOfSymbolicFeatures                  33
NumberOfFeatures                          39
name                                  anneal
format                                  ARFF
MaxNominalAttDistinctValues               10
Name: 1, dtype: object


In [72]:
print('Number of datasets without any missing values:', \
      datasets_df[datasets_df.NumberOfInstancesWithMissingValues == 0]
      .shape[0])

Number of datasets without any missing values: 19326


In [71]:
print('Number of datasets with numeric features:', \
      datasets_df[(datasets_df.NumberOfNumericFeatures > 0) & 
                  (datasets_df.NumberOfInstancesWithMissingValues == 0)]
      .shape[0])

Number of datasets with numeric features: 19209


In [80]:
print('Number of datasets with numeric features and no symbolic features:', \
      datasets_df[(datasets_df.NumberOfSymbolicFeatures == 0) &
                  (datasets_df.NumberOfNumericFeatures > 0) & 
                  (datasets_df.NumberOfInstancesWithMissingValues == 0)]
      .shape[0])

Number of datasets with numeric features and no symbolic features: 283


## Downloading datasets

In [120]:
# NOTE: Not all dataset_ids seem to work. For example, ids 0 and 1 process with error.
dataset_id = 23
dataset = openml.datasets.get_dataset(23)

In [124]:
# get just the unlabeled data
X = dataset.get_data()
print('X is simply a numpy array:', type(X))
print('Data shape and type:', X.shape, X.dtype)

X is simply a numpy array: <class 'numpy.ndarray'>
Data shape and type: (1473, 10) float32


In [116]:
# also get the attribute names
X, names = dataset.get_data(return_attribute_names=True)
print(names)

['Wifes_age', 'Wifes_education', 'Husbands_education', 'Number_of_children_ever_born', 'Wifes_religion', 'Wifes_now_working%3F', 'Husbands_occupation', 'Standard-of-living_index', 'Media_exposure', 'Contraceptive_method_used']


In [None]:
# also get the data labels and which attributes are categorical
# NOTE: look into other flags for get_data() (can't find in API)
X, y, categorical = dataset.get_data(
    target=dataset.default_target_attribute,
    return_categorical_indicator=True)

## Doing the same stuff with Tasks

In [128]:
# openml doesn't seem to actually support this
tasks = openml.tasks.list_tasks_by_type(1)

AttributeError: 'module' object has no attribute 'list_tasks_by_type'

In [153]:
tasks = openml.tasks.list_tasks()
tasks_df = pd.DataFrame.from_dict(datasets, orient='index')

In [146]:
tasks_df.iloc[0]

NumberOfClasses                            6
NumberOfNumericFeatures                    6
MinorityClassSize                          0
did                                        1
status                                active
MajorityClassSize                        684
NumberOfMissingValues                      0
NumberOfInstances                        898
NumberOfInstancesWithMissingValues         0
NumberOfSymbolicFeatures                  33
NumberOfFeatures                          39
name                                  anneal
format                                  ARFF
MaxNominalAttDistinctValues               10
Name: 1, dtype: object

In [147]:
# no 'tid' attribute like the website says
tasks_df.set_index('tid', inplace=True)

KeyError: 'tid'

In [154]:
# However, there is a 'did' attribute like in the datasets
tasks_df.set_index('did', inplace=True)

It seems we can do the same stuff with tasks that we did with datasets, but I don't think tasks are sufficiently supported at this point to use them over datasets.