# Load the specified dataset and store it in the raw data directory.

## Default Values for Papermill Parameters

In [None]:
# Default name of the dataset to load; this cell holds the papermill
# parameter defaults, so the value may be overridden per run.
PARAM_DATASET_NAME = "OpenML Adult"

## Define constants

In [None]:
import os
import numpy as np

from subroc.datasets.metadata import meta_dict, DatasetName, to_DatasetName


# Base paths are read from the environment when set; otherwise the relative
# defaults below are used.
STAGE_OUTPUT_PATH = os.environ.get("STAGE_OUTPUT_PATH", "../..")
CACHE_PATH = os.environ.get("CACHE_PATH", "../cache")

# Directory that receives the raw dataset files.
DATA_OUT_PATH = f"{STAGE_OUTPUT_PATH}/data/raw"

# exist_ok=True creates the directory (including missing parents) only when
# needed, without a separate existence check that could race with another run.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Convert the parameter value with to_DatasetName and use the result as the
# key into meta_dict to select this dataset's metadata entry.
DATASET_NAME = to_DatasetName(PARAM_DATASET_NAME)
DATASET_META = meta_dict[DATASET_NAME]

## Load the dataset

In [None]:
# Load the dataset through the loader attached to the selected metadata entry.
# NOTE(review): the returned type appears to depend on the dataset (the cells
# below call get_data() for OpenML Adult and use the object as a table
# otherwise) - confirm against the loader implementations.
dataset = DATASET_META.loader.load()

## Show the dataset

In [None]:
if DATASET_NAME != DatasetName.OpenML_ADULT:
    # Every other dataset is used as the table as-is.
    dataset_df = dataset
    print(dataset_df.describe())
else:
    # OpenML Adult: show the loaded object itself, then unpack the table and
    # its column information from get_data().
    print(dataset)

    dataset_df, _, categorical_indicator, attribute_names = dataset.get_data()

    print(dataset_df.describe())
    print(categorical_indicator)
    print(attribute_names)

    # Select the attribute names at the positions flagged by the indicator.
    print(
        f"categorical attributes: {np.array(attribute_names)[categorical_indicator]}"
    )

## Save the dataset

In [None]:
# Each dataset is written into its own directory below the raw data directory.
out_path = f"{DATA_OUT_PATH}/{DATASET_META.dataset_dir}"

# makedirs with exist_ok=True creates missing intermediate directories too and
# does not fail when the directory already exists, so no prior check is needed.
os.makedirs(out_path, exist_ok=True)

# Write the table as "<raw_filename>.data" without the index column.
dataset_df.to_csv(f"{out_path}/{DATASET_META.raw_filename}.data", index=False)