This dataset is created using the template for creating a dataset from scratch as in: https://cookiecutter-easydata.readthedocs.io/en/latest/New-Dataset-Template/. 

In [1]:
# Basic utility functions
import logging
import os
import pathlib
from pprint import pprint

from src.log import logger
from src import paths
from src.utils import list_dir
from functools import partial

# data functions
from src.data import DataSource, Dataset, DatasetGraph, Catalog
from src import helpers

2021-10-26 10:12:56,860 - utils - INFO - NumExpr defaulting to 4 threads.


In [2]:
# Optionally set to debug log level
logger.setLevel(logging.DEBUG)

## Create the Datasource

In [3]:
ds_name = 'penguins-raw'
dsrc = DataSource(ds_name)

In [4]:
url = "https://github.com/allisonhorst/palmerpenguins/raw/5b5891f01b52ae26ad8cb9755ec93672f49328a8/data/penguins_size.csv"

In [5]:
filename = 'penguins_size.csv' # path relative to paths['raw_data_path'] for the file

In [6]:
license = """
Data are available by [CC-0](https://github.com/allisonhorst/palmerpenguins) license in accordance with the Palmer Station LTER Data Policy and the LTER Data Access Policy for Type I data.
"""

In [7]:
metadata = """
The goal of palmerpenguins is to provide a great dataset for data exploration & visualization, as an alternative to `iris`.

More information can be found at [https://github.com/allisonhorst/palmerpenguins](https://github.com/allisonhorst/palmerpenguins).

Data were collected and made available by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER, a member of the Long Term Ecological Research Network.

The data consists of measurements of bill (culmen) and flippers and weights of three species of penguins, along with some other metadata about the penguins. In total we have 334 different penguins measured."""

In [8]:
dsrc.add_url(url=url, file_name=filename, unpack_action='copy')
dsrc.add_metadata(contents=metadata, force=True)
dsrc.add_metadata(contents=license, kind='LICENSE', force=True)

In [9]:
from src.data.extra import process_extra_files
process_function = process_extra_files
process_function_kwargs = {'file_glob':'*.csv',
                           'do_copy': True,
                           'extra_dir': ds_name+'.extra',
                           'extract_dir': ds_name}



In [10]:
dsrc.process_function = partial(process_function, **process_function_kwargs)

In [11]:
dsrc.update_catalog()

2021-10-26 10:12:57,419 - catalog - DEBUG - Loaded 2 records from 'datasources' Catalog.
2021-10-26 10:12:57,425 - catalog - DEBUG - Verifying serialization for catalog 'datasources'
2021-10-26 10:12:57,436 - catalog - DEBUG - Writing entry:'penguins-raw' to catalog:'datasources'.
2021-10-26 10:12:57,439 - datasets - DEBUG - Updated datasource:penguins-raw in catalog


## Create the Corresponding Dataset

In [12]:
from src.data import DatasetGraph

In [13]:
dag = DatasetGraph(catalog_path=paths['catalog_path'])

2021-10-26 10:12:57,477 - catalog - DEBUG - Loaded 3 records from 'transformers' Catalog.
2021-10-26 10:12:57,478 - catalog - DEBUG - Verifying serialization for catalog 'transformers'
2021-10-26 10:12:57,492 - catalog - DEBUG - Loaded 3 records from 'datasets' Catalog.
2021-10-26 10:12:57,495 - catalog - DEBUG - Verifying serialization for catalog 'datasets'
2021-10-26 10:12:57,502 - datasets - DEBUG - Loaded DatasetGraph with 3 nodes and 3 edges.


In [14]:
dag.add_source(output_dataset=ds_name, datasource_name=ds_name, overwrite_catalog=True)

2021-10-26 10:12:57,519 - catalog - DEBUG - Writing entry:'_penguins-raw' to catalog:'transformers'.
2021-10-26 10:12:57,523 - datasets - INFO - Regenerating output Dataset 'penguins-raw' and adding to catalog
2021-10-26 10:12:57,526 - datasets - DEBUG - Generating edge traversal list for Dataset:'penguins-raw'
2021-10-26 10:12:57,527 - datasets - DEBUG - traverse: examining vertex:'penguins-raw'
2021-10-26 10:12:57,529 - datasets - DEBUG - traverse: all input dependencies:[] satisfied for edge: '_penguins-raw'
2021-10-26 10:12:57,531 - datasets - DEBUG - Traversal complete. Edges to process: ['_penguins-raw']
2021-10-26 10:12:57,533 - datasets - DEBUG - process_edge: Processing input datasets for edge:'_penguins-raw'
2021-10-26 10:12:57,535 - datasets - DEBUG - process_edge:Applying transformer: {'transformer_module': 'src.data.datasets', 'transformer_name': 'dataset_from_datasource', 'transformer_kwargs': {'dataset_name': 'penguins-raw', 'datasource_name': 'penguins-raw'}} to input d

  0%|          | 0/1 [00:00<?, ?it/s]

2021-10-26 10:12:57,772 - datasets - INFO - Generated output datasets: ['penguins-raw'] via edge:'_penguins-raw'
2021-10-26 10:12:57,781 - datasets - DEBUG - Updating hashes for dataset 'penguins-raw': {'hashes': {'data': 'sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b', 'target': 'sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b'}}.
2021-10-26 10:12:57,782 - datasets - DEBUG - process_edge: Updating catalog entry for penguins-raw
2021-10-26 10:12:57,784 - catalog - DEBUG - Writing entry:'penguins-raw' to catalog:'datasets'.
2021-10-26 10:12:57,789 - datasets - DEBUG - process_edge: Overwriting 'penguins-raw' in `dataset_path`
2021-10-26 10:12:57,792 - datasets - DEBUG - Updating hashes for dataset 'penguins-raw': {'hashes': {'data': 'sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b', 'target': 'sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b'}}.
2021-10-26 10:12:57,799 - datasets - DEBUG - Wrote Dataset Metadata: penguins-raw.metadata
2021-10-26 10:12:57,804 - datasets - DEBUG - Re-scanning Da

{'_penguins-raw': {'transformations': [{'transformer_module': 'src.data.datasets',
    'transformer_name': 'dataset_from_datasource',
    'transformer_kwargs': {'dataset_name': 'penguins-raw',
     'datasource_name': 'penguins-raw'}}],
  'output_datasets': ['penguins-raw']}}

In [15]:
ds = Dataset.from_catalog(ds_name)

2021-10-26 10:12:57,922 - catalog - DEBUG - Loaded 3 records from 'transformers' Catalog.
2021-10-26 10:12:57,930 - catalog - DEBUG - Verifying serialization for catalog 'transformers'
2021-10-26 10:12:57,934 - catalog - DEBUG - Loaded 3 records from 'datasets' Catalog.
2021-10-26 10:12:57,936 - catalog - DEBUG - Verifying serialization for catalog 'datasets'
2021-10-26 10:12:57,948 - datasets - DEBUG - Loaded DatasetGraph with 3 nodes and 3 edges.
2021-10-26 10:12:57,950 - datasets - DEBUG - Generating edge traversal list for Dataset:'penguins-raw'
2021-10-26 10:12:57,953 - datasets - DEBUG - traverse: examining vertex:'penguins-raw'
2021-10-26 10:12:57,956 - datasets - DEBUG - traverse: all input dependencies:[] satisfied for edge: '_penguins-raw'
2021-10-26 10:12:57,961 - datasets - DEBUG - Traversal complete. Edges to process: ['_penguins-raw']
2021-10-26 10:12:57,962 - datasets - DEBUG - process_edge: Processing input datasets for edge:'_penguins-raw'
2021-10-26 10:12:57,971 - dat

  0%|          | 0/1 [00:00<?, ?it/s]

2021-10-26 10:12:58,165 - datasets - INFO - Generated output datasets: ['penguins-raw'] via edge:'_penguins-raw'
2021-10-26 10:12:58,170 - datasets - DEBUG - Updating hashes for dataset 'penguins-raw': {'hashes': {'data': 'sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b', 'target': 'sha1:38f65f3b11da4851aaaccc19b1f0cf4d3806f83b'}}.
2021-10-26 10:12:58,171 - datasets - DEBUG - process_edge: Reloading Dataset catalog after processing edge:'_penguins-raw'
2021-10-26 10:12:58,173 - catalog - DEBUG - Loaded 3 records from 'datasets' Catalog.
2021-10-26 10:12:58,175 - catalog - DEBUG - Verifying serialization for catalog 'datasets'


In [16]:
ds = Dataset.load(ds_name)

2021-10-26 10:12:58,195 - catalog - DEBUG - Loaded 3 records from 'transformers' Catalog.
2021-10-26 10:12:58,196 - catalog - DEBUG - Verifying serialization for catalog 'transformers'
2021-10-26 10:12:58,211 - catalog - DEBUG - Loaded 3 records from 'datasets' Catalog.
2021-10-26 10:12:58,212 - catalog - DEBUG - Verifying serialization for catalog 'datasets'
2021-10-26 10:12:58,215 - datasets - DEBUG - Loaded DatasetGraph with 3 nodes and 3 edges.
2021-10-26 10:12:58,217 - datasets - DEBUG - Verifying hashes using Dataset catalog.
2021-10-26 10:12:58,220 - catalog - DEBUG - Loaded 3 records from 'datasets' Catalog.
2021-10-26 10:12:58,221 - catalog - DEBUG - Verifying serialization for catalog 'datasets'
2021-10-26 10:12:58,224 - datasets - DEBUG - Load penguins-raw from disk...
2021-10-26 10:12:58,228 - datasets - DEBUG - Loaded penguins-raw from disk.


In [17]:
print(ds.DESCR)


The goal of palmerpenguins is to provide a great dataset for data exploration & visualization, as an alternative to `iris`.

More information can be found at [https://github.com/allisonhorst/palmerpenguins](https://github.com/allisonhorst/palmerpenguins).

Data were collected and made available by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER, a member of the Long Term Ecological Research Network.

The data consists of measurements of bill (culmen) and flippers and weights of three species of penguins, along with some other metadata about the penguins. In total we have 334 different penguins measured.


In [18]:
ds.EXTRA

{'penguins-raw.extra': {'penguins_size.csv': ['size:13525']}}