In [None]:
from src.data import Dataset #, helpers, DatasetGraph
from src.helpers import notebook_as_transformer
from src import paths
from src.log import logger

In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging
# Let DEBUG-level messages from the project logger through while this notebook runs.
logger.setLevel(logging.DEBUG)

# Using a notebook as a Dataset transformer "function"

## Create the derived Dataset

Let's create a derived dataset (to be used in future notebooks) that keeps only the reviews whose varietal appears at least a minimum number of times (say, 75).

In [None]:
# Review-count threshold: passed to limit_to_common_varietals below and
# embedded in the derived dataset's name.
min_reviews = 75

In [None]:
# Input dataset for the transformation, looked up by name via Dataset.load.
ds_in = Dataset.load('wine_reviews_130k')

In [None]:
def limit_to_common_varietals(df, min_reviews=25):
    '''
    Keep only the wine reviews whose varietal is reviewed often enough.

    Rows with a missing 'variety' are discarded first, so all entries in
    the final dataframe have a variety. A varietal is kept when it occurs
    in at least "min_reviews" of the remaining rows (the bound is
    inclusive: a varietal with exactly "min_reviews" reviews is kept).

    Parameters
    ----------
    df: pandas.DataFrame
        wine reviews dataframe with 'variety' as a column
    min_reviews: int
        minimum number of reviews needed to keep a varietal

    Returns
    -------
    df_common_variety: pandas.DataFrame
        new dataframe (the input is left unmodified) that only includes
        reviews with a variety that appears at least min_reviews times.
        The columns are those of the input; the index is renumbered from 0.
    '''
    df_variety = df.dropna(axis=0, subset=['variety'])

    # Look up, for every row, how many reviews share that row's varietal.
    # Every remaining variety is a key of varietal_counts, so the lookup
    # never produces NaN.
    varietal_counts = df_variety['variety'].value_counts()
    is_common = df_variety['variety'].map(varietal_counts) >= min_reviews

    # drop=True discards the old index instead of materialising it as a
    # column, so no helper columns have to be removed afterwards.
    df_common_variety = df_variety[is_common].reset_index(drop=True)

    return df_common_variety


Create the new dataset, preserving (and adding to) the old metadata

In [None]:
# Name the derived dataset after its source and the threshold used.
new_dataset_name = f'{ds_in.name}_varietals_{min_reviews}'
new_data = limit_to_common_varietals(ds_in.data, min_reviews=min_reviews)
# Work on a copy so that extending 'descr' below does not rebind the entry in
# ds_in.metadata (assumes metadata is dict-like -- TODO confirm).
new_metadata = ds_in.metadata.copy()

# Built from adjacent literals rather than a triple-quoted string with a
# trailing backslash: a backslash followed by a space is not a line
# continuation and would leave a stray "\" and a line break in the description.
added_descr_txt = (
    f"Subselection of the {ds_in.name} Dataset that only includes entries for wines "
    f"with a given varietal that appear in at least {min_reviews} different entries."
)

# Appended directly to the existing description; requires 'descr' to be present.
new_metadata['descr'] += added_descr_txt

new_ds = Dataset(dataset_name=new_dataset_name, data=new_data,
                 metadata=new_metadata)



In [None]:
# Quick look at the derived dataset: data shape, name, and hashes.
new_ds.data.shape, new_ds.name, new_ds.HASHES

## Add the transformer to the catalog
Let's use this notebook as the transformer that creates the derived dataset in the `DatasetGraph`.

In [None]:
# Due to various design choices in Jupyter, we need to specify this name manually.
nbname = '03-Add-Dataset-Common-Varietals.ipynb'
# Register this notebook as the transformer that takes ds_in to new_ds.
# NOTE(review): overwrite_catalog=True presumably replaces any existing catalog
# entry for this transformer -- confirm against notebook_as_transformer.
dsdict = notebook_as_transformer(notebook_name=nbname,
                                 input_datasets=[ds_in],
                                 output_datasets=[new_ds],
                                 overwrite_catalog=True)

In [None]:
# Inspect the keys of the object returned by notebook_as_transformer.
dsdict.keys()

The one criterion for using a notebook as a transformer is that we must guarantee that the dataset is present **on disk** when we are done; i.e. `Dataset.from_disk()` works:

In [None]:
# Confirm the derived dataset can be read back from disk under its new name.
ds_ondisk = Dataset.from_disk(new_dataset_name)

In [None]:
# Same summary as shown for new_ds, now for the copy read from disk.
ds_ondisk.data.shape, ds_ondisk.name, ds_ondisk.HASHES

In [None]:
# Compare the hashes of the in-memory and on-disk datasets; True is the expected result.
new_ds.HASHES == ds_ondisk.HASHES