In [None]:
# Quick cell to make the Jupyter notebook use the full screen width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Basic utility functions
import logging
from src.logging import logger
from src.paths import Paths
from src.utils import list_dir
paths = Paths()

## Grab the Beer Review DataSource

In [None]:
from src.data import DataSource
from src import workflow

In [None]:
workflow.available_datasources()

In [None]:
dsrc = DataSource.from_name('beer_review')

In [None]:
dsrc.file_list

In [None]:
dsrc.unpack()

Now following https://github.com/acwooding/cookiecutter-easydata/blob/bus_number/%7B%7B%20cookiecutter.repo_name%20%7D%7D/notebooks/22-transform-datasources-to-datasets.ipynb

In [None]:
!ls -la $paths.interim_data_path/beer_review

Sanity check that our data looks like that of JH and the Kaggle site

In [None]:
!head $paths.interim_data_path/beer_review/beer_reviews.csv

### Make a process function

This is a nice dataset. Not really any clean-up to do here. Just fixing things up to match the API.

In [None]:
import pandas as pd
import pathlib

def process_beer_review(*, unpack_dir, kind='all', extract_dir='beer_review',
                        unpack=False, raw_dir=None, metadata=None):
    """
    Process beer reviews into (data, target, metadata) format.

    Since we plan to use Pandas for further processing, data is returned as a
    pandas DataFrame read directly from the reviews CSV.

    The CSV is read from one of two places, depending on `unpack`:

    * unpack=True:  `unpack_dir/extract_dir/beer_reviews.csv` (the unpacked file)
    * unpack=False: `raw_dir/beerreviews.zip` (the raw zip file, handed straight to
      `pandas.read_csv`, which requires the archive to contain a single data file)

    Parameters
    ----------
    unpack_dir:
        The directory the reviews have been unpacked into. Required if unpack is True.
    raw_dir:
        The directory containing the raw zip file. Required if unpack is False.
    kind: {'all'}
        This is an unsupervised learning example. There are no labels. We will only work
        with the whole dataset. (Optionally add train and test set later for experimenting.)
        The value is currently not used by this function.
    extract_dir:
        Name of the directory of the unpacked zip file containing the raw data files.
    unpack: boolean
        If True, read the unpacked CSV from unpack_dir.
        If False, process data without bothering to unpack it. Requires raw_dir.
    metadata: dict or None
        Returned as-is; a new empty dict is used when None.

    Returns
    -------
    A tuple:
        (data, target, additional_metadata)
        `target` is always None.

    Raises
    ------
    ValueError
        If the directory required by the selected mode (unpack_dir when unpack is
        True, raw_dir otherwise) is missing or empty.
    """
    if metadata is None:
        metadata = {}

    if unpack:
        if not unpack_dir:
            # Without this guard `data` would never be assigned and the return
            # below would fail with an UnboundLocalError.
            raise ValueError("unpack_dir required")
        data_path = pathlib.Path(unpack_dir) / extract_dir / "beer_reviews.csv"
    else:
        if not raw_dir:
            raise ValueError("raw_dir required")
        data_path = pathlib.Path(raw_dir) / "beerreviews.zip"

    data = pd.read_csv(data_path)

    # No labels for this dataset.
    target = None

    return data, target, metadata

In [None]:
dsrc.file_list

In [None]:
dsrc.default_metadata()

In [None]:
data, target, metadata = process_beer_review(unpack_dir=paths.interim_data_path, unpack=True)

In [None]:
data.shape

### Interesting Tidbits:

25% of the beer_beerid have no recorded beer_abv.

In [None]:
data.head().T

Looks good. Now add this as a parse function for our data.

In [None]:
from functools import partial

dsrc.parse_function = partial(process_beer_review, unpack_dir=str(paths.interim_data_path),
                              unpack=True)

In [None]:
dsrc.dataset_opts()

In [None]:
%%time
ds = dsrc.process()

In [None]:
str(ds)

In [None]:
ds.metadata

In [None]:
print(ds)

In [None]:
type(ds)

## Now that things seem to work, we need to move the process function to the src module

In [None]:
# low on space and need to move things off of my main disk.
!cd .. && make clean_raw && make clean_interim && make clean_processed

In [None]:
%%file ../src/data/localdata.py
"""
Custom dataset processing/generation functions should be added to this file
"""

import pandas as pd
import pathlib

__all__ = [
    'process_beer_review'
]


def process_beer_review(*, unpack_dir, kind='all', extract_dir='beer_review',
                        unpack=False, raw_dir=None, metadata=None):
    """
    Process beer reviews into (data, target, metadata) format.

    Since we plan to use Pandas for further processing, data is returned as a
    pandas DataFrame read directly from the reviews CSV.

    The CSV is read from one of two places, depending on `unpack`:

    * unpack=True:  `unpack_dir/extract_dir/beer_reviews.csv` (the unpacked file)
    * unpack=False: `raw_dir/beerreviews.zip` (the raw zip file, handed straight to
      `pandas.read_csv`, which requires the archive to contain a single data file)

    Parameters
    ----------
    unpack_dir:
        The directory the reviews have been unpacked into. Required if unpack is True.
    raw_dir:
        The directory containing the raw zip file. Required if unpack is False.
    kind: {'all'}
        This is an unsupervised learning example. There are no labels. We will only work
        with the whole dataset. (Optionally add train and test set later for experimenting.)
        The value is currently not used by this function.
    extract_dir:
        Name of the directory of the unpacked zip file containing the raw data files.
    unpack: boolean
        If True, read the unpacked CSV from unpack_dir.
        If False, process data without bothering to unpack it. Requires raw_dir.
    metadata: dict or None
        Returned as-is; a new empty dict is used when None.

    Returns
    -------
    A tuple:
        (data, target, additional_metadata)
        `target` is always None.

    Raises
    ------
    ValueError
        If the directory required by the selected mode (unpack_dir when unpack is
        True, raw_dir otherwise) is missing or empty.
    """
    if metadata is None:
        metadata = {}

    if unpack:
        if not unpack_dir:
            # Without this guard `data` would never be assigned and the return
            # below would fail with an UnboundLocalError.
            raise ValueError("unpack_dir required")
        data_path = pathlib.Path(unpack_dir) / extract_dir / "beer_reviews.csv"
    else:
        if not raw_dir:
            raise ValueError("raw_dir required")
        data_path = pathlib.Path(raw_dir) / "beerreviews.zip"

    data = pd.read_csv(data_path)

    # No labels for this dataset.
    target = None

    return data, target, metadata

In [None]:
from src.data.localdata import process_beer_review

In [None]:
dsrc.parse_function = partial(process_beer_review, unpack_dir=str(paths.interim_data_path),
                              raw_dir=str(paths.raw_data_path))

In [None]:
dsrc.fetch(force=True)
dsrc.unpack(force=True)
ds = dsrc.process()

In [None]:
ds

In [None]:
workflow.add_datasource(dsrc)

In [None]:
workflow.available_datasources(keys_only=False)

In [None]:
# time to check things in. Then mount data to something with more storage!

In [None]:
# make is broken. Now that I've mounted things...

In [None]:
dsrc = DataSource.from_name('beer_review')

In [None]:
dsrc.fetch()
dsrc.unpack()
ds = dsrc.process()

In [None]:
ds

In [None]:
workflow.available_datasources()

In [None]:
workflow.available_datasets()

In [None]:
from src.data import Dataset

In [None]:
workflow.available_datasources()

In [None]:
from src import workflow

### Use a dummy transformer to get to the dataset we want

In [None]:
workflow.add_transformer(from_datasource='beer_review',
                        output_dataset='beer_review_all')

In [None]:
workflow.make_data()

In [None]:
workflow.available_datasets()