In [None]:
# Quick cell to make the jupyter notebook use the full screen width.
# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Load IPython's autoreload extension. Mode 2 re-imports modules before each
# cell executes, so edits to the local `src` package are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
#Some plotting libraries
import matplotlib.pyplot as plt
# Use matplotlib's interactive "notebook" backend for figures in this notebook
%matplotlib notebook
from bokeh.plotting import show, save, output_notebook, output_file
from bokeh.resources import INLINE 
# Route bokeh output to the notebook, handing it the INLINE resources object
output_notebook(resources=INLINE)

In [None]:
from src import workflow
from src.data import Dataset

### Read in our data

In [None]:
# Display what the workflow reports as available -- presumably the names that
# Dataset.load accepts; confirm in src.workflow.
workflow.available_datasets()

In [None]:
# Load the dataset registered under the name 'beer_review_all'
ds = Dataset.load('beer_review_all')

In [None]:
# The reviews table is the Dataset's `.data` attribute; the cells below call
# groupby/agg on it, i.e. treat it as a pandas DataFrame.
reviews = ds.data

## Turning Beer Styles into Sets of Reviewers

If we are going to embed beer styles by sets of reviewers then we need to turn our reviews data frame into a frame with one row per beer style instead of one row per review.

This is a job for groupby.  We group by the column we'd like to embed and then use agg with a dictionary mapping column names to aggregation functions to tell it how to summarize the many reviews of a single beer style into one record.  Aggregation functions are pretty much any function that takes an iterable and returns a single value.  Median and max are great functions for dealing with numeric fields.  First is handy for a field that you know to be the same for every review in a group.  In other words, fields that are tied to the beer, such as brewery_name or beer_abv.  Because we are grouping by style here, those fields vary within a group, so the cell below uses the mean for the numeric fields and the mode (most common value) for the names.

We'd like to turn categorical data into a document of space-separated strings.  We want to do this to keep a nice easy pipeline for sklearn's CountVectorizer.  A very natural way to accomplish this is via pandas' df.groupby() function with a " ".join(my_array) aggregator passed in.  Unfortunately, it turns out that " ".join(my_array) seems to have trouble with lists (or sequences) longer than 3,000 or so.  

As such we've included a simple (though not necessarily efficient) join function that scales to large arrays.

In [None]:
from src.utils import custom_join

In [None]:
# Print custom_join's built-in help text before using it in the aggregation below
help(custom_join)

In [None]:
%%time
unique_join = lambda x: custom_join(x.unique(), " ")
beer_style = reviews.groupby('beer_style').agg({
    'beer_name':lambda x: x.mode(),
    'brewery_name':lambda x: x.mode(),
    'beer_abv':'mean',
    'review_aroma':'mean',
    'review_appearance':'mean',
    'review_overall':'mean',
    'review_palate':'mean',
    'review_taste':'mean',
    'review_profilename':[unique_join, len],
    'brewery_id':lambda x: len(x.unique()),
}).reset_index()

beer_style.columns = """beer_style beer_name brewery_name beer_abv 
review_aroma review_appearance review_overall review_palate review_taste 
review_profilename_list num_reviewers num_ids""".split()
beer_style.shape

In [None]:
# Transpose the first two rows so each aggregated column reads as its own line
beer_style.head(2).T

## Add this as a transformer

In [None]:
def groupby_style_to_reviewers(review_dset):
    """
    Collapse the reviews data frame to one row per beer style.

    The reviews are grouped by ``beer_style`` and every column of interest is
    aggregated to a single value per style. Columns of the resulting frame:

    * ``beer_style``: the grouping key.
    * ``beer_name``, ``brewery_name``: the most frequent value within the
      style. When several values tie, the first one ``Series.mode()`` returns
      is used; None if the style has no non-null value.
    * ``beer_abv``, ``review_aroma``, ``review_appearance``,
      ``review_overall``, ``review_palate``, ``review_taste``: mean.
    * ``review_profilename_list``: the style's distinct ``review_profilename``
      values joined into one space separated string with ``custom_join``.
    * ``num_reviewers``: ``len`` of the style's ``review_profilename`` column,
      i.e. one count per review row (a repeat reviewer is counted each time).
    * ``num_ids``: number of distinct ``brewery_id`` values.

    Parameters
    ----------
    review_dset: Dataset
        Dataset containing the beer reviews data

    Returns
    -------
    Dataset
        Dataset named "beer_style_reviewers" whose data is the per-style frame
        described above. The input's metadata object is passed through as-is.
    """
    def first_mode(values):
        # Series.mode() always returns a Series holding *every* tied value, so
        # it does not reduce a group to one value by itself. Take the first
        # entry to guarantee a scalar, and guard the empty (all-null) case.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else None

    def unique_join(values):
        # distinct values of the group as one space separated "document"
        return custom_join(values.unique(), " ")

    reviews = review_dset.data
    # The dict order fixes the column order that the positional renaming below
    # relies on.
    beer_style = reviews.groupby('beer_style').agg({
        'beer_name': first_mode,
        'brewery_name': first_mode,
        'beer_abv': 'mean',
        'review_aroma': 'mean',
        'review_appearance': 'mean',
        'review_overall': 'mean',
        'review_palate': 'mean',
        'review_taste': 'mean',
        # two aggregations on one column -> two output columns
        'review_profilename': [unique_join, len],
        'brewery_id': lambda x: len(x.unique()),
    }).reset_index()

    # Mixing a list with scalar aggregators yields two-level column labels;
    # flatten them by assigning the final names positionally.
    beer_style.columns = [
        'beer_style', 'beer_name', 'brewery_name', 'beer_abv',
        'review_aroma', 'review_appearance', 'review_overall',
        'review_palate', 'review_taste',
        'review_profilename_list', 'num_reviewers', 'num_ids',
    ]
    ds_reviewers = Dataset(dataset_name="beer_style_reviewers", metadata=review_dset.metadata, data=beer_style)
    return ds_reviewers

In [None]:
# NOTE: this rebinds `groupby_style_to_reviewers` to the version packaged in
# src.data.transformers, replacing the notebook-local definition from the
# cell above for every later call.
from src.data.transformers import groupby_style_to_reviewers

In [None]:
# Run the transformer on the loaded reviews Dataset
new_ds = groupby_style_to_reviewers(ds)

In [None]:
# Rebind `beer_style` to the table held by the transformer's output
beer_style = new_ds.data

In [None]:
# Inspect the metadata attached to the transformer's output
new_ds.metadata

Looks good. Now add it to `transformers.py`. And the workflow.

In [None]:
# Check what the workflow reports as available before registering the new transformer
workflow.available_datasets()

In [None]:
# Transformer pipeline handed to workflow.add_transformer: a single step that
# pairs the transformer's name with an empty dict (presumably its keyword
# arguments -- confirm in src.workflow).
transformations = [("groupby_style_to_reviewers", {})]

In [None]:
# Register the pipeline with the workflow: read 'beer_review_all', apply
# `transformations`, and name the result "beer_style_reviewers".
workflow.add_transformer(
    input_dataset='beer_review_all',
    output_dataset="beer_style_reviewers",
    transformations=transformations,
)

In [None]:
# Run the workflow's make_data step -- presumably this builds the dataset
# registered above so it can be loaded by name; confirm in src.workflow.
workflow.make_data()

In [None]:
# NOTE: `beer_style` is rebound here to the object returned by Dataset.load;
# earlier cells bound this name to the table itself, which is now reached
# through `beer_style.data`.
beer_style = Dataset.load("beer_style_reviewers")

In [None]:
# Transposed preview of the first rows of the loaded table, one line per column
beer_style.data.head().T

We now have everything we need to do our embedding of beer styles via sets of reviewers. See [07-Analysis-Beer-Style-by-Sets-of-Reviewers.ipynb](07-Analysis-Beer-Style-by-Sets-of-Reviewers.ipynb)