In [None]:
#Quick cell to make jupyter notebook use the full screen width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Enable autoreloading from src
%load_ext autoreload
%autoreload 2

In [None]:
# Plotting libraries: matplotlib (with the interactive `notebook` backend
# selected by the magic below) and bokeh.
import matplotlib.pyplot as plt
%matplotlib notebook
from bokeh.plotting import show, save, output_notebook, output_file
from bokeh.resources import INLINE 
# Route bokeh output to the notebook, passing the INLINE resources object
# (presumably so BokehJS is embedded rather than loaded from a CDN -- TODO confirm).
output_notebook(resources=INLINE)

In [None]:
from src import workflow
from src.data import Dataset

### Read in our data

In [None]:
# Load the 'beer_review_all' Dataset by name, then keep a handle on its
# underlying data. `reviews_ds` is the object passed to the transformer later.
reviews_ds = Dataset.load('beer_review_all')
reviews = reviews_ds.data

## Transform data

This time we want one row per brewery instead of one row per review.

It turns out there are a number of breweries with multiple `brewery_id`s for the same `brewery_name`. Upon examining these breweries, we found that they are invariably chains of brew pubs with multiple locations. We feel that they should be treated as the same brewery, so we chose to group by `brewery_name` instead of `brewery_id`.

Again, we'll add this function directly to the available transformers.

In [None]:
# Show what the workflow reports as available transformers; `groupby_breweries`
# is expected to be among them (TODO confirm against the output).
workflow.available_transformers()

In [None]:
from src.data.transformers import groupby_breweries

In [None]:
# Print the built-in help (signature and docstring) for the transformer.
help(groupby_breweries)

In [None]:
# Apply the transformer directly to the reviews Dataset before wiring it into
# the workflow, so the result can be inspected first.
breweries_ds = groupby_breweries(reviews_ds)

In [None]:
# Take the transformed Dataset's `.data` for inspection
# (presumably a pandas DataFrame, given the calls made on it below -- TODO confirm).
breweries = breweries_ds.data

In [None]:
# Peek at two breweries whose name maps to more than one brewery id.
breweries.loc[breweries["num_ids"] > 1].head(2)

In [None]:
# Show the first two rows transposed, so every column is readable as a row.
breweries.head(2).transpose()

In [None]:
# Order breweries from most to fewest reviewers.
breweries.sort_values(by='num_reviewers', ascending=False)

This looks alright, so let's add this to our workflow.

In [None]:
# Transformer pipeline for the workflow: a list of (transformer name, kwargs)
# pairs. Only `groupby_breweries` is applied, with no extra keyword arguments.
transformations = [("groupby_breweries", {})]

In [None]:
# Register the pipeline with the workflow: read 'beer_review_all', apply
# `transformations`, and name the result "breweries_by_reviewers".
workflow.add_transformer(
    input_dataset='beer_review_all',
    transformations=transformations,
    output_dataset="breweries_by_reviewers",
)

In [None]:
# Ask the workflow to build its data; presumably this generates the
# "breweries_by_reviewers" dataset registered above (TODO confirm).
workflow.make_data()

Time for the analysis. Head over to [13-Analysis-Breweries-by-Sets-of-Reviewers.ipynb](13-Analysis-Breweries-by-Sets-of-Reviewers.ipynb)