In [None]:
# Quick cell to make the Jupyter notebook use the full screen width.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Basic utility functions: stdlib logging plus the project's logger, path and directory helpers
import logging
from src.logging import logger
from src.paths import Paths
from src.utils import list_dir
# Instantiate the project's Paths helper (presumably exposes project directories -- confirm in src.paths)
paths = Paths()

In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Plotting libraries (matplotlib, seaborn, bokeh)
import matplotlib.pyplot as plt
import seaborn as sns
# Select matplotlib's interactive notebook backend
%matplotlib notebook
from bokeh.plotting import show, save, output_notebook, output_file
from bokeh.resources import INLINE 
# Send bokeh output to the notebook, using the INLINE resources object imported above
output_notebook(resources=INLINE)

## Transformation for doing Beer Style by Review Ratings
Exactly as in this notebook: 
https://github.com/jc-healy/EmbedAllTheThings/blob/master/EmbedAllTheThings_Beer_style_by_review_ratings.ipynb


In [None]:
# Grab the processed dataset
from src.data import Dataset
from src import workflow

# Show which datasets are available before loading any of them
workflow.available_datasets()

In [None]:
# Load the beer review dataset and the beer-style SRM dataset by name
ds = Dataset.load('beer_review_all')
srm_ds = Dataset.load('beer_styles_srm_all')

In [None]:
# Peek at the first review rows, transposed so each column reads as a row
ds.data.head().T

In [None]:
# Peek at the first rows of the SRM style data
srm_ds.data.head()

## Embed Beer Style

If we are going to embed beer style then we need to turn our reviews data frame into a frame with one row per beer style instead of one row per review.

This is a job for groupby.  We group by the column we'd like to embed and then use agg with a dictionary mapping column names to aggregation functions to tell it how to summarize the many reviews of a single beer style into one record.  Aggregation functions are pretty much any function that takes an iterable and returns a single value.  `Median`, `mean` and `max` are great functions for dealing with numeric fields (we use `mean` below).  `First` is handy for a field that you know to be common across every review in a group; in other words, fields that are tied to the beer, such as brewery_name or beer_abv.  An alternative to first is to write a simple lambda function to pull off the most common categorical value in each group.

In [None]:
# Pull the underlying tables out of the two Dataset objects
reviews = ds.data
srm_data = srm_ds.data

In [None]:
def most_common(values):
    """Return the most frequent value in `values` (the first one on ties), or None if there is none."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) else None


# One row per beer style: most common beer/brewer, mean abv and ratings, review count.
beer_style = (
    reviews
    .groupby('beer_style')
    .agg({
        'beer_name': most_common,
        'brewery_name': most_common,
        'beer_abv': 'mean',
        'review_aroma': 'mean',
        'review_appearance': 'mean',
        'review_palate': 'mean',
        'review_taste': 'mean',
        'review_overall': 'mean',
        'review_profilename': len,
    })
    .reset_index()
    # Rename through an explicit mapping rather than a positional list of labels:
    # a positional list that disagrees with the agg order silently swaps the
    # palate / taste / overall columns.
    .rename(columns={
        'beer_name': 'common_beer',
        'brewery_name': 'common_brewer',
        'beer_abv': 'abv',
        'review_aroma': 'aroma',
        'review_appearance': 'appearance',
        'review_palate': 'palate',
        'review_taste': 'taste',
        'review_overall': 'overall',
        'review_profilename': 'num_reviews',
    })
)
beer_style.shape

In [None]:
# Inspect the last two styles, transposed for readability
beer_style.tail(2).T

In [None]:
# Styles ordered from most- to least-reviewed
beer_style.sort_values('num_reviews', ascending=False)

# Next step is augmenting the data with some other data...


In [None]:
# Augment each style with its SRM data (left join, so every style row is kept).
# The guard keeps this cell idempotent: running the merge a second time on the
# already-merged frame would produce suffixed and duplicated columns.
if 'kaggle_review_style' not in beer_style.columns:
    beer_style = (
        beer_style
        .merge(srm_data[['kaggle_review_style', 'Style Category', 'SRM Mid', 'srm_rgb']],
               how='left', left_on='beer_style', right_on='kaggle_review_style')
        .rename(columns={'SRM Mid': 'srm_mid', 'Style Category': 'style_category'})
    )

In [None]:
# Check the merged columns on the first three styles
beer_style.head(3).T

### Select some numeric columns as candidates for your embedding

Here we'll use the 4 numeric rating columns which rate characteristics of the beer.  We'll leave the overall score out for the time being.

In [None]:
# The four per-characteristic rating columns used as embedding candidates
numeric_columns = ['aroma', 'appearance', 'palate', 'taste']
style_by_reviews = beer_style[numeric_columns]
style_by_reviews.describe()

In [None]:
# Violin plot of the distribution of each rating column across styles
fig, ax = plt.subplots(figsize=(12, 8))
# Optional, for larger labels: sns.set(font_scale = 3)
f = sns.violinplot(data=style_by_reviews, ax=ax)
# To save the figure: f.get_figure().savefig('results/beer_style_numeric_violin.png')

## This is the data to be used

In [None]:
# Display the feature table itself (one row per beer style)
style_by_reviews

# Translate this into a reproducible transformation

Create a custom function to do this transformation.

In [None]:
def to_beer_style_by_review(review_dset, *, srm_dset_name):
    """
    Summarize beer reviews per beer style and augment each style with
    SRM (color assessment) data.

    Reviews are grouped by `beer_style`; each style gets its most common beer
    and brewery, the mean abv and mean rating columns, and the number of
    reviews. The result is then left-joined to the SRM data on the style name.

    See notebook 04 for what this does and why.

    Parameters
    ----------
    review_dset:
        beer review Dataset; its `.data` attribute is the reviews pandas dataframe
    srm_dset_name: string
        name of corresponding srm Dataset

    Returns
    -------
    style-by-review DataFrame
    """

    def most_common(values):
        # Most frequent value; the first one on ties, None when there is none.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else None

    reviews = review_dset.data
    srm_data = Dataset.load(srm_dset_name).data

    # Groupby to select the fields that we want to use
    beer_style = (
        reviews
        .groupby('beer_style')
        .agg({
            'beer_name': most_common,
            'brewery_name': most_common,
            'beer_abv': 'mean',
            'review_aroma': 'mean',
            'review_appearance': 'mean',
            'review_palate': 'mean',
            'review_taste': 'mean',
            'review_overall': 'mean',
            'review_profilename': len,
        })
        .reset_index()
        # Rename through an explicit mapping rather than a positional list of
        # labels, so each output name is tied to its source column and the
        # palate / taste / overall columns cannot be swapped.
        .rename(columns={
            'beer_name': 'common_beer',
            'brewery_name': 'common_brewer',
            'beer_abv': 'abv',
            'review_aroma': 'aroma',
            'review_appearance': 'appearance',
            'review_palate': 'palate',
            'review_taste': 'taste',
            'review_overall': 'overall',
            'review_profilename': 'num_reviews',
        })
    )

    # Augment beer style with SRM and RGB data (left join keeps every style)
    srm_columns = ['kaggle_review_style', 'Style Category', 'SRM Mid', 'srm_rgb']
    return (
        beer_style
        .merge(srm_data[srm_columns], how='left',
               left_on='beer_style', right_on='kaggle_review_style')
        .rename(columns={'SRM Mid': 'srm_mid', 'Style Category': 'style_category'})
    )

In [None]:
# Test out the function on the same review and SRM datasets used in the manual steps
new_style_by_reviews = to_beer_style_by_review(ds, srm_dset_name='beer_styles_srm_all')

In [None]:
# Summary statistics of the function's output
new_style_by_reviews.describe()

In [None]:
# For comparison: summary statistics of the manually built rating columns
style_by_reviews.describe()

Looks like we have a winner.

Next up, put this function in the src module (ideally in `localdata.py`, or a transformer version of that). Currently it should go into `transformers.py`.

In [None]:
# List the transformers the workflow offers (the function added to transformers.py should appear here)
workflow.available_transformers()

In [None]:
# Datasets available before the new transformer is registered
workflow.available_datasets()

In [None]:
# (transformer name, keyword arguments) pair describing the transformation step.
# NOTE(review): the function prototyped in this notebook is named
# `to_beer_style_by_review`; confirm 'add_srm_to_reviews' is the name it is
# registered under in src.
srm_transform = (
    'add_srm_to_reviews',
    dict(srm_dset_name='beer_styles_srm_all'),
)

In [None]:
# Add the new transformation: apply `srm_transform` to 'beer_review_all'
# and name the resulting dataset 'beer_style'
workflow.add_transformer(input_dataset='beer_review_all',
                         transformations=[srm_transform],
                         output_dataset='beer_style')

In [None]:
# Re-check the available datasets after registering the transformer
workflow.available_datasets()

In [None]:
# Re-check the available transformers after registering the new one
workflow.available_transformers()

In [None]:
# Show the workflow's transformer list (the entry added above should be in it)
workflow.get_transformer_list()

In [None]:
# Uncomment to delete a previously added transformer (the argument is presumably
# its position in workflow.get_transformer_list() -- confirm before running):
#workflow.del_transformer(2)

In [None]:
# Run the workflow's data build; the 'beer_style' dataset loaded below is expected to come from this step
workflow.make_data()

In [None]:
# Check the available datasets again after the build
workflow.available_datasets()

In [None]:
# Load the generated dataset.
# NOTE: this rebinds `beer_style`, which earlier in the notebook was a DataFrame,
# to a Dataset object (its table is under `.data`).
beer_style = Dataset.load('beer_style')

In [None]:
# The table held by the generated dataset
beer_style.data

In [None]:
# The metadata stored alongside the generated dataset
beer_style.metadata