# Initial EDA

As always, let's start by importing our basic libraries.

## Imports and setup

In [38]:
# AWS SDK, used below to open the DynamoDB 'beers' table.
# Key and Attr are not referenced in the visible cells of this notebook.
import boto3
from boto3.dynamodb.conditions import Key, Attr

# TF-IDF vectorization and NMF factorization, used on the review text.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import pandas as pd
# describe_df builds the profiling summary shown for beer_df.
from pandas_profiling import describe_df

import numpy as np

import holoviews as hv

# pprint is not referenced in the visible cells of this notebook.
from pprint import pprint

# Project-local data-access helpers (get_beer_df, get_reviews_df are called below).
import src.db_access as db
# Load the bokeh plotting backend for holoviews.
hv.extension('bokeh')

## Data Retrieval

In [5]:
# Open a handle to the DynamoDB 'beers' table.
# Neither `dynamodb` nor `table` is used by the visible cells below; the data is
# loaded through the `db` helper module instead.
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('beers')

In [6]:
# Load the beer table as a DataFrame through the project helper.
# NOTE(review): presumably this reads the DynamoDB 'beers' table; the helper is not shown here.
beer_df = db.get_beer_df()

In [7]:
# Preview the first rows to check column names and value formats.
beer_df.head()

Unnamed: 0,abv,beer,beer_url,brewery,brewery_profile_url,gots,note,pdev,rank,rating,ratings_count,review_count,style,trade,wants
0,8.6,lagunitas imperial pils,https://www.beeradvocate.com/beer/profile/220/...,lagunitas brewing company,/beer/profile/220/,85.0,None provided.,9.16,13465.0,3.93,343.0,104.0,american imperial pilsner,0.0,13.0
1,7.5,1901 red ale,https://www.beeradvocate.com/beer/profile/1728...,bold city brewery,/beer/profile/17284/,8.0,None provided.,12.05,16493.0,3.9,74.0,22.0,american amber / red ale,0.0,1.0
2,4.5,conundrum,https://www.beeradvocate.com/beer/profile/2097...,kuhnhenn brewing company,/beer/profile/2097/,1.0,"Deep amber in color, this English session bier...",10.82,25095.0,3.79,21.0,6.0,english dark mild ale,0.0,0.0
3,0.5,busch na,https://www.beeradvocate.com/beer/profile/29/1...,anheuser-busch,/beer/profile/29/,8.0,"Fully brewed with finest natural ingredients, ...",37.44,47674.0,2.19,99.0,53.0,low alcohol beer,0.0,2.0
4,4.9,churchville lager,https://www.beeradvocate.com/beer/profile/2902...,neshaminy creek brewing company,/beer/profile/29021/,47.0,Pennsylvania breweries are known for world-cla...,10.24,30839.0,3.71,173.0,36.0,vienna lager,0.0,7.0


In [5]:
# pandas_profiling summary: a dict with table-level counts ('table') and a
# per-variable statistics frame ('variables').
describe_df(beer_df)

{'table': {'n': 6444,
  'nvar': 16,
  'total_missing': 0.004073556797020484,
  'n_duplicates': 0,
  'memsize': '805.6 KiB',
  'recordsize': '128.0 B',
  'NUM': 8,
  'DATE': 0,
  'CONST': 0,
  'CAT': 5,
  'UNIQUE': 1,
  'CORR': 2,
  'RECODED': 0,
  'BOOL': 0,
  'UNSUPPORTED': 0,
  'REJECTED': 2},
 'variables':                        type correlation_var correlation count distinct_count  \
 abv                     NUM             NaN         NaN  6255            215   
 beer                    CAT             NaN         NaN  6444           6395   
 beer_url             UNIQUE             NaN         NaN  6444           6444   
 brewery                 CAT             NaN         NaN  6444           2290   
 brewery_profile_url     CAT             NaN         NaN  6444           2294   
 gots                    NUM             NaN         NaN  6444            451   
 index                   NUM             NaN         NaN  6444           2665   
 note                    CAT             N

In [6]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
beer_df.describe()

Unnamed: 0,abv,gots,pdev,rank,rating,ratings_count,review_count,trade,wants
count,6255.0,6444.0,6444.0,6213.0,6444.0,6444.0,6444.0,6444.0,6444.0
mean,6.552331,53.601955,10.811746,19689.172541,3.872093,253.738827,72.177219,1.515829,47.223929
std,2.165266,286.537749,5.293145,13430.450702,0.399402,878.730386,238.140096,14.709835,282.84006
min,0.05,0.0,1.62,1.0,1.25,10.0,0.0,0.0,0.0
25%,5.1,2.0,7.57,8470.0,3.74,17.0,5.0,0.0,0.0
50%,6.0,5.0,9.77,18429.0,3.9,36.0,11.0,0.0,2.0
75%,7.5,18.0,12.37,28498.0,4.1,123.0,37.0,0.0,10.0
max,30.86,7935.0,55.25,47748.0,4.93,17425.0,3996.0,734.0,9572.0


In [7]:
# Dtypes and non-null counts per column. In the recorded output only `abv` and
# `rank` have missing values.
# NOTE(review): the recorded index spans 0..1135 for 6444 rows, so index labels
# repeat — consider reset_index(drop=True) before label-based lookups.
beer_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6444 entries, 0 to 1135
Data columns (total 15 columns):
abv                    6255 non-null float64
beer                   6444 non-null object
beer_url               6444 non-null object
brewery                6444 non-null object
brewery_profile_url    6444 non-null object
gots                   6444 non-null float64
note                   6444 non-null object
pdev                   6444 non-null float64
rank                   6213 non-null float64
rating                 6444 non-null float64
ratings_count          6444 non-null float64
review_count           6444 non-null float64
style                  6444 non-null object
trade                  6444 non-null float64
wants                  6444 non-null float64
dtypes: float64(9), object(6)
memory usage: 805.5+ KB


In [8]:
# List the column names; they are used as holoviews dimensions in the next cell.
beer_df.columns

Index(['abv', 'beer', 'beer_url', 'brewery', 'brewery_profile_url', 'gots',
       'note', 'pdev', 'rank', 'rating', 'ratings_count', 'review_count',
       'style', 'trade', 'wants'],
      dtype='object')

In [9]:
# Wrap the frame as a holoviews Dataset: `rank` and `rating` are the value
# dimensions; every other column is declared as a key dimension.
beer_dataset = hv.Dataset(
    beer_df,
    kdims=[
        'abv', 'beer', 'beer_url', 'brewery', 'brewery_profile_url', 'gots',
        'note', 'pdev', 'ratings_count', 'review_count',
        'style', 'trade', 'wants',
    ],
    vdims=['rank', 'rating'],
)

## Let's start visualizing

In [10]:
# Show the Dataset repr to confirm which columns are key [..] and value (..) dimensions.
beer_dataset

:Dataset   [abv,beer,beer_url,brewery,brewery_profile_url,gots,note,pdev,ratings_count,review_count,style,trade,wants]   (rank,rating)

In [11]:
# Rating against ABV, grouped by beer style so each style gets its own scatter.
# Note: the following cell rebinds the name `scatter` to the ungrouped plot.
scatter = beer_dataset.to(
    hv.Scatter,
    kdims='abv',
    vdims='rating',
    groupby=['style'],
)
scatter.opts(tools=['hover'])


In [12]:
# Rating against ABV for all beers in a single scatter (no grouping).
scatter = beer_dataset.to(
    hv.Scatter,
    kdims='abv',
    vdims='rating',
    groupby=[],
)
scatter.opts(tools=['hover'])

In [13]:
# Scatter whose first value dimension is review_count, colored by rating, with a
# log-scaled y axis. No kdims is passed, so holoviews picks the key dimension
# from the Dataset itself.
scatter2 = beer_dataset.to(
    hv.Scatter,
    vdims=['review_count', 'abv', 'rating'],
    groupby=[],
)
scatter2.opts(
    width=600,
    height=600,
    color='rating',
    colorbar=True,
    logx=False,
    logy=True,
)




In [14]:
# Compose the two scatter plots into one holoviews layout.
scatter+scatter2



In [15]:
# Box-and-whisker of rating per style. `style` is both the key dimension and the
# groupby key, so each group holds the box for a single style.
rating_boxwhiskers = beer_dataset.to(
    hv.BoxWhisker,
    kdims=['style'],
    vdims='rating',
    groupby=['style'],
)
rating_boxwhiskers.opts(width=250, height=500, ylim=(2, 5))

In [16]:
# Expand the grouped box plots into a layout and wrap it at 5 columns per row.
layout = rating_boxwhiskers.layout()
layout.cols(5)

In [17]:
# Average the value dimensions per style, then plot the mean rating of each style.
beer_agg = beer_dataset.aggregate(dimensions='style', function=np.mean)
agg_plot = beer_agg.to(hv.Scatter, kdims='style', vdims='rating')
agg_plot.opts(
    title='Mean Rating by Style',
    width=750,
    height=750,
    size=5,
    xrotation=75,
    ylim=(2, 5.25),
    tools=['hover'],
)

## Reviews

In [16]:
# Draw 500 random beers whose reviews are fetched further down.
# random_state pins the draw so Restart & Run All yields the same sample; without
# it every run picks different beers (the recorded outputs of the following cells
# show two different draws), which makes the fetched reviews and the saved CSVs
# irreproducible.
beer_sample = beer_df.sample(500, random_state=42)

In [9]:
# Preview the sampled rows.
beer_sample.head()

Unnamed: 0,abv,beer,beer_url,brewery,brewery_profile_url,gots,note,pdev,rank,rating,ratings_count,review_count,style,trade,wants
367,6.0,rogue farms good chit pilsner,https://www.beeradvocate.com/beer/profile/132/...,rogue ales,/beer/profile/132/,38.0,None provided.,13.76,25391.0,3.78,403.0,98.0,bohemian pilsener,0.0,26.0
1140,12.5,bishop's barrel 23,https://www.beeradvocate.com/beer/profile/337/...,saint arnold brewing company,/beer/profile/337/,0.0,None provided.,2.73,,4.4,13.0,3.0,german doppelbock,0.0,0.0
1886,5.2,half acre lager,https://www.beeradvocate.com/beer/profile/1800...,half acre beer company,/beer/profile/18006/,2.0,We open the Half Acre time capsule and pull ou...,11.45,37748.0,3.58,60.0,31.0,american amber / red lager,0.0,4.0
1612,5.5,firefly,https://www.beeradvocate.com/beer/profile/3116...,bad weather brewing company,/beer/profile/31165/,6.0,None provided.,7.99,18066.0,3.88,59.0,13.0,rye beer,0.0,0.0
900,5.6,lotus ipa,https://www.beeradvocate.com/beer/profile/2424...,ilkley brewery co.,/beer/profile/24247/,0.0,None provided.,10.65,22403.0,3.85,12.0,5.0,english india pale ale (ipa),0.0,1.0


In [17]:
# Build the review lookup key as "<brewery> <beer>" and preview it.
beer_sample['review_id'] = beer_sample['brewery'].add(' ').add(beer_sample['beer'])
beer_sample['review_id'].head()

268               franciscan well micro brewery rebel red
859     edmund's oast brewing company lord proprietor'...
144                magic hat brewing company mother lager
1388    shawneecraft brewing company bourbon barrel po...
2481                    persephone brewing keller pilsner
Name: review_id, dtype: object

In [18]:
# Fetch the reviews for every sampled beer, keyed by the "<brewery> <beer>" id.
# NOTE(review): the recorded outputs show 10000 rows whose index labels are all 0,
# and row 500 of the reloaded CSV is identical to row 0. It looks like the same
# reviews may be returned for every id — verify get_reviews_df before modeling.
reviews = db.get_reviews_df(list(beer_sample['review_id']))

In [65]:
# Reload previously saved reviews from disk.
# NOTE(review): the export cell in this notebook writes 'reviews_df.csv' (no
# 'data/' prefix) — confirm this path points at the intended file.
# NOTE(review): the recorded output contains "Unnamed: 0" columns, i.e. the saved
# index was read back as data; presumably index_col=0 or an index-free export is wanted.
test = pd.read_csv('data/reviews_df.csv')

In [66]:
# Preview the reloaded reviews.
test.head()

Unnamed: 0.1,Unnamed: 0,author,beer,feel,look,overall,review_id,smell,taste,text
0,0,/community/members/anspailpin.1211201/,rebel red,4.25,5.0,5.0,franciscan well micro brewery rebel red 0,4.5,5.0,4.81/5 rDev +29.6%look: 5 | smell: 4.5 | tast...
1,0,/community/members/cwbern.989413/,rebel red,3.5,4.25,3.75,franciscan well micro brewery rebel red 1,3.25,3.75,3.64/5 rDev -1.9%look: 4.25 | smell: 3.25 | t...
2,0,/community/members/stonedtrippin.601042/,rebel red,3.75,3.5,3.75,franciscan well micro brewery rebel red 2,3.75,3.75,3.74/5 rDev +0.8%look: 3.5 | smell: 3.75 | ta...
3,0,/community/members/ciocanelu.691982/,rebel red,3.25,3.75,3.5,franciscan well micro brewery rebel red 3,3.75,3.25,3.45/5 rDev -7%look: 3.75 | smell: 3.75 | tas...
4,0,/community/members/fallenshadow.744524/,rebel red,4.0,5.0,5.0,franciscan well micro brewery rebel red 4,4.5,5.0,4.78/5 rDev +28.8%look: 5 | smell: 4.5 | tast...


In [57]:
# NOTE(review): duplicate of the preceding cell. Its recorded output comes from an
# earlier execution (In [57] vs In [66]) and shows one fewer "Unnamed" column.
test.head()

Unnamed: 0,author,beer,feel,look,overall,review_id,smell,taste,text
0,/community/members/anspailpin.1211201/,rebel red,4.25,5.0,5.0,franciscan well micro brewery rebel red 0,4.5,5.0,4.81/5 rDev +29.6%look: 5 | smell: 4.5 | tast...
0,/community/members/cwbern.989413/,rebel red,3.5,4.25,3.75,franciscan well micro brewery rebel red 1,3.25,3.75,3.64/5 rDev -1.9%look: 4.25 | smell: 3.25 | t...
0,/community/members/stonedtrippin.601042/,rebel red,3.75,3.5,3.75,franciscan well micro brewery rebel red 2,3.75,3.75,3.74/5 rDev +0.8%look: 3.5 | smell: 3.75 | ta...
0,/community/members/ciocanelu.691982/,rebel red,3.25,3.75,3.5,franciscan well micro brewery rebel red 3,3.75,3.25,3.45/5 rDev -7%look: 3.75 | smell: 3.75 | tas...
0,/community/members/fallenshadow.744524/,rebel red,4.0,5.0,5.0,franciscan well micro brewery rebel red 4,4.5,5.0,4.78/5 rDev +28.8%look: 5 | smell: 4.5 | tast...


In [20]:
# Summary statistics for the numeric review scores (feel, look, overall, smell, taste).
reviews.describe()

Unnamed: 0,feel,look,overall,smell,taste
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,3.6875,3.8875,3.8875,3.7125,3.8375
std,0.529619,0.556102,0.567231,0.64363,0.587795
min,2.75,3.0,3.0,2.5,3.0
25%,3.4375,3.5,3.5,3.25,3.5
50%,3.5,3.875,3.75,3.875,3.75
75%,4.0,4.0,4.0,4.0,4.0
max,5.0,5.0,5.0,5.0,5.0


In [21]:
# Dtypes and non-null counts for the reviews frame.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 0
Data columns (total 9 columns):
author       10000 non-null object
beer         10000 non-null object
feel         10000 non-null float64
look         10000 non-null float64
overall      10000 non-null float64
review_id    10000 non-null object
smell        10000 non-null float64
taste        10000 non-null float64
text         10000 non-null object
dtypes: float64(5), object(4)
memory usage: 781.2+ KB


## Testing: CSV export and NMF topic-model experiments

In [18]:
# Print holoviews' help for Scatter (its style and plot options).
# NOTE(review): exploratory lookup — candidate for removal from the final notebook.
hv.help(hv.Scatter)

Scatter

Online example: http://holoviews.org/reference/elements/bokeh/Scatter.html

[1;35m-------------
Style Options
-------------[0m

	alpha, angle, cmap, color, fill_alpha, fill_color, hover_alpha, hover_color, hover_fill_alpha, hover_fill_color, hover_line_alpha, hover_line_color, line_alpha, line_cap, line_color, line_dash, line_join, line_width, marker, muted_alpha, muted_color, muted_fill_alpha, muted_fill_color, muted_line_alpha, muted_line_color, nonselection_alpha, nonselection_color, nonselection_fill_alpha, nonselection_fill_color, nonselection_line_alpha, nonselection_line_color, palette, selection_alpha, selection_color, selection_fill_alpha, selection_fill_color, selection_line_alpha, selection_line_color, size

(Consult bokeh's documentation for more information.)

[1;35m------------
Plot Options
------------[0m

The plot options are the parameters of the plotting class:

[1;32mParameters of 'PointPlot'
[0m
[1;31mParameters changed from their default values are 

In [22]:
# Persist the fetched reviews so they can be reloaded without another fetch.
# The index is omitted: every row of `reviews` carries the label 0 (see the
# reviews.info() output), so writing it adds no information and only produces a
# spurious "Unnamed: 0" column when the CSV is read back.
reviews.to_csv('reviews_df.csv', index=False)

In [23]:
# Persist the sampled beers next to the reviews (the row index is written as the first column).
beer_sample.to_csv('beer_sample.csv')

In [24]:
# Preview the raw review text. In the recorded output each entry starts with a
# score header ("x/5 rDev ... look: ... | smell: ...").
reviews['text'].head()

0    4.81/5  rDev +29.6%look: 5 | smell: 4.5 | tast...
0    3.64/5  rDev -1.9%look: 4.25 | smell: 3.25 | t...
0    3.74/5  rDev +0.8%look: 3.5 | smell: 3.75 | ta...
0    3.45/5  rDev -7%look: 3.75 | smell: 3.75 | tas...
0    4.78/5  rDev +28.8%look: 5 | smell: 4.5 | tast...
Name: text, dtype: object

In [25]:
# Collect the review texts into a plain Python list, one document per review.
documents = reviews['text'].tolist()

In [33]:
# TF-IDF vectorizer with library defaults.
# NOTE(review): the review text begins with a score header (see the preview of
# reviews['text']), so those header tokens will enter the vocabulary unless they
# are stripped first.
vectorizer = TfidfVectorizer()

In [34]:
# Fit the vectorizer on the review texts and build the document-term matrix.
doc_term = vectorizer.fit_transform(documents)

In [44]:
# NMF model that factorizes the TF-IDF matrix into 10 components (topics).
# random_state is fixed so the factorization, and therefore the topic weights
# inspected below, are reproducible across kernel restarts.
nmf = NMF(n_components=10, random_state=0)

In [45]:
# Fit the factorization; W holds one row of component weights per document
# (10 values per row, matching n_components).
W = nmf.fit_transform(doc_term)

In [69]:
# Component weights for the document at position 500.
W[500]

array([0.08513169, 0.        , 0.        , 0.        , 0.00410748,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [67]:
# Inspect the first reloaded review, for comparison with row 500 below.
test.iloc[0]

Unnamed: 0                                                    0
author                   /community/members/anspailpin.1211201/
beer                                                  rebel red
feel                                                       4.25
look                                                          5
overall                                                       5
review_id             franciscan well micro brewery rebel red 0
smell                                                       4.5
taste                                                         5
text          4.81/5  rDev +29.6%look: 5 | smell: 4.5 | tast...
Name: 0, dtype: object

In [68]:
# Inspect the review at position 500.
# NOTE(review): the recorded output is identical to row 0 (same author, review_id
# and text), which suggests the saved reviews contain repeated rows.
test.iloc[500]

Unnamed: 0                                                    0
author                   /community/members/anspailpin.1211201/
beer                                                  rebel red
feel                                                       4.25
look                                                          5
overall                                                       5
review_id             franciscan well micro brewery rebel red 0
smell                                                       4.5
taste                                                         5
text          4.81/5  rDev +29.6%look: 5 | smell: 4.5 | tast...
Name: 500, dtype: object