In [9]:
import requests
from bs4 import BeautifulSoup
from lxml import html
import pandas as pd
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set( stopwords.words('english'))
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import spacy 
import numpy as np
import string
import vaderSentiment

### Cleaning the data

In [12]:
reviews_df = pd.read_csv('reviews.csv')
reviews_df[:10]

Unnamed: 0,product_name,product_review,user_rating
0,Kentucky Brunch Brand Stout,"2016 Silver Wax. Aroma has whiskey, maple, tof...",4.8
1,Kentucky Brunch Brand Stout,The beer pours Pitch Black with a frothy tan h...,4.74
2,Kentucky Brunch Brand Stout,Probably the smoothest beer I have ever had. S...,4.68
3,Kentucky Brunch Brand Stout,"Dark black, very thick, a little bit of tan he...",5.0
4,Kentucky Brunch Brand Stout,Poured black as ink with thin ruby edges at 58...,4.97
5,Kentucky Brunch Brand Stout,she’s got heat....but man is she somethin nice...,4.75
6,Kentucky Brunch Brand Stout,Amazing brew. The maple aroma pours out of thi...,5.0
7,Kentucky Brunch Brand Stout,Finally got to try the white whale. Pours like...,4.3
8,Kentucky Brunch Brand Stout,Had this on tap at the 12-15-18 release in Dec...,4.85
9,Kentucky Brunch Brand Stout,Had this on tap At the KBBS release 12/15/18 p...,4.89


In [14]:
#Checking if there are 25 reviews of each product
reviews_df.groupby(['product_name']).count()

Unnamed: 0_level_0,product_review,user_rating
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1
3rd Anniversary Imperial IPA,25,25
4th Anniversary,22,22
A Deal With The Devil,25,25
Aaron,25,25
Abner,25,25
Abrasive Ale,25,25
Abraxas,25,25
Abricot Du Fermier,25,25
Abt 12,25,25
Adam From The Wood,25,25


In [15]:
reviews_df.columns

Index(['product_name', 'product_review', 'user_rating'], dtype='object')

In [18]:
reviews_df = reviews_df.dropna()
reviews_df['cleaned_review'] = reviews_df['product_review'].apply(lambda x :x.translate(str.maketrans('', '', string.punctuation)))
reviews_df['cleaned_review'] = reviews_df['cleaned_review'].apply(lambda x :x.lower())

reviews_df['cleaned_review'] = reviews_df['cleaned_review'].apply(word_tokenize).apply(set).apply(list)

def remove_stopwords(s):
    return [w for w in s if not w in stop_words] 
    
reviews_df['cleaned_review'] =  reviews_df['cleaned_review'].apply(remove_stopwords)

### Word Frequency Analysis and Attribute Selection

In [19]:
#Creating a combined list of all words
all_words = []
for i in range(len(reviews_df)):
    all_words+=reviews_df['cleaned_review'][i]
from nltk import FreqDist
word_freq = nltk.FreqDist(all_words)

In [20]:
word_freq.most_common()

[('head', 3407),
 ('beer', 2785),
 ('taste', 2595),
 ('pours', 1725),
 ('nose', 1666),
 ('sweet', 1661),
 ('dark', 1621),
 ('like', 1598),
 ('one', 1596),
 ('carbonation', 1586),
 ('finish', 1547),
 ('chocolate', 1536),
 ('aroma', 1517),
 ('nice', 1445),
 ('mouthfeel', 1444),
 ('good', 1430),
 ('well', 1418),
 ('body', 1396),
 ('overall', 1374),
 ('light', 1355),
 ('lacing', 1352),
 ('bottle', 1349),
 ('medium', 1314),
 ('notes', 1310),
 ('black', 1290),
 ('flavor', 1285),
 ('smooth', 1235),
 ('white', 1224),
 ('vanilla', 1184),
 ('bit', 1167),
 ('color', 1166),
 ('fruit', 1148),
 ('bourbon', 1145),
 ('little', 1141),
 ('glass', 1136),
 ('great', 1130),
 ('orange', 1119),
 ('flavors', 1097),
 ('really', 1097),
 ('thick', 1096),
 ('feel', 1093),
 ('poured', 1077),
 ('smell', 1069),
 ('coffee', 1043),
 ('citrus', 1007),
 ('bitterness', 944),
 ('barrel', 932),
 ('oak', 903),
 ('brown', 892),
 ('creamy', 874),
 ('much', 870),
 ('sweetness', 856),
 ('hazy', 825),
 ('malt', 818),
 ('dry', 78

Selected attributes: 
1. **Crisp** - Highly carbonated; effervescent
2. **Hoppy** - Herbal, earthy, spicy, or citric aromas and flavors of hops
3. **Robust** - Rich and full-bodied


### Similarity analysis with the 3-attribute set and the reviews.

In [33]:
import spacy
import en_core_web_lg
nlp = en_core_web_lg.load()

In [34]:
def join_words(comment):   
    """Joins the tokenized words to a sentence"""
    return " ".join(comment) 

reviews_df['joined_review'] = reviews_df['cleaned_review'].map(join_words)

In [35]:
#Calculate similarity with pre-processing functions
def calculate_similarity(comment):
    """Compute similarity score"""
    base = nlp(comment)
    compare = nlp(input_attributes)
    return base.similarity(compare)

In [36]:
input_list = ['crisp', 'hoppy', 'robust']
input_attributes =  " ".join(input_list)
reviews_df['similarity'] = reviews_df['joined_review'].map(calculate_similarity)

In [37]:
reviews_df.head(10)

Unnamed: 0,product_name,product_review,user_rating,cleaned_review,joined_review,similarity
0,Kentucky Brunch Brand Stout,"2016 Silver Wax. Aroma has whiskey, maple, tof...",4.8,"[barrel, taste, umami, cacao, aroma, toffee, w...",barrel taste umami cacao aroma toffee wood map...,0.635786
1,Kentucky Brunch Brand Stout,The beer pours Pitch Black with a frothy tan h...,4.74,"[creams, vanilla, frothy, pours, bottle, tan, ...",creams vanilla frothy pours bottle tan feel bl...,0.602877
2,Kentucky Brunch Brand Stout,Probably the smoothest beer I have ever had. S...,4.68,"[ever, alcohol, vanilla, taste, barrel, notes,...",ever alcohol vanilla taste barrel notes probab...,0.592445
3,Kentucky Brunch Brand Stout,"Dark black, very thick, a little bit of tan he...",5.0,"[ever, alcohol, either, blow, vanilla, sweet, ...",ever alcohol either blow vanilla sweet taste l...,0.53649
4,Kentucky Brunch Brand Stout,Poured black as ink with thin ruby edges at 58...,4.97,"[astringency, fizzy, strong, velvety, whole, b...",astringency fizzy strong velvety whole booze 5...,0.63883
5,Kentucky Brunch Brand Stout,she’s got heat....but man is she somethin nice...,4.75,"[somethin, barrel, nose, booze, flavor, maple,...",somethin barrel nose booze flavor maple molass...,0.584269
6,Kentucky Brunch Brand Stout,Amazing brew. The maple aroma pours out of thi...,5.0,"[brew, pours, bottle, aroma, brownish, af, bla...",brew pours bottle aroma brownish af black carb...,0.626235
7,Kentucky Brunch Brand Stout,Finally got to try the white whale. Pours like...,4.3,"[breweries, pepper, like, think, pours, tg, gr...",breweries pepper like think pours tg green sme...,0.481698
8,Kentucky Brunch Brand Stout,Had this on tap at the 12-15-18 release in Dec...,4.85,"[barrel, release, high, maybe, 121518, good, m...",barrel release high maybe 121518 good mix arom...,0.589091
9,Kentucky Brunch Brand Stout,Had this on tap At the KBBS release 12/15/18 p...,4.89,"[great, barrel, taste, release, silky, 121518,...",great barrel taste release silky 121518 attemp...,0.630627


In [38]:
top300_reviews = reviews_df.sort_values(by='similarity', ascending=False)[0:300]

In [39]:
top300_reviews

Unnamed: 0,product_name,product_review,user_rating,cleaned_review,joined_review,similarity
2487,Notorious Triple IPA,Had on tap. Just a luscious and delicious dipa...,4.49,"[hops, citrus, delicious, bready, backbone, 12...",hops citrus delicious bready backbone 12 hazy ...,0.797940
2290,Flora Plum,"2018 release. A: Pours a very light pale, haz...",4.44,"[earthy, release, spice, sourness, delicious, ...",earthy release spice sourness delicious minima...,0.795650
4107,Saison Bernice,Tasted from bottle. Classic farmhouse saison. ...,4.44,"[earthy, mouthfeel, citrus, pours, bottle, cri...",earthy mouthfeel citrus pours bottle crisp fun...,0.794142
5609,Vicinity,Pours a golden peach with a fluffy fast dissip...,4.31,"[hops, crackery, fruits, backbone, pours, soft...",hops crackery fruits backbone pours soft crisp...,0.792530
4330,Bad Boy,Shared from a growler with Ryan. Pours a prett...,4.25,"[hops, easy, tongue, citrus, pretty, backbone,...",hops easy tongue citrus pretty backbone pours ...,0.792521
1691,Doppelganger,"Dark, murky orange in color with an unfiltered...",4.43,"[great, murky, sweet, warmth, easy, citrus, si...",great murky sweet warmth easy citrus silky not...,0.790991
5784,Axe Man,16oz can dated 9/6/19. Pours clear gold with ...,4.25,"[16oz, balancing, finishes, mouthfeel, clear, ...",16oz balancing finishes mouthfeel clear haze c...,0.790588
1044,King Sue,"16 oz can. Packaged 8/28/19, best by 12/26/19....",3.03,"[hops, reserved, sweet, mouthfeel, 122619, pou...",hops reserved sweet mouthfeel 122619 pours 828...,0.790280
1717,Ephraim,"Bright, refereshing, peppery, orange citrus, p...",4.60,"[great, refereshing, citrus, big, piney, brigh...",great refereshing citrus big piney bright 10 w...,0.789249
1557,Society & Solitude #4,"06/2014 - Massive pine, some tropical fruit, g...",4.44,"[sharp, 062014, massive, flavorful, hoppiness,...",sharp 062014 massive flavorful hoppiness bitte...,0.788851


### Sentiment analysis on top 300 reviews and Sorting them (high to low) by the sentiment scores.

In [40]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [41]:
def sentiment_analyzer_scores(sentence):
    """Generate seniment score"""
    scores = analyser.polarity_scores(sentence)
    return scores['compound']

top300_reviews['sentiment'] = top300_reviews['product_review'].map(sentiment_analyzer_scores)

In [43]:
top300_reviews_sentiment = top300_reviews.sort_values(by='sentiment', ascending=False)

In [44]:
top300_reviews_sentiment

Unnamed: 0,product_name,product_review,user_rating,cleaned_review,joined_review,similarity,sentiment
3458,Darkness,"750 ml bottle into snifter, bottled on 9/25/20...",4.36,"[example, warms, old, great, huge, modestly, g...",example warms old great huge modestly grass po...,0.735552,0.9976
1626,Oude Geuze Vintage,"750ml bottle, 2008 vintage, poured into a Drie...",4.58,"[great, spice, pours, aroma, special, green, j...",great spice pours aroma special green journal ...,0.742722,0.9972
4930,Ghost In The Machine,"12 ounce bottle into tulip glass, bottled on 8...",4.28,"[example, great, 8, huge, pours, highly, danke...",example great 8 huge pours highly dankearthy e...,0.744781,0.9966
5044,Emerald Grouper,"16 ounce can into tulip glass, canned on 9/24/...",4.23,"[example, great, astringency, huge, pours, hig...",example great astringency huge pours highly 95...,0.764206,0.9961
3291,Mocha Wednesday,"750 ml bottle into snifter, bottled on 1/27/20...",4.20,"[warms, great, almond, still, khaki, huge, pou...",warms great almond still khaki huge pours brue...,0.732166,0.9959
4654,Art,"375ml bottle, Batch 5, dated March 18, 2015, p...",4.67,"[vinous, great, beautifully, spice, march, ber...",vinous great beautifully spice march berry pou...,0.735175,0.9951
3512,Montmorency Vs Balaton,500 ml bottle into signature tulip glass; Blen...,4.37,"[example, great, tier, huge, modestly, grass, ...",example great tier huge modestly grass pours h...,0.735357,0.9950
5315,Flora,"750ml bottle, Batch #6, poured into a Hill Far...",4.51,"[example, great, another, berry, pours, shauns...",example great another berry pours shauns oaky ...,0.745116,0.9948
5827,Thicket,"375ml bottle, generously shared by Erik, poure...",4.19,"[used, pours, blackberry, 375ml, aroma, offers...",used pours blackberry 375ml aroma offers yeast...,0.742931,0.9945
2497,Notorious Triple IPA,had this on tap at the mule bar in Winooski VT...,4.34,"[isnt, still, bar, triple, necessarily, maybe,...",isnt still bar triple necessarily maybe pours ...,0.743096,0.9939


### Recommend 3 beers to the customer based on similarity and sentiment

In [45]:
top300_reviews_combined = top300_reviews.groupby('product_name')[['similarity','sentiment']].mean()
top300_reviews_combined['recommend'] = top300_reviews_combined['similarity']+top300_reviews_combined['sentiment']

In [46]:
top300_reviews_combined.sort_values(by='recommend', ascending=False)[0:3]

Unnamed: 0_level_0,similarity,sentiment,recommend
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Emerald Grouper,0.764206,0.9961,1.760306
Flora,0.747172,0.9906,1.737772
Thicket,0.742931,0.9945,1.737431


### Recommend 3 beers to the customer based on similarity, sentiment and user rating

In [56]:
reviews_df['sentiment'] = reviews_df['product_review'].map(sentiment_analyzer_scores)

In [58]:
reviews_df_combined = reviews_df.groupby('product_name')[['similarity','sentiment', 'user_rating']].mean()
reviews_df_combined['recommend'] = reviews_df_combined['similarity']+reviews_df_combined['sentiment']

In [61]:
reviews_df_combined.sort_values(by = 'user_rating', ascending = False)[0:3]

Unnamed: 0_level_0,similarity,sentiment,user_rating,recommend
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chemtrailmix,0.591402,0.883917,4.816667,1.475318
Kentucky Brunch Brand Stout,0.568817,0.787348,4.812,1.356165
It Was All A Dream,0.578521,0.600915,4.785,1.179436


In [65]:
reviews_df_combined['recommend'].min()

1.1332525622299112

### Difference in the two recommendation methods
The recommendations change quite a bit between the two methods. One major reason is because we are using the entire dataset of reviews rather than those that are most similar to the attributes we are looking for. We are looking at all reviews for all products, regardless of the review's relevance to the input attributes (we used 'crisp', 'hoppy', and 'robust'). In general for something as subjective as beer preferences, using all of the reviews is not going to recommend anything personal to the user. Just because a lot of people like it doesn't mean that everyone likes it. For example, it's a hot topic right now whether or not pineapple should be a pizza topping. There are millions of people who love pineapple on pizza and will take to the internet to explain why it's great, but that doesn't mean that I will like pineapple pizza. I might prefer my pizza to be more savory and meaty rather than sweet, and that's exactly what is going on here. Just because lots of people like IPAs and that they are highly rated doesn't mean that everyone will like them- I've spat out a highly rated IPA before because I thought it was way to bitter and intense, which is why a lot of people liked it. This isn't about functionality- it's about taste, which differs person to person. That is why recommending the top rated beers without paying attention to similarity and sentiment isn't a good idea- you can see from the output above that the three top rated beers overall do not all fit well with our input attributes. One of them is actually one of the worst recommendation scores that we have in the dataset with a score of 1.179436. The other two perform closer to the mean recommendation of the set. Ultimately, blindly recommending high-rated beers will not meet the requirements of the user looking for recommendations. 