In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
beers = pd.read_csv('beers-breweries-and-beer-reviews/beers.csv')
breweries = pd.read_csv('beers-breweries-and-beer-reviews/breweries.csv')
reviews = pd.read_csv('beers-breweries-and-beer-reviews/reviews.csv')

In [4]:
reviews['text'] = reviews['text'].replace(u'\xa0\xa0', '')
# subset to only reviews that have a text review
text_reviews = reviews.loc[reviews['text'] != '']
# subset data to exclude NaN's as well (only losing 164k reviews from the last subset)
text_no_nan = text_reviews.loc[text_reviews.smell.isna() == False]
# rename column name beer_id to id for easy joining
text_no_nan = text_no_nan.rename(columns={'beer_id':'id'})
# subset out retired beers
current_beers = beers.loc[beers['retired'] == 'f']
# merge text_no_nan with beers that are not retired
df = pd.merge(text_no_nan, current_beers, on='id')
# create a table with average ratings for each beer. Index/ID is the beer id
ratings = pd.DataFrame(df.groupby('id')['score'].mean())
# add a column tallying the # of reviews for that beer
ratings['no_of_ratings'] = df.groupby('id')['score'].count()
# subset ratings with only beers that have 10+ ratings
ratings = ratings.loc[ratings['no_of_ratings'] > 9]
# formatting
ratings = ratings.reset_index()
ratings = ratings.rename(columns={'score':'avg_score'})
# rejoin no of ratings onto df
df = df.merge(ratings, how='inner', on='id')
# make a dataframe of reviewers by usename, count the number of reviews they made
reviewers = pd.DataFrame(df.groupby('username')['id'].count())
# make a new feature, the average of all of their scores
reviewers['avg_usr_score'] = df.groupby('username')['score'].mean()
# subset reviewers to those with 5+ reviews. From 73k users to 25k.
reviewers = reviewers.loc[reviewers['id'] > 4] ## MAYBE I CAN PLAY WITH THIS #
# formatting 
reviewers = reviewers.rename(columns={'id':'tot_usr_rvw'})
# there's ~1400 users outsides of 2 STDs of the mean score, will subset them out
reviewers_sub = reviewers.loc[(reviewers['avg_usr_score'] >= 3.182) &\
                              (reviewers['avg_usr_score'] <= 4.665)]

# subset of df with beers that have 10+ reviews, and with reviewers that have 5+ reviews
# and an average rating of beers between 3.18 and 4.67
df_with_mins = df.merge(reviewers_sub, how = 'inner', on = 'username')

In [3]:
import re

In [4]:
cleaned_reviews = reviews[['text']]
cleaned_reviews['text'] = cleaned_reviews['text'].apply(lambda x: re.sub(r'\xa0', '', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
cleaned_reviews = cleaned_reviews.loc[cleaned_reviews.text != '']

In [11]:
pd.options.display.max_seq_items = 2000

In [13]:
cleaned_reviews.text[7]

' Pours a creamy opaque light straw yellow with a whispy frothy white head. Nose is of citrus skin and light pine. Taste is a thing of beauty! A nice citrus punch in the mouth. Both peeland zest as well as juice of oranges, grapefruit and lemon. Nice malt backbone although minimal presence, you know its there because the hoppyness of this beer is perfectly balanced. Meadium creamy body. Smooth to drink. Finishes with with a Nice pungent citrus bitterness that leaves you craving the next sip to start the process over again. By far my favorite Treehouse brew of the 4 or 5 I’ve had. '

In [16]:
cleaned_reviews.text[9073096]

" pours a hazy, almost cloudy, autumn gold. Apparently unfiltered. Rich, off-white head settles to a tight, solid cap of foam. Aroma is straight up Belgian yeast, with the overripe fruit esters of banana and melon skin, and a hint of spice. Taste has a nice balance of sweet honey malts and tangy, bubblegum-y yeast. Floral, lemony hops come in strong, with a surprising cilantro-like soapy bitterness. Any sweetness begins to yield to dry, crackery malts and tart apple. All that's missing is a nice wedge of brie. Bitterness in the finish lingers a bit long for my taste and is my only real complaint. Nice rustic quality to it that is suggestive of a farmhouse ale or biere de garde. Terrific, weighty mouthfeel for a regular BPA without become heavy or cloying. A brisk carbonation helps in that regard, as does the dry finish. A bit to bitter for the style perhaps, but pair it with some rich cheese, a baguette and some fig preserves and you have a great picnic. "

In [22]:
cleaned_reviews.text[5234643]

' Poured from bottle into a pint glass Appearance – The beer pours a deep brown-mahogany color with a two finger head of tan colored foam. The head has a fantastic level of retention, slowly fading to leave a great level of foamy lace on the sides of the glass. Smell – A rather sweet smell is the main constitute of the nose. Lots of caramel, toffee, and sweet potato are all there and quite large. There is also a little bit of a cocoa and a light herbal and dark fruit smell mixed with the rest. Taste – The taste begins much more roasted and bready and much less sweet then would have been anticipated from the nose. A roasted malt and lightly toasted bread taste greet the tongue with just a bit of caramel and sweet potato sweetness. The sweeter flavors increase as the flavor advances more toward the middle, with some flavors of a caramel and brown sugar nature joining into the taste as well. The sweet potato really kicks it up at the end of the taste and is joined by some herbal and light

In [25]:
cleaned_reviews.text[1186945]

" On-tap at Local Option. A - Pours black, tan head, thin ring, no lace or cap. S - Nice level of roast, chocolate, caramel, and coffee. There's a nice chocolate roasty sweetness that keeps me coming back for me. T - Dark chocolate, roast, coffee beans, oak, burnt sugars. The chocolate and roast once again have me in love with this one. The coffee notes are nice as well with a touch of vanilla. M - Medium body, good carbonation, smooth, rich, creamy finish. D - A really nice porter...I had to get another after my initial pour. Would def have again if it's floating around town...would probably even seek out just to have it again. "

Content Based NLP Stuff:

In [47]:
# This is the combined text for each beer. this skips most preprocessing below. 
df_joined = pd.read_pickle('joined_text_df.pkl')

In [48]:
lookup_df = df_joined[['id', 'brewery_id', 'city', 'state', 'country', 'brewery_name']]
lookup_dict = lookup_df.set_index('id').to_dict(orient='index')

In [129]:
# this is each review cleaned for Doc2Vec
cleaned_reviews_df = pd.read_pickle('cleaned_reviews_df.pkl')

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from sklearn.feature_extraction import text
from gensim.parsing.preprocessing import STOPWORDS as stop_words

In [4]:
# subsets reviews df, and then joins all text reviews for each individual beer together
df_joined = df_with_mins.copy()
df_joined['joined_text'] = df_joined.groupby('id')['text'].\
                               transform(lambda x: ''.join(x))

In [5]:
# removes duplicate beers, and subsets to just beer id, joined_text and rating info
# to be cleaned and then joined to beers df
df_joined_sub = df_joined[['id', 'joined_text', 'avg_score', 'no_of_ratings']].drop_duplicates(\
                                                                        subset='id')

In [6]:
# removes \xa0 remove text
df_joined_sub['joined_text'] = df_joined_sub['joined_text'].apply(lambda x: re.sub\
                                                                  (r'\xa0', '', x))

In [19]:
breweries_sub = breweries[['id','name', 'city', 'state', 'country']]
breweries_sub = breweries_sub.rename(columns={'id':'brewery_id', 'name':'brewery_name'})

In [20]:
beers_sub = beers[['id', 'name', 'brewery_id', 'style', 'abv']]

In [21]:
breweries_beer = pd.merge(beers_sub, breweries_sub, on = 'brewery_id')

In [26]:
joined_df = pd.merge(df_joined_sub, breweries_beer, on = 'id')

In [28]:
joined_df.to_pickle("./joined_text_df.pkl")

In [14]:
# make a separate df with each individual review, clean up \xa0's and then pickle
cleaned_reviews = df_with_mins[['id', 'text']]
cleaned_reviews['text'] = cleaned_reviews['text'].apply(lambda x: re.sub(r'\xa0', '', x))
# cleaned_reviews.to_pickle("./cleaned_reviews_df.pkl")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
cleaned_reviews.text[0]

' 0% 16 oz can. Funny story: As I finally walked in the doors after a 45 min wait in line and freezing temps the sweet sound of the Grateful Dead\'s Sugar Magnolia greeted me from the TreeHouse sound system. The bottom of the can reads: "Going where the wind goes, bloomin\' like a red rose" A white haze to the yellow and golden liquid. Thick and healthy, totally unfiltered. Brawny white foam cap, thick, all-white clumps. Huge lacing left over. The aroma has a very zesty citrus hop effect, mellon and mango, grainy earthiness, tropical fruit blend with a bitter to sweet effect, then a peppery kick at the end. Very aromatic. The flavor is just bursting with complex hops, zesty earthy tones, sweet orange, peppery malt, clean fresh feel and overall vibe. A crispy bite wakes you up, full and lush mouthfeel follows from a totally unfiltered expereince. The feel and flavor finishes with a fun, earthy, zesty dry bite. Tropical juicy, zesty citrus, zippy golden wheat malt, melons, rustic earthin

In [6]:
count_vect = CountVectorizer(stop_words='english')
counts = count_vect.fit_transform(df_joined_sub.joined_text)
cos_sim = cosine_similarity(counts, counts)

KeyboardInterrupt: 

In [0]:
indices = pd.Series(df_joined_sub.index)

In [29]:
def recommendations(beer_id, df, cos_sim = cos_sim):
    """
    Takes a beer id and cosine similarty matrix in as arguments and returns beers closely related to the input beer
    """
    # initializing the empty list of recommended movies
    recommended_beers = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == beer_id].index[0]
    
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cos_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:11].index)
    print(top_10_indexes)
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_beers.append(list(joined_df.name)[i])
        
    return recommended_beers

def tfidf_recs(beer_id, cos_sim = tfidf_cos):
    """
    Takes a beer id and cosine similarty matrix in as arguments and returns beers closely related to the input beer
    """
    # initializing the empty list of recommended movies
    recommended_beers = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == beer_id].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cos_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:21].index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_beers.append(list(beers_text.name)[i])
        
    return beers_text.name[beer_id], recommended_beers

NameError: name 'cos_sim' is not defined

In [32]:
joined_df.sample()

Unnamed: 0,id,joined_text,avg_score,no_of_ratings,name,brewery_id,style,abv,brewery_name,city,state,country
16302,65010,Consumed 4/13/12 Appearance: Dark caramel/lig...,3.607619,42,Perfect Tin Amber,24488,American Amber / Red Ale,4.5,Tin Roof Brewing Company,Baton Rouge,LA,US


In [172]:
import string
import re

RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)

def preprocess(text):
    ls = LancasterStemmer()
    # Remove all punctuation and make all lowercase 
    return ls.stem(RE_PUNCT.sub(" ", text)).lower().split()

### DOC2VEC

In [1]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec, Phrases
from gensim.parsing.preprocessing import STOPWORDS as stop_words
from gensim.utils import simple_preprocess
from sklearn.feature_extraction import text
from nltk.stem.lancaster import LancasterStemmer

In [2]:
letters = list('abcdefghijklmnopqrstuvwxyz')
numbers = list('0123456789')
words = ['oz', 'ml'] # ADD MORE
stop_words = stop_words.union(set(letters)).union(set(numbers)).union(set(words))
my_stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)

In [72]:
# use Phrases to let the model detect bigrams
# bigram = Phrases(map(preprocess, df_sub.text.tolist()),max_vocab_size=10000000)


In [3]:
def preprocessor(text):
    # uses gensim simple_preprocess and then removes stop words
    simple = simple_preprocess(text)
    result = [word for word in simple if not word in my_stop_words]
    return result

In [4]:
# this takes the document and 'text' as arguments
# makes words lowercase and splits them, and then adds the beer id as tag
# returns as TaggedDocument
def tag_docs(docs):
    results = docs.apply(lambda r: TaggedDocument(words=preprocessor(r['text']), tags=[r['id']]), axis=1)
    return results.tolist()

In [249]:
# subsetting for a test
df_sub = cleaned_reviews_df[:10]

In [8]:
# process reviews, turn into list of TaggedDocument objects, with beer id as tag
tagged_docs = tag_docs(cleaned_reviews_df)

In [9]:
## Use format like this 

model = Doc2Vec(dm=0, dbow_words=1, min_count=4, negative=3,
                hs=0, sample=1e-4, window=5, size=100, workers=8)

model.build_vocab(tagged_docs, progress_per = 100)



In [16]:
from gensim.models.callbacks import CallbackAny2Vec
class EpochLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1

In [17]:
epoch_logger = EpochLogger()
model.train(tagged_docs, total_examples=model.corpus_count, epochs=6, callbacks=[epoch_logger])

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end


In [138]:
model.similar_by_vector('effervescence')

  """Entry point for launching an IPython kernel.


[('effervescent', 0.8132021427154541),
 ('lively', 0.8083217144012451),
 ('fizziness', 0.7874601483345032),
 ('carbonation', 0.7864266037940979),
 ('effervesence', 0.7799913883209229),
 ('active', 0.777531087398529),
 ('sparkly', 0.7369555234909058),
 ('spritzy', 0.731114387512207),
 ('bubbliness', 0.726111114025116),
 ('liveliness', 0.7169278264045715)]

In [22]:
model.save('d2v-6epoch.pkl')

In [23]:
model.save_word2vec_format('d2v-format', doctag_vec= True, )

In [140]:
# this is the above but trying it to return a dict
def location_filter(ranked_beers, state, city, n):

    located_brewery = {}
    # state = 'CA'
    # city = 'Los Angeles'
    counter = 0

    for beer in ranked_beers:
        if counter < n:
            dict_state = lookup_dict[beer[0]]['state']
            dict_city = lookup_dict[beer[0]]['city']
            brewery_id = lookup_dict[beer[0]]['brewery_id']
            brewery_name = lookup_dict[beer[0]]['brewery_name']
            if (dict_state == state) and (dict_city == city):
        #             print(beer_breweries_lookup[beer[0]])
                print(beer[0])
                if brewery_id in located_brewery:
                    continue
                else:  
                    located_brewery[brewery_id] = brewery_name
                counter += 1
    return located_brewery

In [154]:
vec = model['fruity']
d2v_test = model.docvecs.most_similar([vec], topn=3000)
d2v_test;



In [155]:
location_filter(d2v_test, 'OR', 'Portland', 2)

89445
92787


KeyError: 74560

In [148]:
# storing d2v beer_ids to check if real
d2v_beers = [beer[0] for beer in d2v_test]