In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
beers = pd.read_csv('beers-breweries-and-beer-reviews/beers.csv')
breweries = pd.read_csv('beers-breweries-and-beer-reviews/breweries.csv')
reviews = pd.read_csv('beers-breweries-and-beer-reviews/reviews.csv')

In [4]:
reviews['text'] = reviews['text'].replace(u'\xa0\xa0', '')
# subset to only reviews that have a text review
text_reviews = reviews.loc[reviews['text'] != '']
# subset data to exclude NaN's as well (only losing 164k reviews from the last subset)
text_no_nan = text_reviews.loc[text_reviews.smell.isna() == False]
# rename column name beer_id to id for easy joining
text_no_nan = text_no_nan.rename(columns={'beer_id':'id'})
# subset out retired beers
current_beers = beers.loc[beers['retired'] == 'f']
# merge text_no_nan with beers that are not retired
df = pd.merge(text_no_nan, current_beers, on='id')
# create a table with average ratings for each beer. Index/ID is the beer id
ratings = pd.DataFrame(df.groupby('id')['score'].mean())
# add a column tallying the # of reviews for that beer
ratings['no_of_ratings'] = df.groupby('id')['score'].count()
# subset ratings with only beers that have 10+ ratings
ratings = ratings.loc[ratings['no_of_ratings'] > 9]
# formatting
ratings = ratings.reset_index()
ratings = ratings.rename(columns={'score':'avg_score'})
# rejoin no of ratings onto df
df = df.merge(ratings, how='inner', on='id')
# make a dataframe of reviewers by usename, count the number of reviews they made
reviewers = pd.DataFrame(df.groupby('username')['id'].count())
# make a new feature, the average of all of their scores
reviewers['avg_usr_score'] = df.groupby('username')['score'].mean()
# subset reviewers to those with 5+ reviews. From 73k users to 25k.
reviewers = reviewers.loc[reviewers['id'] > 4] ## MAYBE I CAN PLAY WITH THIS #
# formatting 
reviewers = reviewers.rename(columns={'id':'tot_usr_rvw'})
# there's ~1400 users outsides of 2 STDs of the mean score, will subset them out
reviewers_sub = reviewers.loc[(reviewers['avg_usr_score'] >= 3.182) &\
                              (reviewers['avg_usr_score'] <= 4.665)]

# subset of df with beers that have 10+ reviews, and with reviewers that have 5+ reviews
# and an average rating of beers between 3.18 and 4.67
df_with_mins = df.merge(reviewers_sub, how = 'inner', on = 'username')

Content Based NLP Stuff:

In [8]:
# This is the combined text for each beer. this skips most preprocessing below. 
df_joined_sub = pd.read_pickle('joined_text_df.pkl')

In [4]:
cleaned_reviews_df = pd.read_pickle('cleaned_reviews_df.pkl')

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from sklearn.feature_extraction import text
from gensim.parsing.preprocessing import STOPWORDS as stop_words

In [4]:
# subsets reviews df, and then joins all text reviews for each individual beer together
df_joined = df_with_mins.copy()
df_joined['joined_text'] = df_joined.groupby('id')['text'].\
                               transform(lambda x: ''.join(x))

In [5]:
# removes duplicate beers, and subsets to just beer id, joined_text and rating info
# to be cleaned and then joined to beers df
df_joined_sub = df_joined[['id', 'joined_text', 'avg_score', 'no_of_ratings']].drop_duplicates(\
                                                                        subset='id')

In [6]:
# removes \xa0 remove text
df_joined_sub['joined_text'] = df_joined_sub['joined_text'].apply(lambda x: re.sub\
                                                                  (r'\xa0', '', x))

In [19]:
breweries_sub = breweries[['id','name', 'city', 'state', 'country']]
breweries_sub = breweries_sub.rename(columns={'id':'brewery_id', 'name':'brewery_name'})

In [20]:
beers_sub = beers[['id', 'name', 'brewery_id', 'style', 'abv']]

In [21]:
breweries_beer = pd.merge(beers_sub, breweries_sub, on = 'brewery_id')

In [26]:
joined_df = pd.merge(df_joined_sub, breweries_beer, on = 'id')

In [28]:
joined_df.to_pickle("./joined_text_df.pkl")

In [14]:
# make a separate df with each individual review, clean up \xa0's and then pickle
cleaned_reviews = df_with_mins[['id', 'text']]
cleaned_reviews['text'] = cleaned_reviews['text'].apply(lambda x: re.sub(r'\xa0', '', x))
# cleaned_reviews.to_pickle("./cleaned_reviews_df.pkl")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
cleaned_reviews.text[0]

' 0% 16 oz can. Funny story: As I finally walked in the doors after a 45 min wait in line and freezing temps the sweet sound of the Grateful Dead\'s Sugar Magnolia greeted me from the TreeHouse sound system. The bottom of the can reads: "Going where the wind goes, bloomin\' like a red rose" A white haze to the yellow and golden liquid. Thick and healthy, totally unfiltered. Brawny white foam cap, thick, all-white clumps. Huge lacing left over. The aroma has a very zesty citrus hop effect, mellon and mango, grainy earthiness, tropical fruit blend with a bitter to sweet effect, then a peppery kick at the end. Very aromatic. The flavor is just bursting with complex hops, zesty earthy tones, sweet orange, peppery malt, clean fresh feel and overall vibe. A crispy bite wakes you up, full and lush mouthfeel follows from a totally unfiltered expereince. The feel and flavor finishes with a fun, earthy, zesty dry bite. Tropical juicy, zesty citrus, zippy golden wheat malt, melons, rustic earthin

In [6]:
count_vect = CountVectorizer(stop_words='english')
counts = count_vect.fit_transform(df_joined_sub.joined_text)
cos_sim = cosine_similarity(counts, counts)

KeyboardInterrupt: 

In [0]:
indices = pd.Series(df_joined_sub.index)

In [29]:
def recommendations(beer_id, df, cos_sim = cos_sim):
    """
    Takes a beer id and cosine similarty matrix in as arguments and returns beers closely related to the input beer
    """
    # initializing the empty list of recommended movies
    recommended_beers = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == beer_id].index[0]
    
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cos_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:11].index)
    print(top_10_indexes)
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_beers.append(list(joined_df.name)[i])
        
    return recommended_beers

def tfidf_recs(beer_id, cos_sim = tfidf_cos):
    """
    Takes a beer id and cosine similarty matrix in as arguments and returns beers closely related to the input beer
    """
    # initializing the empty list of recommended movies
    recommended_beers = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == beer_id].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cos_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:21].index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_beers.append(list(beers_text.name)[i])
        
    return beers_text.name[beer_id], recommended_beers

NameError: name 'cos_sim' is not defined

In [32]:
joined_df.sample()

Unnamed: 0,id,joined_text,avg_score,no_of_ratings,name,brewery_id,style,abv,brewery_name,city,state,country
16302,65010,Consumed 4/13/12 Appearance: Dark caramel/lig...,3.607619,42,Perfect Tin Amber,24488,American Amber / Red Ale,4.5,Tin Roof Brewing Company,Baton Rouge,LA,US


In [37]:
import string
import re

RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)

def preprocess(text):
    # Remove all punctuation and make all lowercase 
    return RE_PUNCT.sub(" ", text).lower().split()

### DOC2VEC

In [9]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec, Phrases
from gensim.parsing.preprocessing import STOPWORDS as stop_words
from gensim.utils import simple_preprocess
from sklearn.feature_extraction import text

In [31]:
letters = list('abcdefghijklmnopqrstuvwxyz')
numbers = list('0123456789')

stop_words = stop_words.union(set(letters)).union(set(numbers))
my_stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)

In [86]:
# this isn't using the right stop words
def tag_docs(docs, col):
    tagged = docs.apply(lambda r: TaggedDocument(words=simple_preprocess(r[col]), tags=str(r['id'])), axis=1)
    return tagged

In [87]:
df_sub = cleaned_reviews_df[:500]

In [90]:
tagged = tag_docs(cleaned_reviews_df, 'text')

In [91]:
tagged.to_pickle('tagged_docs.pkl')

In [76]:
tagged_test = tagged.apply(lambda x: str(x.tags))

AttributeError: 'TaggedDocument' object has no attribute 'apply'

In [92]:
## Use format like this 

model = Doc2Vec(dm=0, dbow_words=1, min_count=4, negative=3,
                hs=0, sample=1e-3, window=5, size=100, workers=8)

model.build_vocab(tagged)



KeyboardInterrupt: 

In [None]:
model.train()