In [20]:
import pandas as pd
import numpy as np
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec, Phrases
from gensim.parsing.preprocessing import STOPWORDS as stop_words
from gensim.utils import simple_preprocess
from sklearn.feature_extraction import text
from nltk.stem.lancaster import LancasterStemmer

In [2]:
# this is each review cleaned for Doc2Vec. It is df_with_mins with only 'id' and 'textb'
cleaned_reviews_df = pd.read_pickle('cleaned_reviews_df.pkl')

In [49]:
# load df with beer/brewery data and create a lookup dictionary
df_joined = pd.read_pickle('joined_text_df.pkl')

In [88]:
lookup_df = df_joined[['id', 'brewery_id', 'name', 'city', 'state', 'country', 'brewery_name']]
lookup_df['id'] = lookup_df['id'].astype(str)
lookup_dict = lookup_df.set_index('id').to_dict(orient='index')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [42]:
letters = list('abcdefghijklmnopqrstuvwxyz')
numbers = list('0123456789')
words = ['oz', 'ml', 'pour', 'poured', 'bottle', 'can', 'ounce',\
         'bomber', 'botttle', 'stubby', 'ouncer', 'pouring', 'growler', 'snifter',\
         'tulip', 'bottled', 'brewery'] # ADD MORE
stop_words = stop_words.union(set(letters)).union(set(numbers)).union(set(words))
my_stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)

def tag_docs(docs):
    results = docs.apply(lambda r: TaggedDocument(words=preprocessor(r['text']), tags=[str(r['id'])]), axis=1)
    return results.tolist()
def preprocessor(text):
    # uses gensim simple_preprocess and then removes stop words
    simple = simple_preprocess(text)
    result = [word for word in simple if not word in my_stop_words]
    return result
def preprocessor_and_stem(text):
    # uses gensim simple_preprocess and then removes stop words
    ls = LancasterStemmer()
    simple = simple_preprocess(text)
    result = [ls.stem(word) for word in simple if not word in my_stop_words]
    return result

In [38]:
# cleaned_sub = cleaned_reviews_df[:100]

In [43]:
# process reviews, turn into list of TaggedDocument objects, with beer id as tag
tagged_docs = tag_docs(cleaned_reviews_df)

In [44]:
## Use format like this 

model = Doc2Vec(dm=0, dbow_words=1, min_count=4, negative=3,
                hs=0, sample=1e-4, window=5, size=100, workers=8)

model.build_vocab(tagged_docs, progress_per = 100)



In [89]:
from gensim.models.callbacks import CallbackAny2Vec
class EpochLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1
        
# this is the above but trying it to return a dict
def location_filter(ranked_beers, state, city, n):

    located_brewery = {}
    # state = 'CA'
    # city = 'Los Angeles'
    counter = 0

    for beer in ranked_beers:
        if counter < n:
            dict_state = lookup_dict[beer[0]]['state']
            dict_city = lookup_dict[beer[0]]['city']
            brewery_id = lookup_dict[beer[0]]['brewery_id']
            brewery_name = lookup_dict[beer[0]]['brewery_name']
            beer_name = lookup_dict[beer[0]]['name']
            if (dict_state == state) and (dict_city == city):
        #             print(beer_breweries_lookup[beer[0]])
#                 print(beer[0])
                if brewery_id in located_brewery:
                    continue
                else:  
                    located_brewery[brewery_id] = (brewery_name, beer[0], beer_name)
                counter += 1
    return located_brewery

In [48]:
epoch_logger = EpochLogger()
model.train(tagged_docs, total_examples=model.corpus_count, epochs=1, callbacks=[epoch_logger])

Epoch #0 start
Epoch #0 end


In [167]:
vec = model['red']
d2v_test = model.docvecs.most_similar([vec], topn=30000)
location_filter(d2v_test, 'BC', 'Vancouver', 6)

{3449: ('Steamworks Brewing Company', '24439', 'Raspberry Frambozen'),
 29378: ('Parallel 49 Brewing Company', '83758', 'Ruby Tears'),
 780: ('Granville Island Brewery', '71388', 'False Creek Raspberry Ale'),
 673: ('R & B Brewing Co.', '10519', 'Red Devil Pale Ale'),
 35805: ('Postmark Brewing', '155946', 'West Coast Pale Ale'),
 30578: ('Powell Street Craft Brewery', '88513', 'Old Jalopy Pale Ale')}

In [160]:
for doc in d2v_test:
    if doc[0] == '39421':
        print(doc)

('39421', 0.49305686354637146)


In [161]:
d2v_test

[('5800', 0.7610883116722107),
 ('315814', 0.7560629844665527),
 ('93399', 0.746173083782196),
 ('250690', 0.7457935810089111),
 ('144127', 0.744695782661438),
 ('90806', 0.7408261299133301),
 ('107152', 0.7374526262283325),
 ('235479', 0.734028697013855),
 ('312942', 0.7329654097557068),
 ('15189', 0.731919527053833),
 ('12279', 0.7298052310943604),
 ('205893', 0.7295321822166443),
 ('53772', 0.727426290512085),
 ('76043', 0.7273325324058533),
 ('124805', 0.7261089086532593),
 ('204957', 0.7258068919181824),
 ('7290', 0.7242791652679443),
 ('182931', 0.7235300540924072),
 ('338794', 0.7224483489990234),
 ('208405', 0.721636176109314),
 ('23330', 0.7212944030761719),
 ('172799', 0.7197797298431396),
 ('157525', 0.7153183221817017),
 ('170257', 0.714976966381073),
 ('172088', 0.7143881320953369),
 ('90671', 0.7122817635536194),
 ('234037', 0.7091912031173706),
 ('19002', 0.7082751989364624),
 ('60060', 0.708182692527771),
 ('222192', 0.7075859308242798),
 ('216617', 0.7072275280952454),