In [1]:
import string
import pandas as pd
import numpy as np
import sys; sys.path.append('..')
from beer import descriptors
from tqdm import tqdm

In [2]:
%matplotlib inline

In [3]:
# Load the interim RateBeer review dump. The review body is cast to str up
# front so that every value (including missing ones) can later be
# concatenated with ' '.join when reviews are pooled per beer.
df = (
    pd.read_csv('../data/interim/ratebeer.csv')
    .assign(**{'review/text': lambda frame: frame['review/text'].astype(str)})
)

  interactivity=interactivity, compiler=compiler, result=result)


## Limit Results to 2500 most popular beers (by review count)

In [4]:
# Keep only the N most-reviewed beers. value_counts() sorts by review count
# in descending order, so head(N) is the N most popular beer ids.
N_TOP_BEERS = 2500
top_beers = (
    df['beer/beerId']
    .value_counts()
    .head(N_TOP_BEERS)
    # name the index/value columns explicitly instead of relying on the
    # positional column order produced by reset_index()
    .rename_axis('beer/beerId')
    .reset_index(name='review_count')
)
print(top_beers.shape[0])

2500


In [5]:
# Right join onto top_beers: only reviews of the selected beers survive, and
# each surviving review row gains that beer's review_count column.
df_top = df.merge(top_beers, on='beer/beerId', how='right')

## Idea
Simplest idea first: build a TF-IDF representation of each beer by pooling all of its reviews into a single document and vectorizing that string. For a given query, generate its TF-IDF vector and find the beers whose vectors are the nearest neighbors of the query vector.

## make beer vectors: TF-IDF
using the out-of-the-box TFIDF implementation in sklearn

In [6]:
# One row per (beer id, beer name): every review of that beer concatenated
# into a single space-separated document.
rtxt_raw = (
    df_top
    .groupby(['beer/beerId', 'beer/name'])['review/text']
    .apply(lambda reviews: ' '.join(reviews))
    .reset_index()
)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# A float min_df is a document-frequency floor given as a proportion of the
# corpus (per the scikit-learn docs): terms that occur in fewer than 0.1% of
# the documents are dropped from the vocabulary. All other settings are the
# library defaults.
tfidf = TfidfVectorizer(min_df=0.001)

In [8]:
# train a tfidf model, produce vectors for each document
# Each document is one beer's pooled review text, so row i of term_doc
# describes the beer in row i of rtxt_raw; the nearest-neighbour lookup
# further down relies on that positional alignment.
term_doc = tfidf.fit_transform(rtxt_raw['review/text'].values)

In [9]:
term_doc.shape

(2500, 75801)

## load word vectors for query expansion

In [27]:
from gensim.models import KeyedVectors
# Word vectors stored in the textual word2vec format; used below to find
# terms similar to each query word.
beerword_model = KeyedVectors.load_word2vec_format(
    '../data/interim/ftmodel.vec',
    binary=False,
)

In [146]:
# define a query expansion function
def expand_query(query, topn=10):
    """Expand a whitespace-delimited query with similar in-vocabulary words.

    Every query term is kept when it belongs to the fitted TF-IDF vocabulary
    (``tfidf.vocabulary_``) and is followed by its nearest neighbours from the
    word-similarity model (``beerword_model``), restricted to words that are
    also in the TF-IDF vocabulary. Words outside that vocabulary are skipped
    because they could not contribute to a TF-IDF query vector anyway.

    Parameters
    ----------
    query : str
        Query terms separated by whitespace. An empty string is allowed.
    topn : int, default 10
        Maximum number of terms contributed by each query term (the term
        itself counts towards this budget). The budget is applied per term so
        that the expansion of the first term cannot crowd out later terms.

    Returns
    -------
    str
        The expanded terms joined by single spaces; ``''`` when nothing in
        the query is known to either vocabulary.
    """
    expanded = []
    for q in query.split():
        per_term = []
        # keep the original query term when TF-IDF knows it
        if q in tfidf.vocabulary_:
            per_term.append(q)
        # add similar terms found by the word similarity model
        if q in beerword_model.vocab:
            # Over-fetch candidates (10x) because many neighbours are not in
            # the TF-IDF vocabulary and get filtered out.
            for word, _score in beerword_model.similar_by_word(q, topn=10 * topn):
                if len(per_term) >= topn:
                    break
                if word in tfidf.vocabulary_:
                    per_term.append(word)
        expanded.extend(per_term[:topn])
    # join all and return
    return ' '.join(expanded)

## index vectors for NN search
using annoy

In [138]:
from annoy import AnnoyIndex

In [139]:
# create an AnnoyIndex with size == len(vocab)
# The metric is spelled out: 'angular' is what Annoy used when the argument
# was omitted, and newer Annoy releases warn if it is left implicit.
t = AnnoyIndex(term_doc.shape[1], metric='angular')

In [140]:
# Register every beer in the index, keyed by its row number in term_doc.
# Annoy takes dense vectors, so each sparse TF-IDF row is densified and
# flattened to 1-D first.
for doc_index in tqdm(range(term_doc.shape[0])):
    dense_row = term_doc[doc_index].toarray()
    t.add_item(doc_index, dense_row.reshape(-1))

100%|██████████| 2500/2500 [00:19<00:00, 125.70it/s]


In [141]:
# Build the search forest with 100 trees. Per the Annoy docs, more trees give
# better query precision at the cost of a larger index, and no items can be
# added once build() has been called.
t.build(100)

True

In [142]:
# Convenience lookup: beer id -> beer name, built from the distinct
# (id, name) pairs in the full review table.
id_to_name = (
    df.loc[:, ['beer/beerId', 'beer/name']]
    .drop_duplicates()
    .set_index('beer/beerId')
    .loc[:, 'beer/name']
    .to_dict()
)

In [147]:
def print_query_results(positive='', negative='', n=5):
    """Print the ``n`` beers closest to an expanded TF-IDF query.

    Both queries are expanded with ``expand_query``. The positive expansion
    is vectorised with the fitted ``tfidf`` model; the component of every
    term in the negative expansion is then set to zero in that vector, so
    negative terms cannot contribute to the match. The resulting vector is
    looked up in the Annoy index ``t``.

    Parameters
    ----------
    positive : str, default ''
        Whitespace-delimited terms the results should match.
    negative : str, default ''
        Whitespace-delimited terms to remove from the query vector.
    n : int, default 5
        Number of results to print.

    Returns
    -------
    None
        One line per result is printed as ``"<name> (score: <distance>)"``.
        The printed score is the distance reported by the index, so smaller
        values are closer matches.
    """
    # first expand the queries
    pos_expanded = expand_query(positive)
    neg_expanded = expand_query(negative)
    # form the positive query vector via tfidf
    query_vec = tfidf.transform([pos_expanded]).toarray().reshape(-1)
    # Eliminate negative terms. Vector components are floats indexed by
    # vocabulary column, so each term has to be mapped to its column first;
    # comparing the float components against the term strings never matches.
    for term in set(neg_expanded.split()):
        column = tfidf.vocabulary_.get(term)
        if column is not None:
            query_vec[column] = 0.0
    # find the nearest neighbors (rank every indexed item, print the first n)
    items, distances = t.get_nns_by_vector(
        query_vec, n=term_doc.shape[0], include_distances=True, search_k=-1)
    # print the results
    for item_id, tfidf_distance in zip(items[:n], distances[:n]):
        # index item ids are row positions in rtxt_raw
        beer_id = rtxt_raw.iloc[item_id]['beer/beerId']
        name = id_to_name[beer_id]
        score = tfidf_distance
        print("{} (score: {})".format(name, score))

In [186]:
print_query_results(positive='sugar', negative='', n=20)

Sly Fox Ichor (score: 1.4031444787979126)
Southampton Abbot 12 (score: 1.4045695066452026)
Furthermore Makeweight Triple Pale (score: 1.405379056930542)
Chapeau Faro (score: 1.4056349992752075)
Lagunitas Brown Shugga (score: 1.4057786464691162)
Avery The Reverend (score: 1.4061038494110107)
Smuttynose Gravitation (score: 1.4061509370803833)
21st Amendment Monks Blood (score: 1.4061821699142456)
Boulevard Sixth Glass Quadrupel (score: 1.4063763618469238)
Sierra Nevada Ovila Dubbel (score: 1.4064805507659912)
North Coast Cru dOr &#40;Whole Foods&#41; (score: 1.406522274017334)
Allagash Four (score: 1.406699538230896)
Lost Abbey Lost and Found (score: 1.40676748752594)
Flying Dog Kerberos Tripel (score: 1.406944751739502)
AleSmith Horny Devil (score: 1.4070098400115967)
Dieu du Ciel Rigor Mortis Abt (score: 1.4070795774459839)
Goose Island Pere Jacques (score: 1.4071314334869385)
Heavy Seas Holy Sheet (score: 1.4072580337524414)
Three Floyds Alpha Kong (score: 1.4073426723480225)
Avery Co