In [1]:
import string
import pandas as pd
import numpy as np
import sys; sys.path.append('..')
from beer import descriptors
from tqdm import tqdm

In [2]:
# load review data
df = pd.read_csv('../data/interim/ratebeer.csv')
# Missing review text is read as NaN. Blank it out before casting: otherwise
# astype(str) turns it into the literal word 'nan', which would be pooled into
# the beer's document and treated as a real term by TF-IDF.
df['review/text'] = df['review/text'].fillna('').astype(str)

  interactivity=interactivity, compiler=compiler, result=result)


## Idea
Simplest idea first: build a TF-IDF representation of each beer by pooling all of its reviews into a single document and computing the TF-IDF vector of that document. For a given query, compute its TF-IDF vector in the same way and find the beers whose vectors are the nearest neighbors of the query vector.

## make beer vectors: TF-IDF
Using the out-of-the-box TF-IDF implementation in scikit-learn (`TfidfVectorizer`).

In [3]:
# Build one document per beer: group the reviews by (beer id, beer name) and
# concatenate each group's review texts, separated by single spaces.
reviews_by_beer = df.groupby(['beer/beerId', 'beer/name'])['review/text']
rtxt_raw = reviews_by_beer.apply(' '.join).reset_index()

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# A float min_df is a document-frequency fraction: terms found in fewer than
# 0.1% of the documents are left out of the vocabulary.
tfidf = TfidfVectorizer(min_df=0.001)

In [14]:
# Fit the TF-IDF model on the pooled review text (one document per row of
# rtxt_raw) and get back one vector per document, in rtxt_raw row order.
term_doc = tfidf.fit_transform(rtxt_raw['review/text'].values)

In [15]:
# (number of pooled documents, vocabulary size)
term_doc.shape

(110647, 14058)

## index vectors for NN search
Using Annoy (an approximate nearest-neighbor library) to index the beer vectors.

In [16]:
from annoy import AnnoyIndex

In [17]:
# create an AnnoyIndex with size == len(vocab)
# The metric is spelled out: 'angular' is what Annoy uses when the argument is
# omitted, but relying on that implicit default is deprecated.
t = AnnoyIndex(term_doc.shape[1], metric='angular')

In [18]:
# Insert every document vector into the index, keyed by its row number in
# term_doc. Each row is converted to a dense array and flattened to 1-D first.
n_docs = term_doc.shape[0]
for row in tqdm(range(n_docs)):
    dense_vec = term_doc[row].toarray().ravel()
    t.add_item(row, dense_vec)

100%|██████████| 110647/110647 [04:02<00:00, 456.73it/s]


In [19]:
# Build the index as a forest of 50 trees (Annoy's n_trees argument).
t.build(50)

True

In [203]:
# Convenience lookups, both keyed by beer id.
# id_to_pop: 1 - (rows for this beer / rows for the most frequent beer), so the
# beer with the most rows in df maps to 0.0 and rarer beers approach 1.0.
review_counts = df['beer/beerId'].value_counts()
id_to_pop = (1.0 - review_counts / review_counts.max()).to_dict()
# id_to_name: beer id -> beer name, taken from the distinct (id, name) pairs.
id_name_pairs = df[['beer/beerId', 'beer/name']].drop_duplicates()
id_to_name = id_name_pairs.set_index('beer/beerId')['beer/name'].to_dict()

In [207]:
def get_score(tfidf_distance, popularity, weights=(1, 1)):
    """Combine a TF-IDF distance and a popularity term into one ranking score.

    Args:
        tfidf_distance: distance between the query vector and a beer vector.
        popularity: popularity term for the same beer.
        weights: two multipliers, applied to `tfidf_distance` and
            `popularity` respectively. Any indexable pair works (tuple or
            list); the default is an immutable tuple so it cannot be
            modified between calls.

    Returns:
        weights[0] * tfidf_distance + weights[1] * popularity
    """
    return weights[0] * tfidf_distance + weights[1] * popularity

In [208]:
def print_query_results(positive='', negative='', n=5):
    """Print the `n` best-scoring beers for a free-text query.

    The query vector is the TF-IDF vector of `positive` minus the TF-IDF
    vector of `negative`. Every indexed document is scored with `get_score`
    from its Annoy distance to the query and its `id_to_pop` value; the
    lowest scores are printed first.

    Args:
        positive: text describing what the beer should be like.
        negative: text describing what the beer should not be like.
        n: number of results to print.
    """
    # form the query
    query_vec_p = tfidf.transform([positive]).toarray().reshape(-1)
    query_vec_n = tfidf.transform([negative]).toarray().reshape(-1)
    query_vec = query_vec_p - query_vec_n
    # find the nearest neighbors: ask for every indexed item (instead of a
    # hard-coded 10**6) so the popularity term can re-rank the full set
    # regardless of how many items the index holds
    items, distances = t.get_nns_by_vector(
        query_vec, n=t.get_n_items(), include_distances=True, search_k=-1)
    # Pull the id column out once; a per-item rtxt_raw.iloc[...] lookup builds
    # a whole row Series on every iteration.
    beer_ids = rtxt_raw['beer/beerId'].values
    # compute the ranking scores
    results = []
    for item_id, tfidf_distance in zip(items, distances):
        beer_id = beer_ids[item_id]
        name = id_to_name[beer_id]
        score = get_score(tfidf_distance, id_to_pop[beer_id])
        results.append((beer_id, name, score))
    for beer_id, name, score in sorted(results, key=lambda s: s[2])[:n]:
        print("{} (score: {})".format(name, score))

In [209]:
# Top 20 beers for the one-word query 'stout', with no negative terms.
print_query_results(positive='stout', negative='', n=20)

Guinness Draught (score: 1.2973963022232056)
Pabst Blue Ribbon (score: 1.4226150430126108)
North Coast Old Rasputin Russian Imperial Stout (score: 1.5031720229557584)
Samuel Smiths Oatmeal Stout (score: 1.5200457381995727)
Dogfish Head 90 Minute Imperial IPA (score: 1.5378611779316165)
Youngs Double Chocolate Stout (score: 1.5637106064594153)
Budweiser (score: 1.5673586919710234)
Sierra Nevada Pale Ale &#40;Bottle&#41; (score: 1.5694155904637785)
Samuel Adams Boston Lager (score: 1.5711392143588045)
Chimay Bleue &#40;Blue&#41; / Grande Réserve (score: 1.5859191195789353)
Victory Storm King Imperial Stout (score: 1.6301245849369925)
Stone Arrogant Bastard Ale (score: 1.6350455877584813)
Newcastle Brown Ale (score: 1.6512364386480092)
Orval (score: 1.652943796409673)
Hoegaarden (score: 1.6599563747257382)
Rogue Shakespeare Oatmeal Stout (score: 1.6822886425695378)
Brooklyn Black Chocolate Stout (score: 1.68809018971084)
St. Bernardus Abt 12 (score: 1.6899902650288174)
Rogue Dead Guy Ale 