In [52]:
import nltk
import pandas as pd
import spacy

In [4]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [53]:
from textblob import TextBlob

In [54]:
nlp = spacy.load('en_core_web_sm')
df = pd.read_csv('raw_yelp_review_data.csv')

In [55]:
df.columns = ['name', 'review', 'rating']
df.head()

Unnamed: 0,name,review,rating
0,The Factory - Cafe With a Soul,11/25/2016 1 check-in Love love loved the atm...,5.0 star rating
1,The Factory - Cafe With a Soul,"12/2/2016 Listed in Date Night: Austin, Ambia...",4.0 star rating
2,The Factory - Cafe With a Soul,11/30/2016 1 check-in Listed in Brunch Spots ...,4.0 star rating
3,The Factory - Cafe With a Soul,11/25/2016 Very cool decor! Good drinks Nice ...,2.0 star rating
4,The Factory - Cafe With a Soul,12/3/2016 1 check-in They are located within ...,4.0 star rating


In [56]:
rating = []
for s in df['rating'].tolist():
    inter = s.split()
    rating.append(int(float(inter[0])))
print(len(rating), rating[:3])

7616 [5, 4, 4]


In [57]:
df['rating'] = rating
df.head()

Unnamed: 0,name,review,rating
0,The Factory - Cafe With a Soul,11/25/2016 1 check-in Love love loved the atm...,5
1,The Factory - Cafe With a Soul,"12/2/2016 Listed in Date Night: Austin, Ambia...",4
2,The Factory - Cafe With a Soul,11/30/2016 1 check-in Listed in Brunch Spots ...,4
3,The Factory - Cafe With a Soul,11/25/2016 Very cool decor! Good drinks Nice ...,2
4,The Factory - Cafe With a Soul,12/3/2016 1 check-in They are located within ...,4


In [58]:
df['review'][0]

' 11/25/2016 1 check-in Love love loved the atmosphere! Every corner of the coffee shop had its own style, and there were swings!!! I ordered the matcha latte, and it was muy fantastico! Ordering and getting my drink were pretty streamlined. I ordered on an iPad, which included all beverage selections that ranged from coffee to wine, desired level of sweetness, and a checkout system. I got my latte within minutes!  I was hoping for a typical heart or feather on my latte, but found myself listing out all the possibilities of what the art may be. Any ideas? '

In [59]:
import re
import string

alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

df['review'] = df.review.map(alphanumeric).map(punc_lower)
df.head()

Unnamed: 0,name,review,rating
0,The Factory - Cafe With a Soul,check in love love loved the atmosphe...,5
1,The Factory - Cafe With a Soul,listed in date night austin ambiance ...,4
2,The Factory - Cafe With a Soul,check in listed in brunch spots i lov...,4
3,The Factory - Cafe With a Soul,very cool decor good drinks nice seati...,2
4,The Factory - Cafe With a Soul,check in they are located within the ...,4


In [60]:
docs = list(nlp.pipe(df.review))
df['spacy_doc'] = docs

In [61]:
docs_clean = [[w.lemma_.lower() for w in doc if (not w.is_stop and not w.is_punct and not w.like_num) \
               or (w.lemma_=='not')] for doc in docs]
df['docs_clean'] = docs_clean

In [62]:
docs_list_clean = [' '.join(doc) for doc in docs_clean]
docs_list_clean[0]

'          check love love love atmosphere   corner coffee shop style   swing     order matcha latte   muy fantastico   order get drink pretty streamlined   order ipad   include beverage selection range coffee wine   desire level sweetness   checkout system   get latte minute    hope typical heart feather latte   find list possibility art   idea  '

In [63]:
df.head()

Unnamed: 0,name,review,rating,spacy_doc,docs_clean
0,The Factory - Cafe With a Soul,check in love love loved the atmosphe...,5,"( , check, in, love, love, loved, the,...","[ , check, love, love, love, atmospher..."
1,The Factory - Cafe With a Soul,listed in date night austin ambiance ...,4,"( , listed, in, date, night, , austin, ...","[ , list, date, night, , austin, , amb..."
2,The Factory - Cafe With a Soul,check in listed in brunch spots i lov...,4,"( , check, in, listed, in, brunch, spo...","[ , check, list, brunch, spot, love, e..."
3,The Factory - Cafe With a Soul,very cool decor good drinks nice seati...,2,"( , very, cool, decor, , good, drinks, ...","[ , cool, decor, , good, drink, nice, s..."
4,The Factory - Cafe With a Soul,check in they are located within the ...,4,"( , check, in, they, are, located, wit...","[ , check, locate, northcross, mall, s..."


In [64]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(docs_list_clean)

df_X = pd.DataFrame(X.toarray(), columns=cv.get_feature_names())
df_X.shape

(7616, 12983)

In [65]:
from sklearn.decomposition import TruncatedSVD, NMF

num_topics = 5
topics = TruncatedSVD(num_topics)
doc_topic = topics.fit_transform(X)
topics.explained_variance_ratio_

array([0.00407378, 0.00683991, 0.0057489 , 0.00551294, 0.00499191])

In [66]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [67]:
display_topics(topics, cv.get_feature_names(), 10)


Topic  0
coffee, place, good, great, check, like, love, shop, come, drink

Topic  1
gelato, flavor, cream, try, chocolate, tea, sweet, ice, puff, order

Topic  2
great, gelato, breakfast, place, staff, friendly, food, taco, love, awesome

Topic  3
coffee, gelato, shop, flavor, austin, cup, ice, favorite, owner, cream

Topic  4
breakfast, taco, latte, coffee, moon, delicious, milk, morning, egg, order


##### Not enough distinction across topics, will need to go back and adjust preprocessing

In [17]:
from nltk.util import ngrams
twograms = list(ngrams(docs_list_clean,2)) 

In [70]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv2 = TfidfVectorizer(stop_words='english', max_features=1000, min_df=5, max_df = .5, ngram_range = (1,2))
X2 = cv2.fit_transform(docs_list_clean)

df_X2 = pd.DataFrame(X2.toarray(), columns=cv2.get_feature_names())
df_X2.shape

(7616, 1000)

In [71]:
from sklearn.decomposition import TruncatedSVD, NMF

num_topics = 8
topics = TruncatedSVD(num_topics)
doc_topic = topics.fit_transform(X2)
topics.explained_variance_ratio_

array([0.00415813, 0.00981647, 0.00850132, 0.00804657, 0.00721237,
       0.0069782 , 0.00636739, 0.00615118])

In [72]:
display_topics(topics, cv2.get_feature_names(), 10)


Topic  0
good, great, like, check, come, love, shop, drink, time, austin

Topic  1
gelato, flavor, cream, try, ice, chocolate, sweet, taste, ice cream, like

Topic  2
great, gelato, breakfast, taco, breakfast taco, great coffee, love, staff, friendly, flavor

Topic  3
breakfast, taco, breakfast taco, food, order, tea, wait, egg, sandwich, toast

Topic  4
latte, taco, shop, coffee shop, breakfast, breakfast taco, moon, milk, ice, vanilla

Topic  5
tea, great, latte, love, milk, cream, chai, drink, sweet, chicken

Topic  6
good, good coffee, pretty, nice, coffee good, latte, study, parking, lot, pretty good

Topic  7
shop, coffee shop, tea, cream, waffle, chicken, favorite, sandwich, breakfast, puff


In [77]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv2 = TfidfVectorizer(stop_words='english', max_features=1000, min_df=10, max_df = .5, ngram_range = (1,2))
X2 = cv2.fit_transform(docs_list_clean)

df_X2 = pd.DataFrame(X2.toarray(), columns=cv2.get_feature_names())
df_X2.shape

(7616, 1000)

In [33]:
display_topics(topics, cv2.get_feature_names(), 10)


Topic  0
good, great, like, check coffee, come coffee, love, shop, drink, time, austin

Topic  1
gelato, flavor, cream puff, try, chocolate chip, ice, sweet, taste, ice cream, like

Topic  2
great, gelato, breakfast, taco, breakfast taco, great coffee, staff, friendly, love, flavor

Topic  3
breakfast, taco, breakfast taco, organic, food, tea, wait, egg, satisfy, toast

Topic  4
taco, latte, shop, coffee spot, breakfast, breakfast taco, moon milk, milk, vanilla, ice

Topic  5
tea, great, latte, milk, love, chai, ice, drink, cream puff, sweet

Topic  6
good, good coffee, pretty good, nice place, coffee great, parking lot, study, seating, lot, cream puff

Topic  7
good, great, good coffee, cup coffee, brew, espresso, latte, service, cold brew, barista


topic 6 could be about the physical shop location. topic 7 could be about the actual coffee

In [92]:
#Testing to see what 2 topics looks like with the updated hyper parameters

from sklearn.decomposition import TruncatedSVD, NMF

num_topics = 8
topics = TruncatedSVD(num_topics)
doc_topic = topics.fit_transform(X2)
topics.explained_variance_ratio_

array([0.00423145, 0.00922652, 0.00817088, 0.00735101, 0.00694145,
       0.00643069, 0.00621936, 0.00581451])

In [39]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv2 = TfidfVectorizer(stop_words='english', max_features=1000, min_df=5, max_df = .1, ngram_range = (1,2))
X2 = cv2.fit_transform(docs_list_clean)

df_X2 = pd.DataFrame(X2.toarray(), columns=cv2.get_feature_names())
df_X2.shape

(7616, 1000)

In [40]:
display_topics(topics, cv2.get_feature_names(), 10)


Topic  0
breakfast, taco, ice, espresso, cool, study, gelato, cup, enjoy, area

Topic  1
gelato, cream, ice cream, ice, chocolate, italy, pacha, puff, good food, item


In [36]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
cv2 = TfidfVectorizer(stop_words='english', max_features=1000, min_df=10, max_df = .5, ngram_range = (1,2))
X2 = cv2.fit_transform(docs_list_clean)

df_X2 = pd.DataFrame(X2.toarray(), columns=cv2.get_feature_names())
df_X2.shape

(7616, 1000)

In [37]:
display_topics(topics, cv2.get_feature_names(), 10)


Topic  0
glad, good thing, lemonade, chance, coffeehouse, loud, section, don think, thank, authentic

Topic  1
fun, figure, cover, truck, chicken waffle, houndstooth, super friendly, syrup, house, lemonade

Topic  2
good thing, fun, bottle, surprise, bread, goodness, soy, french toast, loud, figure

Topic  3
bottle, surprise, bread, oh, flavorful, tap, visit austin, easily, run, tip

Topic  4
surprise, lake, section, coffee flavor, bottle, bread, moment, mention, usual, houndstooth

Topic  5
tap, good thing, lake, mention, loud, cashier, houndstooth, don think, cover, super friendly

Topic  6
glad, glass, porch, near, coffee bean, owner, stop, sample, long, cover

Topic  7
glad, good thing, glass, cream puff, break, enjoy, lake, scone, coffee spot, bartender


In [87]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
stop_words = ENGLISH_STOP_WORDS.union(['coffee', 'gelato', 'good'])

In [91]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
cv2 = TfidfVectorizer(stop_words=stop_words, max_features=1000, min_df=10, max_df = .5, ngram_range = (1,3))
X2 = cv2.fit_transform(docs_list_clean)

df_X2 = pd.DataFrame(X2.toarray(), columns=cv2.get_feature_names())
df_X2.shape

(7616, 1000)

In [93]:
display_topics(topics, cv2.get_feature_names(), 10)


Topic  0
great, like, check, come, love, shop, drink, time, austin, latte

Topic  1
great, staff, friendly, taco, breakfast, great place, atmosphere, breakfast taco, beer, food

Topic  2
breakfast, taco, breakfast taco, delicious, sandwich, egg, food, cream, love, order

Topic  3
great, cream, love, ice, latte, tea, friendly, flavor, puff, staff

Topic  4
latte, taco, breakfast taco, shop, moon, milk, breakfast, barista, espresso, vanilla

Topic  5
tea, latte, chai, chicken, food, milk, nice, study, waffle, table

Topic  6
cream, puff, taco, flavor, cream puff, breakfast, ice cream, lot, breakfast taco, parking

Topic  7
great, brew, tea, cold, cold brew, service, cream, like, great place, taco


In [97]:
from sklearn.decomposition import NMF

In [106]:
nmf_model = NMF(8)
nmf_doc_topic = nmf_model.fit_transform(X2)
nmf_doc_topic.shape



(7616, 8)

In [103]:
display_topics(nmf_model, cv2.get_feature_names(), 10)


Topic  0
like, time, come, work, people, don, shop, order, drink, ve

Topic  1
great, great place, atmosphere, service, check great, food, check, beer, place great, spot

Topic  2
taco, breakfast, breakfast taco, egg, morning, sandwich, food, cheese, bacon, delicious

Topic  3
love, friendly, staff, super, love place, friendly staff, austin, shop, staff friendly, super friendly

Topic  4
latte, milk, moon, vanilla, chai, sweet, drink, iced, vanilla latte, ice

Topic  5
tea, chicken, boba, waffle, milk tea, chai, toast, green, food, milk

Topic  6
cream, flavor, puff, ice, ice cream, chocolate, cream puff, cookie, try, sweet

Topic  7
brew, cold, cold brew, check, strong, ice, iced, tap, stop, day


In [104]:
stop_words = ENGLISH_STOP_WORDS.union(['coffee', 'gelato', 'good', 'great', 'like'])

In [105]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
cv2 = TfidfVectorizer(stop_words=stop_words, max_features=1000, min_df=10, max_df = .4, ngram_range = (1,3))
X2 = cv2.fit_transform(docs_list_clean)

df_X2 = pd.DataFrame(X2.toarray(), columns=cv2.get_feature_names())
df_X2.shape

(7616, 1000)

In [106]:
nmf_model = NMF(8)
nmf_doc_topic = nmf_model.fit_transform(X2)
nmf_doc_topic.shape



(7616, 8)

In [107]:
display_topics(nmf_model, cv2.get_feature_names(), 10)


Topic  0
work, shop, lot, nice, parking, table, seating, seat, music, spot

Topic  1
taco, breakfast, breakfast taco, egg, morning, sandwich, food, cheese, bacon, deli

Topic  2
time, order, come, drink, ve, service, wait, don, know, try

Topic  3
latte, milk, moon, vanilla, chai, sweet, iced, vanilla latte, drink, ice

Topic  4
love, friendly, staff, super, love place, atmosphere, friendly staff, awesome, staff friendly, delicious

Topic  5
tea, chicken, boba, milk tea, chai, waffle, green, chai tea, milk, toast

Topic  6
cream, flavor, puff, ice, ice cream, chocolate, cream puff, cookie, try, sweet

Topic  7
brew, cold, cold brew, strong, ice, iced, tap, day, stop, cup


given the clarity of topics present in this cell, I will use this NMF model's doc/topic matrix to produce a recommendation system. While some of the topics overlap slightly, in general we see key categories for determining the excellence of a coffee shop such as location, breakfast food, service, hot tea, hot coffee, sweet treats, and iced coffee

In [216]:
new_df = pd.DataFrame(nmf_doc_topic)
new_df.columns = ['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 'topic_7']

In [262]:
#in earlier copy, a suggestion output showed that summermoon coffee bar existed as 2 different names, fixing this here
df.name.replace(to_replace = 'Summer Moon Coffee Bar ', value='Summermoon Coffee Bar ', inplace=True)

In [263]:
new_df[['review', 'rating', 'name']] = df[['review', 'rating', 'name']]
new_df.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,review,rating,name
0,0.003408,0.0,0.008788,0.097035,0.052656,0.00647,0.0,0.0,check in love love loved the atmosphe...,5,The Factory - Cafe With a Soul
1,0.034141,0.000535,0.0,0.001177,0.032197,0.016309,0.018436,0.0,listed in date night austin ambiance ...,4,The Factory - Cafe With a Soul
2,0.027451,0.0,0.010269,0.0,0.000166,0.077941,0.026688,0.050174,check in listed in brunch spots i lov...,4,The Factory - Cafe With a Soul
3,0.050254,0.000282,0.0,0.042862,0.0,0.011425,0.0,0.002755,very cool decor good drinks nice seati...,2,The Factory - Cafe With a Soul
4,0.022367,0.003361,0.021874,0.0,0.017531,0.046071,0.012401,0.0,check in they are located within the ...,4,The Factory - Cafe With a Soul


In [222]:
new_df.sort_values('topic_0', ascending=False)[:3]

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,review,rating,name
6439,0.121732,0.000957,0.006336,0.0,0.003627,0.0,0.0,0.0,hour laid back south austin coffee sh...,4,"Strange Brew, Austin Coffee"
2566,0.106321,0.0,0.0,0.0,0.006053,0.0,0.0,0.045736,check ins the location is pretty clos...,5,Flightpath Coffeehouse
4061,0.103687,0.0,0.004249,0.0,0.0,0.0,0.002202,0.002145,check ins this space is beautiful an...,4,Seventh Flag Coffee Co


In [220]:
df.review[6439]

'         hour laid back south austin coffee shop with live music  more parking and room inside and outside than epoch on north lopp with a similar vibe  though the clientele skews older  and the parking lot fills up when they have a popular act on the lounge side  so expect to park next door at bake house on weekend nights  the lounge side features live music and usually charges a cover  but the coffee shop side includes   seating areas to chat with friends  play games and do work with plugs along the walls  and a quiet study room  there is free wifi  but it s not the most reliable  they sell coffee  espresso drinks  italian soda  bottled juices  beer   wine until midnight  and food includes paninis  premodern salad or quiche  pie by the slice and pastries   bagels  the staff is excellent but there is only one register to order so the line can get long  we love to come to work on our laptops outside in the back  which features picnic tables with umbrellas and a canopy and a large fan 

In [223]:
new_df.sort_values('topic_1', ascending=False)[:3]

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,review,rating,name
7317,0.0,0.24731,0.0,0.0,0.0,0.0,0.0,0.0,good breakfast tacos and decent coffee...,3,Live Oak Market
3644,0.000635,0.246662,0.0,0.0,0.005781,0.0,0.0,0.0,everything i have had here is amazing ...,5,Cenote
1599,0.0,0.246372,0.0,0.0,0.0,0.0,0.0,0.0,beware of their breakfast tacos they ...,3,Summer Moon Coffee Bar


In [224]:
df.review[7317]

'       good breakfast tacos  and decent coffee   i dunno what else to say  '

In [225]:
df.review[3664]

'       dedicated baristas  never a bad latte  delicious food  instagram   picturesque  austin vibe  colorful and all around awesome at all hours of the  day  '

In [226]:
new_df.sort_values('topic_2', ascending=False)[:3]

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,review,rating,name
6187,0.0,0.001797,0.124991,0.0,0.0,0.0,0.0,0.0,check in listed in reviews in do...,2,Stinson’s
578,0.0,0.0,0.122489,0.0,0.0,0.0,0.0,0.0,i am a resident at imt and i like to st...,2,Apanas Coffee & Beer
2929,0.006128,0.0,0.11988,0.006582,0.0,0.0,0.0,0.0,footnotes to my terrible experience cr...,1,Thunderbird Coffee


In [227]:
df.review[6187]

'         check in listed in   reviews in    done and done and done   i m pretty much a coffee shop expert  this morning  i waited   minutes in line for coffee  the guy in front of me waited     i make that point because there was no one in front of him   what there was was a barista who also had to tend the register who was preparing a  drink for someone who came in around minute   and didn t have to wait in line   it s     am     i came here because i wanted an alternative to the ridiculous parking at pacha and even drove through their lot just in case    i was praying for a great experience    that  ruined  it   come on    basic customer service in the morning says you handle the people in line in the order they come in    no special treatment   the dude in front of me was mad    and you could tell    but he kept his composure by doing what looked like a peepee dance     i on the other hand turned my powers to yelp   stinson s has a cool venue    the old converted gas station or veh

In [229]:
df.review[578]

'       i am a resident at imt and i like to stop here when i take my pup for a walk  they have poor customer service  boring demeanor  and i feel like the employees may need a few shots of espresso themselves  no energy  or welcoming smile at this place  i have years of retail management experience and this would never fly   the first time i stepped into the shop a couple weeks ago i was so happy that a coffee shop was so close to my place and i love supporting local  well     i peeked my head in to see if it was ok to bring my dog for a short moment to order and go  i saw a woman at a table near the register with hers so i figured it was fine but wanted to keep us by the door so not to disturb people relaxing  the lady s order was called and when she stepped up the cashier told her next time your dog is not allowed  she said ok sorry and went out in the patio to have her drink  i asked her if she minded that i tie mine by the door  and she told me i come here all the time and have ne

looks like topic 2 is negative reviews about customer service

We want our recommender to give us the name of a coffee shop, and not an individual reveiw, so we are going to aggregate the topic weights for all reveiws for a given shop

In [264]:
grouped_df = new_df.groupby(['name']).mean()
grouped_df

Unnamed: 0_level_0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,rating
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alta’s Cafe,0.028647,0.011104,0.018421,0.007914,0.017422,0.007884,0.004334,0.007049,3.930000
Anderson’s Coffee Company,0.006288,0.000455,0.035010,0.001989,0.026647,0.017241,0.004508,0.010605,4.620000
Apanas Coffee & Beer,0.027755,0.002809,0.017791,0.016638,0.034814,0.005752,0.003983,0.003641,4.580882
Arturo’s Underground Cafe,0.012415,0.035405,0.031274,0.003378,0.020472,0.009828,0.006452,0.002625,4.300000
Bennu Coffee,0.031105,0.002171,0.021774,0.013902,0.021223,0.008568,0.011415,0.003386,4.130000
...,...,...,...,...,...,...,...,...,...
Thunderbird Coffee,0.035359,0.008218,0.021365,0.025463,0.016057,0.006235,0.004854,0.004642,3.970000
Trianon Coffee,0.021453,0.014480,0.022739,0.014326,0.033855,0.007879,0.004513,0.006964,4.020408
Tuscany At 360,0.031553,0.017411,0.019970,0.008707,0.010991,0.010074,0.000915,0.003101,3.818182
Venezia Italian Gelato,0.008464,0.001657,0.018749,0.002764,0.024429,0.001596,0.048571,0.001011,4.810000


In [265]:
#testing that the evaluation is working, rating is included as a metric
from sklearn.metrics import pairwise_distances
pairwise_distances(grouped_df.iloc[0].to_numpy().reshape(1,-1),grouped_df).argsort()

array([[ 0, 28, 69, 17, 39, 12, 73, 56, 55, 66, 67, 71, 74, 29,  7, 75,
        11, 15, 20, 61, 38, 48, 47, 14, 43,  4, 36, 65, 53, 51, 77,  8,
        32, 34, 54, 64, 25, 62, 60, 31, 35, 33,  3, 63, 18, 19, 10, 58,
        52, 57, 68, 45, 21,  9,  6, 44, 59, 50, 22, 41, 13, 27, 23,  2,
         1, 30, 16, 49, 24, 40, 76, 37, 26, 72,  5, 46, 42, 70]])

In [124]:
print(grouped_df.iloc[0])
print(grouped_df.iloc[28])
print(grouped_df.iloc[71])

0         0.028647
1         0.011104
2         0.018421
3         0.007914
4         0.017422
5         0.007884
6         0.004334
7         0.007049
rating    3.930000
Name: Alta’s Cafe , dtype: float64
0         0.031632
1         0.006454
2         0.020456
3         0.014290
4         0.022471
5         0.010089
6         0.002929
7         0.004606
rating    3.950000
Name: Genuine Joe , dtype: float64
0         0.005669
1         0.000293
2         0.014385
3         0.000059
4         0.026851
5         0.059677
6         0.001618
7         0.000443
rating    5.000000
Name: The Marvelous Vintage Tea Party Co. , dtype: float64


Looks like the most and least similar coffee shops based of topic is reasonable, we'll move on to seeing if we can put in a recommendation with just a simple keyword

In [196]:
key = ['atmosphere']

In [197]:
vt = cv2.transform(key)

In [198]:
tt = nmf_model.transform(vt)

In [199]:
pairwise_distances(tt,grouped_df.drop(columns='rating')).argsort()


array([[33,  2, 75, 31,  6, 41, 30, 19, 18, 22, 35, 27, 56, 73, 13, 34,
        37,  1, 28, 47,  9,  0, 43, 65, 68,  4, 70, 23, 53, 61, 32, 36,
        26, 44,  8, 46, 60, 24, 58, 29, 17, 59, 20, 51, 62,  7, 55, 69,
        12, 50, 48, 76, 25, 74, 38, 52,  3, 39, 77, 11, 57, 40, 54, 10,
         5, 78, 45, 15, 49, 71, 67, 21, 64, 63, 42, 16, 72, 66, 14]])

In [201]:
print(grouped_df.iloc[33].name,\
      grouped_df.iloc[2].name,\
      grouped_df.iloc[75].name)


Irie Bean Coffee Bar  Apanas Coffee & Beer  Trianon Coffee 


In [272]:
def get_coffee_recs(string_lst,n_recs=3, df=grouped_df,vect=cv2,model=nmf_model):
    vt = cv2.transform(string_lst)
    tt = model.transform(vt)
    top_n = pairwise_distances(tt,df.drop(columns='rating')).argsort().tolist()[0][:n_recs]
    recs = []
    for i in top_n:
        recs.append((df.iloc[i].name, df.iloc[i].rating.round(2)))
    return recs

In [266]:
keyword = ['friendly']
get_coffee_recs(keyword)

[('Irie Bean Coffee Bar ', 4.3),
 ('Live Oak Market ', 4.51),
 ('Apanas Coffee & Beer ', 4.58)]

In [273]:
keyword = ['latte']
get_coffee_recs(keyword)

[('Summermoon Coffee Bar ', 4.31),
 ('Brian’s Brew ', 4.84),
 ('Lola Savannah Coffee Downtown ', 5.0)]

In [240]:
keyword = ['work']
get_coffee_recs(keyword)

[('Flightpath Coffeehouse ', 4.23),
 ('Friends & Neighbors ', 4.55),
 ('Radio Coffee & Beer ', 4.0)]