In [23]:
import pandas as pd
import spacy
import numpy as np
import re
import string
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from pymongo import MongoClient
from sklearn.metrics import pairwise_distances
from joblib import dump
# Load spaCy's small English pipeline once; later cells call `nlp.pipe` on the
# review text and read token attributes (lemma, stop-word / punctuation flags).
nlp = spacy.load('en_core_web_sm')

In [2]:
# Connect to the MongoDB server on localhost (port 27017) and show which
# databases it exposes, as a quick sanity check before reading any data.
client = MongoClient("mongodb://localhost:27017")
client.list_database_names()

['admin', 'books', 'coffee_reviews', 'config', 'events', 'local', 'outings']

In [3]:
# Select the `coffee_reviews` database and list its collections; the next
# cell reads from `reviews_col`.
reviews_mongo = client.coffee_reviews
reviews_mongo.list_collection_names()

['reviews_col']

In [4]:
# Aggregation pipeline with a single `$unwind` stage on the `reviews` field.
unwind_reviews = [{'$unwind': '$reviews'}]
review_records = list(reviews_mongo.reviews_col.aggregate(unwind_reviews))

reviews_df = pd.DataFrame(review_records)
# Columns are renamed by position.
# NOTE(review): this assumes the aggregated documents expose exactly three
# fields in this order -- confirm against the collection schema.
reviews_df.columns = ['ID', 'shop_name', 'reviews']

In [5]:
# Human-readable shop names. A later cell pairs them positionally with the
# distinct values of `reviews_df.ID`, so the order of this list matters.
files2 = [
    'Lighthouse Roasters',
    'Anchorhead Coffee',
    'Coffeeholic House',
    'Espresso Vivace Roasteria',
    'Milstead',
    'Moonshot Coffee',
    'Moore Coffee Shop',
    'Santo Coffee',
    'Storyville Coffee Company',
    'Sugar Bakery & Coffeehouse',
]

In [7]:
# Pair each distinct ID (in order of first appearance) with the shop name at
# the same position in `files2`.
# NOTE(review): this relies on the documents coming back in the same order as
# `files2` -- confirm, since a mismatch would silently mislabel shops.
shop_ids = reviews_df['ID'].unique()
if len(shop_ids) != len(files2):
    # A count mismatch means the positional pairing cannot be trusted.
    raise ValueError(
        f'expected {len(files2)} distinct shop IDs, found {len(shop_ids)}')

# One replace restricted to the ID column: no hard-coded shop count, no
# repeated unique() scans, and review text can never be touched by accident.
reviews_df['ID'] = reviews_df['ID'].replace(dict(zip(shop_ids, files2)))
reviews_df.ID.unique()

array(['Lighthouse Roasters', 'Anchorhead Coffee', 'Coffeeholic House',
       'Espresso Vivace Roasteria', 'Milstead', 'Moonshot Coffee',
       'Moore Coffee Shop', 'Santo Coffee', 'Storyville Coffee Company',
       'Sugar Bakery & Coffeehouse'], dtype=object)

In [8]:
# `shop_name` is not used again once `ID` carries the readable shop name.
# errors='ignore' keeps this cell re-runnable: a second execution is a no-op
# instead of raising KeyError because the column is already gone.
reviews_df = reviews_df.drop(columns='shop_name', errors='ignore')

In [9]:
# Patterns are compiled once. The raw string keeps the regex escapes (\w, \d)
# from being parsed as invalid Python string escapes, which newer Python
# versions warn about.
_TOKEN_WITH_DIGIT = re.compile(r'\w*\d\w*')
_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))


def alphanumeric(x):
    """Replace every run of word characters that contains a digit with a space.

    Args:
        x: Text to clean.

    Returns:
        ``x`` with tokens such as ``'123'`` or ``'d4f'`` each replaced by a
        single space; all other characters are left untouched.
    """
    return _TOKEN_WITH_DIGIT.sub(' ', x)


def punc_lower(x):
    """Lower-case ``x`` and replace each ``string.punctuation`` character with a space.

    Args:
        x: Text to clean.

    Returns:
        The lower-cased text with every ASCII punctuation character replaced
        by one space (so the string length is unchanged).
    """
    return _PUNCTUATION.sub(' ', x.lower())

# Normalise each review in one pass: strip digit-bearing tokens first, then
# lower-case and blank out punctuation.
reviews_df['reviews'] = reviews_df['reviews'].map(
    lambda text: punc_lower(alphanumeric(text))
)
reviews_df.head()

Unnamed: 0,ID,reviews
0,Lighthouse Roasters,this café s been around for quite some time an...
1,Lighthouse Roasters,these guys deserve five stars and here s why ...
2,Lighthouse Roasters,lighthouse is simply the best my roommates an...
3,Lighthouse Roasters,picked up a bag of original and their featured...
4,Lighthouse Roasters,would recommend this place i got a latte a...


In [12]:
# Stream all cleaned reviews through the spaCy pipeline and keep the parsed
# documents both as a list and as a DataFrame column.
docs = [doc for doc in nlp.pipe(reviews_df['reviews'])]
reviews_df['spacy_doc'] = docs

In [13]:
def _keep_token(tok):
    """Return True for tokens that should survive cleaning.

    A token whose lemma is 'not' is always kept; otherwise the token is kept
    only if it is not a stop word, not punctuation and not number-like.
    """
    if tok.lemma_ == 'not':
        return True
    return not (tok.is_stop or tok.is_punct or tok.like_num)


# One list of lower-cased lemmas per review.
docs_clean = [
    [tok.lemma_.lower() for tok in doc if _keep_token(tok)]
    for doc in docs
]
reviews_df['docs_clean'] = docs_clean

In [14]:
# Re-join each lemma list into one space-separated string for the vectorizer.
docs_list_clean = list(map(' '.join, docs_clean))
docs_list_clean[0]

'café s time literally feel love go coffee   lighthouse understand craft   ve get latte   iced hot   amazing round espresso     bring dog   dog biscuit ready   '

In [16]:
# Extend scikit-learn's English stop words with extra words to ignore.
# TfidfVectorizer documents `stop_words` as 'english', a list or None, and
# recent scikit-learn releases reject a frozenset, so pass a (sorted) list.
stop_words = sorted(ENGLISH_STOP_WORDS.union(['coffee', 'gelato', 'good', 'great', 'like']))

# Uni- to tri-grams; keep at most 1000 features that occur in at least 10
# reviews and in no more than 40% of them.
cv = TfidfVectorizer(stop_words=stop_words, max_features=1000, min_df=10, max_df=.4, ngram_range=(1, 3))
X = cv.fit_transform(docs_list_clean)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older installs.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
reviews_df_X = pd.DataFrame(X.toarray(), columns=feature_names)
reviews_df_X.shape

(6057, 1000)

In [17]:
# Factorise the TF-IDF matrix into 8 topics. A fixed random_state makes the
# factorisation repeatable, which matters because later cells label topic
# columns by position and the fitted model is persisted for reuse.
nmf_model = NMF(n_components=8, random_state=42)
nmf_doc_topic = nmf_model.fit_transform(X)
nmf_doc_topic.shape



(6057, 8)

In [18]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted features of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``argsort``).
        feature_names: Sequence mapping a feature index to its name.
        no_top_words: Number of top features to print per topic.
        topic_names: Optional sequence of topic labels; a missing or falsy
            label falls back to the numeric topic index.
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)
        # Indices of the largest weights, biggest first.
        ranked = weights.argsort()[::-1][:no_top_words]
        print(", ".join(feature_names[j] for j in ranked))

In [19]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older installs.
topic_feature_names = (cv.get_feature_names_out()
                       if hasattr(cv, 'get_feature_names_out')
                       else cv.get_feature_names())
# Show the 10 highest-weighted terms of each NMF topic.
display_topics(nmf_model, topic_feature_names, 10)


Topic  0
espresso, time, barista, ve, drink, cup, work, bean, come, seattle

Topic  1
latte, art, latte art, cute, waffle, nutella, mocha, moore, shop, come

Topic  2
ube, coffeeholic, drink, dream, coffeeholic dream, vietnamese, sweet, order, try, drizzle

Topic  3
pike, market, pike place, place market, pike place market, view, storyville, floor, pike market, away

Topic  4
brew, cold, cold brew, quaffle, nitro, honey, brew latte, cold brew latte, biscuit, pour

Topic  5
love, friendly, staff, shop, service, amazing, seattle, atmosphere, recommend, favorite

Topic  6
free, cake, chocolate, mug, free mug, chocolate cake, email, slice, sign, mocha

Topic  7
milk, latte, almond, almond milk, matcha, pistachio, croissant, milk latte, cinnamon, taste


In [20]:
# One row per review, one column per topic. Column names are derived from the
# matrix width so that changing the number of NMF components does not break
# this cell with a length-mismatch error.
n_topics = nmf_doc_topic.shape[1]
new_df = pd.DataFrame(nmf_doc_topic, columns=[f'topic_{i}' for i in range(n_topics)])

In [21]:
# Attach the shop name and cleaned review text to each topic row; the
# assignment aligns on the row index of the two frames.
new_df['name'] = reviews_df['ID']
new_df['review'] = reviews_df['reviews']
new_df.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,name,review
0,0.031732,0.009188,0.000127,0.0,0.001266,0.020977,0.003625,0.015078,Lighthouse Roasters,this café s been around for quite some time an...
1,0.071553,0.0,0.010768,0.0,0.0,0.017008,0.003369,0.000795,Lighthouse Roasters,these guys deserve five stars and here s why ...
2,0.024035,0.007552,0.0,0.0,0.0,0.036384,0.003096,0.0,Lighthouse Roasters,lighthouse is simply the best my roommates an...
3,0.058073,0.0,0.005109,0.0,0.003933,0.021679,0.0,0.0,Lighthouse Roasters,picked up a bag of original and their featured...
4,0.0,0.088421,0.0,0.0,0.000409,0.087862,0.0,0.017836,Lighthouse Roasters,would recommend this place i got a latte a...


In [22]:
# Average topic weights per shop. `new_df` also holds the text column
# `review`; numeric_only=True leaves it out explicitly, because pandas >= 2.0
# raises TypeError rather than silently dropping non-numeric columns.
grouped_df = new_df.groupby(['name']).mean(numeric_only=True)
grouped_df.head()

Unnamed: 0_level_0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Anchorhead Coffee,0.021478,0.011447,0.006148,0.003323,0.038475,0.029756,0.006121,0.039415
Coffeeholic House,0.015423,0.007661,0.111339,0.001149,0.002082,0.024691,0.002943,0.00917
Espresso Vivace Roasteria,0.042371,0.011452,0.005088,0.0017,0.002634,0.019173,0.005391,0.010957
Lighthouse Roasters,0.042727,0.007023,0.00337,0.001586,0.004203,0.022793,0.003491,0.005139
Milstead,0.034366,0.009314,0.003101,0.003191,0.010293,0.033714,0.008532,0.010907


In [32]:
def get_coffee_recs(string_lst,n_recs=3, df=grouped_df,vect=cv,model=nmf_model):
    """Recommend the shops whose topic profile is closest to a text query.

    Args:
        string_lst: List of query strings. Every string is transformed, but
            only the first one is used for ranking.
        n_recs: Maximum number of recommendations to return.
        df: DataFrame with one row per shop (shop name as index) holding the
            shop's topic weights. Defaults to the ``grouped_df`` that exists
            when this function is defined.
            NOTE(review): presumably its columns must match ``model``'s
            topics in number and order -- confirm.
        vect: Fitted vectorizer used to turn the query into features.
        model: Fitted topic model used to project the features onto topics.

    Returns:
        List of up to ``n_recs`` index labels of ``df``, nearest first, using
        the default metric of ``pairwise_distances``.
    """
    # Use the vectorizer that was passed in; the previous code always read the
    # global `cv`, so a caller-supplied `vect` was silently ignored.
    query_features = vect.transform(string_lst)
    query_topics = model.transform(query_features)
    # Row 0 ranks every shop by distance to the first query string.
    nearest = pairwise_distances(query_topics, df).argsort()[0, :n_recs]
    return [df.index[i] for i in nearest]

In [34]:
# Smoke test: query with a single keyword and ask for the one closest shop.
keyword = ['friendly']
get_coffee_recs(keyword,n_recs=1)

['Santo Coffee']

It works! Next, save the per-shop topic DataFrame, the vectorizer, and the NMF model so that the web app can load them.

In [38]:
# Persist the per-shop topic averages (index included) as CSV, using a path
# relative to the current working directory.
grouped_df.to_csv('SeaCoffeeRecModel_df.csv')

In [37]:
# Serialise the fitted NMF model and the fitted TF-IDF vectorizer with joblib
# so they can be reloaded without refitting. The cell echoes the return value
# of the last dump() call.
dump(nmf_model, 'SeaCoffeeRecModel_nmf.joblib')
dump(cv, 'SeaCoffeeRecModel_cv.joblib')

['SeaCoffeeRecModel_cv.joblib']