In [3]:
import spacy

# Load the pipeline named "en_core_web_sm" and bind it to `nlp`.
nlp = spacy.load("en_core_web_sm")

In [6]:
# Sample sentence for the tokenization and lemma/stop-word cells that follow.
text = "Tea is healthy and calming, don't you think?"

# Calling the pipeline on a string returns the doc that later cells iterate.
doc = nlp(text)

In [8]:
# Iterating a doc yields its tokens; print the text of each on its own line.
for tok in doc:
    print(tok.text)

Tea
is
healthy
and
calming
,
do
n't
you
think
?


In [10]:
# Terminology for the table printed by this cell:
#   lemma    - the base form of a word
#   stopword - a word that doesn't contain much information

header = "Token \t\tLemma \t\tStopword"
print(header)
print("-" * 40)

# One tab-separated row per token: its text, its lemma, and its stop-word flag.
rows = [
    f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}"
    for token in doc
]
for row in rows:
    print(row)

Token 		Lemma 		Stopword
----------------------------------------
Tea		tea		False
is		be		True
healthy		healthy		False
and		and		True
calming		calm		False
,		,		False
do		do		True
n't		n't		True
you		you		True
think		think		False
?		?		False


In [12]:
from spacy.matcher import PhraseMatcher

# attr="LOWER" makes the matcher compare lowercased token text (case-insensitive matching).
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

In [14]:
# Product names to search for.
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']

# The matcher compares token text only, so the patterns just need tokenizing:
# nlp.make_doc runs the tokenizer without the rest of the pipeline.
# The loop variable is `term` so it does not shadow the notebook-level `text`.
patterns = [nlp.make_doc(term) for term in terms]

# Register every pattern under a single label; match tuples carry that label's id.
matcher.add('TerminologyList', patterns)

In [19]:
# Review snippet to scan for the registered product names.
text = (
    "Glowing review overall, and some really interesting side-by-side "
    "photography tests pitting the iPhone 11 Pro against the "
    "Galaxy Note 10 Plus and last year's iPhone XS and Google Pixel 3."
)

text_doc = nlp(text)

# Each match is a (match_id, start, end) tuple; start/end slice into text_doc.
matches = matcher(text_doc)
print(matches)

[(3766102292120407359, 17, 19), (3766102292120407359, 22, 24), (3766102292120407359, 30, 32), (3766102292120407359, 33, 35)]


In [21]:
# Unpack the first match; start/end are the bounds used to slice the doc.
first_match = matches[0]
match_id, start, end = first_match

# The vocab's string store maps the match id back to the label it was added under.
label = nlp.vocab.strings[match_id]
print(f"{label} match: ", text_doc[start:end])

TerminologyList match:  iPhone 11


In [26]:
import pandas as pd

# Load the restaurant reviews from the JSON file under ./data.
reviews_df = pd.read_json('./data/restaurant.json')

# Menu item names to look for in the review text. Singular/plural forms and
# alternate spellings are listed as separate entries.
# NOTE(review): "Bruchetta" may be a misspelling of "Bruschetta" - confirm
# against the real menu before changing it, since it alters what gets matched.
menu = [
    "Cheese Steak", "Cheesesteak", "Steak and Cheese", "Italian Combo",
    "Tiramisu", "Cannoli", "Chicken Salad", "Chicken Spinach Salad",
    "Meatball", "Pizza", "Pizzas", "Spaghetti",
    "Bruchetta", "Eggplant", "Italian Beef", "Purista",
    "Pasta", "Calzones", "Calzone", "Italian Sausage",
    "Chicken Cutlet", "Chicken Parm", "Chicken Parmesan", "Gnocchi",
    "Chicken Pesto", "Turkey Sandwich", "Turkey Breast", "Ziti",
    "Portobello", "Reuben", "Mozzarella Caprese", "Corned Beef",
    "Garlic Bread", "Pastrami", "Roast Beef", "Tuna Salad",
    "Lasagna", "Artichoke Salad", "Fettuccini Alfredo", "Chicken Parmigiana",
    "Grilled Veggie", "Grilled Veggies", "Grilled Vegetable", "Mac and Cheese",
    "Macaroni", "Prosciutto", "Salami",
]

# Preview the first rows of the loaded frame.
reviews_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
109,lDJIaF4eYRF4F7g6Zb9euw,lb0QUR5bc4O-Am4hNq9ZGg,r5PLDU-4mSbde5XekTXSCA,4,2,0,0,I used to work food service and my manager at ...,2013-01-27 17:54:54
1013,vvIzf3pr8lTqE_AOsxmgaA,MAmijW4ooUzujkufYYLMeQ,r5PLDU-4mSbde5XekTXSCA,4,0,0,0,We have been trying Eggplant sandwiches all ov...,2015-04-15 04:50:56
1204,UF-JqzMczZ8vvp_4tPK3bQ,slfi6gf_qEYTXy90Sw93sg,r5PLDU-4mSbde5XekTXSCA,5,1,0,0,Amazing Steak and Cheese... Better than any Ph...,2011-03-20 00:57:45
1251,geUJGrKhXynxDC2uvERsLw,N_-UepOzAsuDQwOUtfRFGw,r5PLDU-4mSbde5XekTXSCA,1,0,0,0,Although I have been going to DeFalco's for ye...,2018-07-17 01:48:23
1354,aPctXPeZW3kDq36TRm-CqA,139hD7gkZVzSvSzDPwhNNw,r5PLDU-4mSbde5XekTXSCA,2,0,0,0,"Highs: Ambience, value, pizza and deserts. Thi...",2018-01-21 10:52:58


In [33]:
import spacy
from spacy.matcher import PhraseMatcher

# Pick a single review by position to try the matcher on.
test_review_index = 14
test_text = reviews_df['text'].iloc[test_review_index]
test_text

"The Il Purista sandwich has become a staple of my life. Mozzarella, basil, prosciutto, roasted red peppers and balsamic vinaigrette blend into a front runner for the best sandwich in the valley. Goes great with sparkling water or a beer. \n\nDeFalco's also has other Italian fare such as a delicious meatball sub and classic pastas."

In [35]:
# Rebind `nlp` to a blank English pipeline; this replaces the earlier `nlp` object.
nlp = spacy.blank("en")

# Phrase matcher over the menu item names, comparing lowercased token text.
# This also replaces the earlier `matcher` object.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
menu_tokens = list(map(nlp, menu))
matcher.add("MenuItems", menu_tokens)

# Run the matcher over the sample review and display the raw match tuples.
review_doc = nlp(test_text)
matches = matcher(review_doc)
matches

[(533989841420491665, 2, 3),
 (533989841420491665, 16, 17),
 (533989841420491665, 58, 59)]

In [38]:
# Print the lowercased text of every matched span; the match id is not needed here.
for _match_id, start, end in matches:
    matched_span = review_doc[start:end]
    print(matched_span.text.lower())

purista
prosciutto
meatball


In [39]:
from collections import defaultdict

# Maps each matched menu item (lowercased span text) to the list of star
# ratings of the reviews that mention it.
item_ratings = defaultdict(list)

# Iterate the `reviews_df` frame loaded from the JSON file, so this cell runs
# on a fresh kernel without relying on a name bound outside the notebook.
for _, review in reviews_df.iterrows():
    doc = nlp(review['text'])
    matches = matcher(doc)

    # A set, so an item mentioned several times in one review contributes
    # that review's rating only once.
    found_items: set = {
        doc[start:end].text.lower() for _match_id, start, end in matches
    }

    for item in found_items:
        item_ratings[item].append(review['stars'])

In [42]:
from numpy import mean

# Mean star rating for every item that has collected at least one rating.
mean_ratings: dict = {
    name: mean(stars) for name, stars in item_ratings.items()
}

# min() iterates the dict's keys and ranks them by their mean rating.
worst_item = min(mean_ratings, key=lambda name: mean_ratings[name])
worst_item

'chicken cutlet'

In [44]:
# Number of ratings collected for each menu item.
counts: dict = {name: len(stars) for name, stars in item_ratings.items()}

# Item names ordered from the most ratings to the fewest.
item_counts = sorted(counts, key=lambda name: counts[name], reverse=True)

# Right-align the item name in 25 columns and its count in 5.
for item in item_counts:
    print(f"{item:>25}{counts[item]:>5}")

                    pizza  265
                    pasta  206
                 meatball  128
              cheesesteak   97
             cheese steak   76
                  cannoli   72
                  calzone   72
                 eggplant   69
                  purista   63
                  lasagna   59
          italian sausage   53
               prosciutto   50
             chicken parm   50
             garlic bread   39
                  gnocchi   37
                spaghetti   36
                 calzones   35
                   pizzas   32
                   salami   28
            chicken pesto   27
             italian beef   25
            italian combo   21
                 tiramisu   21
                     ziti   21
         chicken parmesan   19
       chicken parmigiana   17
               portobello   14
           mac and cheese   11
           chicken cutlet   10
         steak and cheese    9
                 pastrami    9
               roast beef    7
       f

In [47]:
# Item names ordered by mean rating, lowest first.
sorted_ratings = sorted(mean_ratings, key=lambda name: mean_ratings[name])


def _print_rating_rows(items):
    """Print one line per item showing its mean rating and its rating count."""
    for item in items:
        print(
            f"{item:20} avg rating: {mean_ratings[item]:.2f} \tcount: {counts[item]}"
        )


print("Worst rated menu items:\n")
_print_rating_rows(sorted_ratings[:10])

print('\n\n\n')

print("Best rated menu items:\n")
# The last ten entries stay in ascending order, so the top-rated item prints last.
_print_rating_rows(sorted_ratings[-10:])

Worst rated menu items:

chicken cutlet       avg rating: 3.40 	count: 10
turkey sandwich      avg rating: 3.80 	count: 5
spaghetti            avg rating: 3.89 	count: 36
italian beef         avg rating: 3.92 	count: 25
tuna salad           avg rating: 4.00 	count: 5
macaroni             avg rating: 4.00 	count: 5
italian combo        avg rating: 4.05 	count: 21
garlic bread         avg rating: 4.13 	count: 39
roast beef           avg rating: 4.14 	count: 7
eggplant             avg rating: 4.16 	count: 69




Best rated menu items:

chicken pesto        avg rating: 4.56 	count: 27
chicken salad        avg rating: 4.60 	count: 5
purista              avg rating: 4.67 	count: 63
prosciutto           avg rating: 4.68 	count: 50
reuben               avg rating: 4.75 	count: 4
steak and cheese     avg rating: 4.89 	count: 9
artichoke salad      avg rating: 5.00 	count: 5
fettuccini alfredo   avg rating: 5.00 	count: 6
turkey breast        avg rating: 5.00 	count: 1
corned beef          avg r