In [1]:
import numpy as np
import pandas as pd
from pprint import pprint

from nltk.tokenize import sent_tokenize, word_tokenize
from textblob import TextBlob

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import json
import pickle

%load_ext autoreload
%autoreload 2

In [2]:
# Read the cleaned review records from disk. The file is opened in binary
# mode and the raw bytes are handed to the JSON decoder.
with open('data/reviews_clean.json', 'rb') as f:
    data_full = json.loads(f.read())

# Work on a small window (at most six records) of the full dataset.
data = data_full[1115:1121]

# Reviews of the second record in that window, and the 'menu' field of its
# first 'menu_data' entry.
review = data[1]['reviews']
menu = data[1]['menu_data'][0]['menu']

In [4]:
# One entry per record in `data`, in the same order: the record's reviews and
# the 'menu' field of its first 'menu_data' entry.
all_reviews = [record['reviews'] for record in data]
all_menus = [record['menu_data'][0]['menu'] for record in data]

In [54]:
# Pool the individual reviews of records 1..3 of the sample into one flat list.
flat_reviews_all = [
    text
    for record_reviews in all_reviews[1:4]
    for text in record_reviews
]

In [55]:
# Split every review into sentences and collect them in a single flat list.
# Built in one pass so the name is never bound to the intermediate
# list-of-lists shape.
review_sents_all = [
    sentence
    for review_text in flat_reviews_all
    for sentence in sent_tokenize(review_text)
]

In [57]:
# Display the full flat list of sentences produced by the cell above.
review_sents_all

['I am discerning when it comes to five star reviews.',
 "Oh, 4-star reviews, I'll hand those out with reckless abandon.",
 'Ha!',
 'However, a trip to Kirsh left me feeling reeeeally really good.',
 'The Beau and I went for brunch on a Saturday afternoon around 1 p.m.; props to the new Yelp feature I was unaware of, in which the host put our name into their system and I got a text link telling me our approximate wait time AND the number of parties in front of us!',
 'After a scant 10 minute wait, we were seating in their bright - though a tad tight - dining area.',
 'Thrillist recently toted Kirsh as having the best French Toast on the Upper West Side, and that is a very fair assessment!',
 'There are a variety of options - both sweet and savory - and he went for the sweet Cinnamon Pear version, complete with mascarpone cheese on the side.',
 'This truly epitomized excellent French toast: the bread was thick, but airy.',
 'The flavor was buttery, fruity, and spicy without being overwh

In [67]:
# POS-tag each sentence with TextBlob: pos_tags[k] is the list of
# (word, tag) pairs for review_sents_all[k].
pos_tags = [TextBlob(sent).tags for sent in review_sents_all]

In [68]:
# Display the per-sentence lists of (word, tag) pairs.
pos_tags

[[('I', 'PRP'),
  ('am', 'VBP'),
  ('discerning', 'VBG'),
  ('when', 'WRB'),
  ('it', 'PRP'),
  ('comes', 'VBZ'),
  ('to', 'TO'),
  ('five', 'CD'),
  ('star', 'NN'),
  ('reviews', 'NNS')],
 [('Oh', 'UH'),
  ('4-star', 'JJ'),
  ('reviews', 'NNS'),
  ('I', 'PRP'),
  ("'ll", 'MD'),
  ('hand', 'NN'),
  ('those', 'DT'),
  ('out', 'IN'),
  ('with', 'IN'),
  ('reckless', 'JJ'),
  ('abandon', 'NN')],
 [('Ha', 'NN')],
 [('However', 'RB'),
  ('a', 'DT'),
  ('trip', 'NN'),
  ('to', 'TO'),
  ('Kirsh', 'NNP'),
  ('left', 'VBD'),
  ('me', 'PRP'),
  ('feeling', 'VBG'),
  ('reeeeally', 'RB'),
  ('really', 'RB'),
  ('good', 'JJ')],
 [('The', 'DT'),
  ('Beau', 'NNP'),
  ('and', 'CC'),
  ('I', 'PRP'),
  ('went', 'VBD'),
  ('for', 'IN'),
  ('brunch', 'NN'),
  ('on', 'IN'),
  ('a', 'DT'),
  ('Saturday', 'NNP'),
  ('afternoon', 'NN'),
  ('around', 'RB'),
  ('1', 'CD'),
  ('p.m.', 'NN'),
  ('props', 'NNS'),
  ('to', 'TO'),
  ('the', 'DT'),
  ('new', 'JJ'),
  ('Yelp', 'NNP'),
  ('feature', 'NN'),
  ('I', 'PRP

In [None]:
# Penn Treebank noun tags: proper noun, singular/mass noun, plural noun.
# Stored as strings so the cell runs; the bare names raised NameError.
# NOTE(review): presumably the tags of interest for entity labelling - confirm.
NOUN_TAGS = ('NNP', 'NN', 'NNS')

In [71]:
# Extract noun phrases per sentence with TextBlob: nps[k] holds the noun
# phrases found in review_sents_all[k] (possibly none).
nps = [TextBlob(sent).noun_phrases for sent in review_sents_all]

In [72]:
# Display the noun phrases extracted for each sentence.
nps

[WordList(['star reviews']),
 WordList(['oh', '4-star reviews', "'ll hand", 'reckless abandon']),
 WordList(['ha']),
 WordList(['kirsh']),
 WordList(['beau', 'yelp', 'text link', 'and']),
 WordList([]),
 WordList(['thrillist', 'kirsh', 'toast', 'upper', 'side', 'fair assessment']),
 WordList(['cinnamon pear', 'mascarpone cheese']),
 WordList([]),
 WordList([]),
 WordList(['nice alternative']),
 WordList(["n't order", 'too', 'blte', 'blt', 'emmental cheese', 'blts', 'ever', 'life']),
 WordList(['blts']),
 WordList(['blt', 'sad sandwich']),
 WordList(['haaa']),
 WordList(['buttery roll']),
 WordList(['epic ranch dressing']),
 WordList(['damn']),
 WordList(['delicious']),
 WordList([]),
 WordList(['ha']),
 WordList([]),
 WordList([]),
 WordList(['accidents']),
 WordList(['kirsh']),
 WordList(['own lovely menu']),
 WordList(['kirsh', 'epic gem']),
 WordList(['went']),
 WordList(['lox french']),
 WordList(['-lox french']),
 WordList([]),
 WordList(['pretty light']),
 WordList([]),
 WordList

In [73]:
# Display the sentence list again, for comparison with the noun phrases above.
review_sents_all

['I am discerning when it comes to five star reviews.',
 "Oh, 4-star reviews, I'll hand those out with reckless abandon.",
 'Ha!',
 'However, a trip to Kirsh left me feeling reeeeally really good.',
 'The Beau and I went for brunch on a Saturday afternoon around 1 p.m.; props to the new Yelp feature I was unaware of, in which the host put our name into their system and I got a text link telling me our approximate wait time AND the number of parties in front of us!',
 'After a scant 10 minute wait, we were seating in their bright - though a tad tight - dining area.',
 'Thrillist recently toted Kirsh as having the best French Toast on the Upper West Side, and that is a very fair assessment!',
 'There are a variety of options - both sweet and savory - and he went for the sweet Cinnamon Pear version, complete with mascarpone cheese on the side.',
 'This truly epitomized excellent French toast: the bread was thick, but airy.',
 'The flavor was buttery, fruity, and spicy without being overwh

In [81]:
# Noun-phrase total per sentence: np_counts[k] is the sum of the values in
# TextBlob's np_counts mapping for review_sents_all[k].
np_counts = [
    sum(TextBlob(sent).np_counts.values())
    for sent in review_sents_all
]

In [82]:
# Display the per-sentence noun-phrase totals.
np_counts

[1,
 4,
 1,
 1,
 4,
 0,
 6,
 2,
 0,
 0,
 1,
 8,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 3,
 0,
 1,
 0,
 4,
 0,
 1,
 2,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 6,
 4,
 2,
 8,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 2,
 2,
 0,
 1,
 1,
 0,
 0,
 0,
 2,
 0,
 1,
 5,
 4,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 4,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 2,
 2,
 0,
 2,
 2,
 0,
 0,
 1,
 1,
 0,
 0,
 2,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 3,
 2,
 0,
 0,
 2,
 0,
 1,
 2,
 1,
 1,
 1,
 1,
 0,
 2,
 0,
 0,
 3,
 2,
 2,
 0,
 1,
 0,
 2,
 3,
 0,
 0,
 0,
 3,
 2,
 3,
 0,
 6,
 1,
 1,
 0,
 2,
 0,
 0,
 1,
 2,
 1,
 1,
 1,
 2,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 5,
 6,
 2,
 0,
 1,
 2,
 1,
 2,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 2,
 2,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 2,
 2,
 3,
 0,
 1,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 3,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 2,
 0,
 1,
 0,
 2,
 2,
 2,
 1,
 2,
 1,
 0,
 0,
 0,
 0,


In [59]:
# Flatten the per-sentence tag lists into one list of (word, tag) tuples.
# Guarded so that re-running this cell is a no-op: flattening an already-flat
# list would break each (word, tag) tuple apart into bare strings.
if not all(isinstance(item, tuple) for item in pos_tags):
    pos_tags = [pair for sent_tags in pos_tags for pair in sent_tags]

In [60]:
# Scratch check: extending a (word, tag) tuple with a third element by
# concatenation builds a new tuple; pos_tags itself is not modified.
pos_tags[0] + ('test',)

('I', 'PRP', 'test')

In [61]:
# Print every (word, tag) pair next to its position in the flat list.
for position, pair in enumerate(pos_tags):
    print(position, pair)

0 ('I', 'PRP')
1 ('am', 'VBP')
2 ('discerning', 'VBG')
3 ('when', 'WRB')
4 ('it', 'PRP')
5 ('comes', 'VBZ')
6 ('to', 'TO')
7 ('five', 'CD')
8 ('star', 'NN')
9 ('reviews', 'NNS')
10 ('Oh', 'UH')
11 ('4-star', 'JJ')
12 ('reviews', 'NNS')
13 ('I', 'PRP')
14 ("'ll", 'MD')
15 ('hand', 'NN')
16 ('those', 'DT')
17 ('out', 'IN')
18 ('with', 'IN')
19 ('reckless', 'JJ')
20 ('abandon', 'NN')
21 ('Ha', 'NN')
22 ('However', 'RB')
23 ('a', 'DT')
24 ('trip', 'NN')
25 ('to', 'TO')
26 ('Kirsh', 'NNP')
27 ('left', 'VBD')
28 ('me', 'PRP')
29 ('feeling', 'VBG')
30 ('reeeeally', 'RB')
31 ('really', 'RB')
32 ('good', 'JJ')
33 ('The', 'DT')
34 ('Beau', 'NNP')
35 ('and', 'CC')
36 ('I', 'PRP')
37 ('went', 'VBD')
38 ('for', 'IN')
39 ('brunch', 'NN')
40 ('on', 'IN')
41 ('a', 'DT')
42 ('Saturday', 'NNP')
43 ('afternoon', 'NN')
44 ('around', 'RB')
45 ('1', 'CD')
46 ('p.m.', 'NN')
47 ('props', 'NNS')
48 ('to', 'TO')
49 ('the', 'DT')
50 ('new', 'JJ')
51 ('Yelp', 'NNP')
52 ('feature', 'NN')
53 ('I', 'PRP')
54 ('was',

1408 ('too', 'RB')
1409 ('It', 'PRP')
1410 ('took', 'VBD')
1411 ('me', 'PRP')
1412 ('a', 'DT')
1413 ('while', 'NN')
1414 ('to', 'TO')
1415 ('make', 'VB')
1416 ('these', 'DT')
1417 ('choices', 'NNS')
1418 ('as', 'IN')
1419 ('I', 'PRP')
1420 ("'m", 'VBP')
1421 ('almost', 'RB')
1422 ('positive', 'JJ')
1423 ('that', 'IN')
1424 ('everything', 'NN')
1425 ('else', 'RB')
1426 ('on', 'IN')
1427 ('that', 'DT')
1428 ('menu/shelf', 'NN')
1429 ('was', 'VBD')
1430 ('just', 'RB')
1431 ('as', 'RB')
1432 ('good', 'JJ')
1433 ('I', 'PRP')
1434 ('wish', 'VBP')
1435 ('people', 'NNS')
1436 ('at', 'IN')
1437 ('the', 'DT')
1438 ('cashier', 'NN')
1439 ('smiled', 'VBD')
1440 ('more', 'RBR')
1441 ('but', 'CC')
1442 ('then', 'RB')
1443 ('everything', 'NN')
1444 ('matters', 'NNS')
1445 ('less', 'RBR')
1446 ('after', 'IN')
1447 ('you', 'PRP')
1448 ('bite', 'VBP')
1449 ('into', 'IN')
1450 ('carbs', 'JJ')
1451 ('heaven', 'NN')
1452 ('Will', 'MD')
1453 ('be', 'VB')
1454 ('back', 'RB')
1455 ('for', 'IN')
1456 ('that', 

2373 ('more', 'RBR')
2374 ('minty', 'JJ')
2375 ('while', 'IN')
2376 ('it', 'PRP')
2377 ('looked', 'VBD')
2378 ('like', 'IN')
2379 ('a', 'DT')
2380 ('smoothie', 'NN')
2381 ('gone', 'VBN')
2382 ('wrong', 'RB')
2383 ('it', 'PRP')
2384 ('tasted', 'VBD')
2385 ('better', 'RBR')
2386 ('when', 'WRB')
2387 ('it', 'PRP')
2388 ('was', 'VBD')
2389 ('slightly', 'RB')
2390 ('melted', 'VBN')
2391 ('Our', 'PRP$')
2392 ('server', 'NN')
2393 ('was', 'VBD')
2394 ('friendly', 'JJ')
2395 ('and', 'CC')
2396 ('we', 'PRP')
2397 ('received', 'VBD')
2398 ('our', 'PRP$')
2399 ('orders', 'NNS')
2400 ('accurately', 'RB')
2401 ('There', 'EX')
2402 ('was', 'VBD')
2403 ('a', 'DT')
2404 ('slight', 'JJ')
2405 ('lag', 'NN')
2406 ('in', 'IN')
2407 ('getting', 'VBG')
2408 ('the', 'DT')
2409 ('check', 'NN')
2410 ('but', 'CC')
2411 ('it', 'PRP')
2412 ('gave', 'VBD')
2413 ('us', 'PRP')
2414 ('some', 'DT')
2415 ('time', 'NN')
2416 ('to', 'TO')
2417 ('digest', 'VB')
2418 ('Just', 'RB')
2419 ('note', 'NN')
2420 ('that', 'IN')
2

3673 ('you', 'PRP')
3674 ('a', 'DT')
3675 ('much', 'RB')
3676 ('better', 'JJR')
3677 ('lunch', 'NN')
3678 ('elsewhere', 'RB')
3679 ('Just', 'RB')
3680 ('got', 'VBN')
3681 ('chicken', 'VBN')
3682 ('fried', 'JJ')
3683 ('rice', 'NN')
3684 ('and', 'CC')
3685 ('the', 'DT')
3686 ('chicken', 'NN')
3687 ('is', 'VBZ')
3688 ('all', 'DT')
3689 ('burnt', 'NN')
3690 ('The', 'DT')
3691 ('rice', 'NN')
3692 ('has', 'VBZ')
3693 ('a', 'DT')
3694 ('burnt', 'NN')
3695 ('smell', 'NN')
3696 ('now..', 'RB')
3697 ('wont', 'VBD')
3698 ('go', 'VB')
3699 ('back', 'RB')
3700 ('Why', 'WRB')
3701 ('is', 'VBZ')
3702 ("n't", 'RB')
3703 ('there', 'EX')
3704 ('zero', 'CD')
3705 ('stars', 'NNS')
3706 ('I', 'PRP')
3707 ("'ve", 'VBP')
3708 ('tolerated', 'VBN')
3709 ('the', 'DT')
3710 ('horrible', 'JJ')
3711 ('customer', 'NN')
3712 ('service', 'NN')
3713 ('at', 'IN')
3714 ('Ollie', 'NNP')
3715 ("'s", 'POS')
3716 ('for', 'IN')
3717 ('a', 'DT')
3718 ('while', 'NN')
3719 ('because', 'IN')
3720 ('it', 'PRP')
3721 ('was', 'VBD'

4694 ('chicken', 'JJ')
4695 ('dumplings', 'NNS')
4696 ('again', 'RB')
4697 ('meh', 'NN')
4698 ('Worst', 'NNP')
4699 ('part', 'NN')
4700 ('was', 'VBD')
4701 ('it', 'PRP')
4702 ('took', 'VBD')
4703 ('an', 'DT')
4704 ('hour', 'NN')
4705 ('and', 'CC')
4706 ('15', 'CD')
4707 ('minutes', 'NNS')
4708 ('and', 'CC')
4709 ('I', 'PRP')
4710 ('live', 'VBP')
4711 ('not', 'RB')
4712 ('even', 'RB')
4713 ('10', 'CD')
4714 ('minutes', 'NNS')
4715 ('away', 'RB')
4716 ('Called', 'VBN')
4717 ('3', 'CD')
4718 ('times', 'NNS')
4719 ('and', 'CC')
4720 ('they', 'PRP')
4721 ('continually', 'RB')
4722 ('said', 'VBD')
4723 ('he', 'PRP')
4724 ('left', 'VBD')
4725 ('10', 'CD')
4726 ('minutes', 'NNS')
4727 ('ago', 'RB')
4728 ('Unfortunately', 'RB')
4729 ('food', 'NN')
4730 ('was', 'VBD')
4731 ('cold', 'VBN')
4732 ('by', 'IN')
4733 ('the', 'DT')
4734 ('time', 'NN')
4735 ('we', 'PRP')
4736 ('got', 'VBD')
4737 ('it', 'PRP')
4738 ('I', 'PRP')
4739 ('would', 'MD')
4740 ('recommend', 'VB')
4741 ('ordering', 'VBG')
4742 (

5796 ('foodies', 'NNS')
5797 ('know', 'VBP')
5798 ('that', 'DT')
5799 ('I', 'PRP')
5800 ('had', 'VBD')
5801 ('the3', 'VBN')
5802 ('meatballs', 'NNS')
5803 ('with', 'IN')
5804 ('sauce', 'NN')
5805 ('It', 'PRP')
5806 ('was', 'VBD')
5807 ('delicious', 'JJ')
5808 ('I', 'PRP')
5809 ('was', 'VBD')
5810 ('there', 'RB')
5811 ('early', 'RB')
5812 ('on', 'IN')
5813 ('a', 'DT')
5814 ('Saturday', 'NNP')
5815 ('one', 'CD')
5816 ('of', 'IN')
5817 ('the', 'DT')
5818 ('first', 'JJ')
5819 ('Service', 'NNP')
5820 ('was', 'VBD')
5821 ('good', 'JJ')
5822 ('I', 'PRP')
5823 ('sat', 'VBD')
5824 ('outside', 'RB')
5825 ('If', 'IN')
5826 ('you', 'PRP')
5827 ('want', 'VBP')
5828 ('great', 'JJ')
5829 ('Italian', 'JJ')
5830 ('food', 'NN')
5831 ('and', 'CC')
5832 ('an', 'DT')
5833 ('overall', 'JJ')
5834 ('good', 'JJ')
5835 ('experience', 'NN')
5836 ('this', 'DT')
5837 ('place', 'NN')
5838 ('is', 'VBZ')
5839 ('it', 'PRP')
5840 ('They', 'PRP')
5841 ('probably', 'RB')
5842 ('have', 'VBP')
5843 ('the', 'DT')
5844 ('bes

6858 ('over', 'RP')
6859 ('the', 'DT')
6860 ('restaurant', 'NN')
6861 ('It', 'PRP')
6862 ('does', 'VBZ')
6863 ('get', 'VB')
6864 ('loud', 'JJ')
6865 ('but', 'CC')
6866 ('not', 'RB')
6867 ('too', 'RB')
6868 ('much', 'RB')
6869 ('so', 'IN')
6870 ('that', 'IN')
6871 ('you', 'PRP')
6872 ('ca', 'MD')
6873 ("n't", 'RB')
6874 ('hear', 'VB')
6875 ('each', 'DT')
6876 ('other', 'JJ')
6877 ('and', 'CC')
6878 ('have', 'VBP')
6879 ('a', 'DT')
6880 ('good', 'JJ')
6881 ('conversation', 'NN')
6882 ('In', 'IN')
6883 ('terms', 'NNS')
6884 ('of', 'IN')
6885 ('food', 'NN')
6886 ('EAT', 'VB')
6887 ('THE', 'DT')
6888 ('BREAD', 'NNP')
6889 ('Do', 'VBP')
6890 ("n't", 'RB')
6891 ('give', 'VB')
6892 ('me', 'PRP')
6893 ('that', 'IN')
6894 ('crap', 'NN')
6895 ('that', 'IN')
6896 ('you', 'PRP')
6897 ("'re", 'VBP')
6898 ('not', 'RB')
6899 ('doing', 'VBG')
6900 ('carbs', 'NNS')
6901 ('you', 'PRP')
6902 ("'re", 'VBP')
6903 ('at', 'IN')
6904 ('an', 'DT')
6905 ('Italian', 'JJ')
6906 ('restaurant', 'NN')
6907 ('Whatever

In [62]:
# Start with one 'I' label per entry of pos_tags.
labels = ['I'] * len(pos_tags)

In [None]:
entities = [114, 115, 144, 145, 149, 150, 158, 159, 161