In [53]:
import pandas as pd

ngrams = pd.read_csv('keywords.csv')

In [54]:
ngrams.head()

Unnamed: 0,0,1
0,great food great,4.036238e-10
1,good food good,4.858313e-10
2,food great service,6.220824e-10
3,good food great,6.725268e-10
4,food great food,6.960949e-10


In [55]:
ngrams.columns = ['n-gram', 'relevance']

In [56]:
ngrams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   n-gram     10000 non-null  object 
 1   relevance  10000 non-null  float64
dtypes: float64(1), object(1)
memory usage: 156.4+ KB


In [57]:
stopwords = ['good', 'great', 'amazing', 'love', 'nice', 'excellent', 'awesome','restaurant', 'google']

In [58]:
from collections import OrderedDict
def RemoveStop(row):
  tokens = row.split()
  tokens = list(OrderedDict.fromkeys(tokens))
  return ' '.join([t for t in tokens if t not in stopwords])


In [59]:
ngrams['n-gram'] = ngrams['n-gram'].apply(RemoveStop)

In [60]:
ngrams.head(10)

Unnamed: 0,n-gram,relevance
0,food,4.036238e-10
1,food,4.858313e-10
2,food service,6.220824e-10
3,food,6.725268e-10
4,food,6.960949e-10
5,food,8.671508e-10
6,service food,8.909668e-10
7,service food,9.574637e-10
8,food service,1.007647e-09
9,food,1.063216e-09


In [61]:
# Go thru each row of ngrams. If we haven't seen the string before, append to the list
unique_ngrams = {}

def sameTokens(s1, s2):

  return (sorted(s1.split()) == sorted(s2.split()))

print(sameTokens("food service", "service food"))

def filterUnique(n, r):
  # If n-gram with same tokens already exists, exit
  for k in unique_ngrams.keys():
    if sameTokens(n, k):
      return
  # Otherwise, add n-gram to list of unique n-grams
  unique_ngrams[n] = r


ngrams.apply(lambda x : filterUnique(x['n-gram'], x['relevance']), axis=1)
# remove random empty n-gram
unique_ngrams.pop('')

True


5.4801104909919256e-08

**NOTE: This is the list of most relevant n-grams with all the fluff removed. Still TBD how we will group these n-grams together. Also how we will detect topics in sentences from reviews...**

In [62]:
for k in list(unique_ngrams.keys()):
  print(k)

food
food service
service
place food
customer service
place
food friendly
staff food
service place
food prices
delicious food
food fast service
food friendly service
food fast
back food
time food
service delicious food
food friendly staff
friendly service
food customer
staff
customer service food
fast food place
food atmosphere
friendly staff
pretty food
fast service
people food
mexican food
food original
fresh food
pizza
prices
quality food
atmosphere
eat food
service delicious
food clean
staff service
food drinks
people
fast friendly service
service friendly staff
place delicious food
time service
experience food
order food
staff place
time
food quick service
service prices
super food
burgers
delicious place
food beer
menu food
drinks
back service
food quick
service original
tacos food
sacramento food
burgers food
food tastes
back place
location food
food foods
service pizza
wait food
burger food
food wonderful
friendly place
atmosphere service
food portions
recommend food
service fr

# Testing Gensim for identifying topics in sentences

In [63]:
reviews = pd.read_json('data/ys-reviews-with-categories.json')

restuarants = reviews.loc[reviews.category == "restaurant"]

In [65]:
restuarants['text'].head(10)

175    Decent Chinese Food.\n\nThe Hunan items are my...
176    We've been coming here for well over 15 years....
177    Great food, decent prices.  You get A lot of f...
178    Another great meal. Great service dine in or d...
179    Great authentic sezchuan style food. Family st...
180    Thanked me for coming in, very friendly, ready...
181         Great food and service at a reasonable price
182        Friendly service and tasty, inexpensive food.
183    Great prices for quality food. Service is A+ too.
184    Great customer service! Food is not that great...
Name: text, dtype: object

In [68]:
import nltk
nltk.sent_tokenize(data)

['It is always enjoyable at Zocalo.',
 'I usually have the burrito but decided to have the camarone tacos, good choice.',
 'The table also had the tacos Americana, very good as well.',
 'Considering COVID restrictions service was good and we all felt safe.',
 'A good time for sure.']

In [69]:
import gensim.downloader as api
# Credit: https://nlp.stanford.edu/projects/glove/
key_vec = api.load('glove-wiki-gigaword-100')

In [70]:
# Classify if a phrase is similar to the words in a list
# comp is a list of words related to a category, e.g. restaurant-related
def is_it_similar(phrase, comp ,threshold):
  words = phrase.split(' ')
  for w in words:
    for c in comp:
      try:
        sim_score = key_vec.similarity(w, c)
      except KeyError:
        sim_score = 0
      if sim_score > threshold:
        return True
  return False

In [76]:
sentences = nltk.sent_tokenize(data)
for s in sentences:
  print(is_it_similar(s, ['food'], threshold=0.5))

False
True
True
True
True


In [83]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

stop_words = stopwords.words("english")
    
def removed_stopwords(text: str):
    # make lower case and remove all punctuations
    text = text.lower().translate(str.maketrans('', '', punctuation))
    tokens = word_tokenize(text)
    # filter stopwords    
    return [word for word in tokens if word not in stop_words]

corpus = removed_stopwords("It is always enjoyable at Zocalo.  I usually have the burrito but decided to have the camarone tacos, good choice.  The table also had the tacos Americana, very good as well.  Considering COVID restrictions service was good and we all felt safe.  A good time for sure.")

Observation: GloVE does not seem to be associate food names with the word 'food' very well. Instead, it seems to return words often used with food. When someone mentions a specific food, they are less likely to explicitly say "food"

In [88]:
for w in corpus:
  try:
    sim_score = key_vec.similarity(w, 'food')
  except KeyError:
    sim_score = 0
  print(w, ' ', sim_score)
print(key_vec.similarity('chicken', 'food'))

always   0.40951824
enjoyable   0.17299005
zocalo   -0.008293062
usually   0.47720778
burrito   0.12126965
decided   0.3734138
camarone   0
tacos   0.22753528
good   0.502181
choice   0.47371292
table   0.45104885
also   0.47900337
tacos   0.22753528
americana   0.055896163
good   0.502181
well   0.5782533
considering   0.33139908
covid   0
restrictions   0.41900745
service   0.44641882
good   0.502181
felt   0.3108178
safe   0.5820462
good   0.502181
time   0.45150104
sure   0.4863005
0.5649143


In [124]:

key_vec.most_similar(positive=['staff', 'service','waitress'], negative=['officer'], topn=10)

[('job', 0.6257901191711426),
 ('working', 0.606554388999939),
 ('employees', 0.5995060801506042),
 ('worker', 0.5991743206977844),
 ('dining', 0.5929757952690125),
 ('services', 0.5781890749931335),
 ('breakfast', 0.5769687294960022),
 ('guest', 0.5760219097137451),
 ('guests', 0.575421154499054),
 ('hostess', 0.57537841796875)]

# Testing Doc2Vec embeddings

In [125]:
# Credit: https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Sample corpus
documents = ["The food is ok at this spot however the customer service could use some improvement. \
The employees are to busy chatting with each other while customers are waiting for their \
food that is just sitting there ready to go. Hopefully the management sees this and teaches \
their employees to do better.",
"Thanked me for coming in, very friendly, ready with the food in 15 min. \
for takeout and it was delicious! Great food and friendly service, I'd recommend to anybody.",
"I tried this place right when they opened and everything was perfect. \
Crisp crust, fresh toppings. Lots of menu items to choose from other than pizza.\
I’ve been back a handful of times and every order has been on point. Really nice spot.",
"Pizza and wings were both delicious. Best pizza I’ve had in a while. \
We will ABSOLUTELY be returning even though it is not nearby. \
I left a Yelp review and am leaving one here too because I feel that strongly. Dang good.",
"This location has an entirely new staff committed to excellent service and product. \
Kudos to management for seeing the need for change in this location. \
I highly recommend you try this location again.",
"The food is decent, but staff are talking a lot among themselves, leading to delays.",
"I love how wonderful the staff and customer service are at this restaurant.\
I'm never disappointed when I leave!"
]

documents_df=pd.DataFrame(documents,columns=['documents'])

# remove special characters and stop words
stop_words_l=stopwords.words('english')
documents_df['documents_cleaned']=documents_df.documents.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_l) )

tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[i]) for i, doc in enumerate(documents_df.documents_cleaned)]
model_d2v = Doc2Vec(vector_size=100,alpha=0.025, min_count=1)
  
model_d2v.build_vocab(tagged_data)

for epoch in range(100):
    model_d2v.train(tagged_data,
                total_examples=model_d2v.corpus_count,
                epochs=model_d2v.epochs)
    
document_embeddings=np.zeros((documents_df.shape[0],100))

for i in range(len(document_embeddings)):
    document_embeddings[i]=model_d2v.docvecs[i]
    
    
pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)

# doc id is the index of document in corpus that we use as reference
# which documents in the corpus are most similar to the given document?
def most_similar(doc_id,similarity_matrix,matrix):
    print (f'Document: {documents_df.iloc[doc_id]["documents"]}')
    print ('\n')
    print ('Similar Documents:')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix:
        if ix==doc_id:
            continue
        print('\n')
        print (f'Document: {documents_df.iloc[ix]["documents"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oliviashen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oliviashen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  document_embeddings[i]=model_d2v.docvecs[i]


In [105]:
most_similar(5,pairwise_similarities,'Cosine Similarity')
most_similar(5,pairwise_differences,'Euclidean Distance')

Document: The food is decent, but staff are talking a lot among themselves, leading to delays.


Similar Documents:


Document: I love how wonderful the staff and customer service are at this restaurant.I'm never disappointed when I leave!
Cosine Similarity : 0.43725193967340836


Document: This location has an entirely new staff committed to excellent service and product. Kudos to management for seeing the need for change in this location. I highly recommend you try this location again.
Cosine Similarity : 0.41302499293439804


Document: Thanked me for coming in, very friendly, ready with the food in 15 min. for takeout and it was delicious! Great food and friendly service, I'd recommend to anybody.
Cosine Similarity : 0.4038652428632752


Document: Pizza and wings were both delicious. Best pizza I’ve had in a while. We will ABSOLUTELY be returning even though it is not nearby. I left a Yelp review and am leaving one here too because I feel that strongly. Dang good.
Cosine Similarity 

# Testing Sense2Vec Embeddings

In [126]:
%pip install sense2vec

Collecting sense2vec
  Downloading sense2vec-2.0.2-py2.py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 kB[0m [31m606.6 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting spacy<4.0.0,>=3.0.0 (from sense2vec)
  Obtaining dependency information for spacy<4.0.0,>=3.0.0 from https://files.pythonhosted.org/packages/ca/f3/609bb7512cad1f02af13daa23aa433b931da34c502211f29fd47dceff624/spacy-3.7.2-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading spacy-3.7.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting wasabi<1.2.0,>=0.8.1 (from sense2vec)
  Obtaining dependency information for wasabi<1.2.0,>=0.8.1 from https://files.pythonhosted.org/packages/8f/69/26cbf0bad11703241cb84d5324d868097f7a8faf2f1888354dac8883f3fc/wasabi-1.1.2-py3-none-any.whl.metadata
  Downloading wasabi-1.1.2-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.0 (from sense2vec)
  Obtaining dependency information for srsly<3.0.0,>=2.4

**The Sense2Vec pre-trained Reddit model is much better with food-related vocabulary**

Go to this link to download a zip archive of the Reddit Sense2Vec model: https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz

In [23]:
from sense2vec import Sense2Vec
s2v = Sense2Vec().from_disk("s2v_old")
most_similar = s2v.most_similar("food|NOUN", n=10)

In [24]:
s2v.most_similar("food|NOUN", n=100)
s2v.most_similar("restaurant|NOUN", n=50)
s2v.most_similar("waiter|NOUN", n=50)
# if more than one key is provided, the average of the vectors is used
s2v.most_similar(["mexican_food|NOUN", "waiter|NOUN"], n=50)
s2v.most_similar("Chipotle|ORG", n=10)
s2v.most_similar(["clean|ADJ", "waiter|NOUN"], n=50)

[('waitress|NOUN', 0.8123),
 ('wait_staff|NOUN', 0.7909),
 ('waiter/waitress|NOUN', 0.7853),
 ('busboy|NOUN', 0.7784),
 ('bus_boy|NOUN', 0.7783),
 ('restaurant|NOUN', 0.7765),
 ('kitchen_staff|NOUN', 0.7693),
 ('hostess|NOUN', 0.7682),
 ('waitstaff|NOUN', 0.7624),
 ('bartender|NOUN', 0.762),
 ('the_end_of_the_meal|DATE', 0.7605),
 ('dishes|NOUN', 0.7605),
 ('kitchen|NOUN', 0.7564),
 ('cook|NOUN', 0.7558),
 ('head_chef|NOUN', 0.7529),
 ('food_order|NOUN', 0.7503),
 ('restaraunt|NOUN', 0.7503),
 ('dishwasher|NOUN', 0.7483),
 ('waitress/waiter|NOUN', 0.7477),
 ('dish_pit|NOUN', 0.7457),
 ('dishie|NOUN', 0.7414),
 ('big_tip|NOUN', 0.7377),
 ('waitresses|NOUN', 0.7374),
 ('barista|NOUN', 0.7365),
 ('fast_food_restaurant|NOUN', 0.7347),
 ('dish_washer|NOUN', 0.7335),
 ('resteraunt|NOUN', 0.7293),
 ('restaurant_manager|NOUN', 0.7286),
 ('dirty_plate|NOUN', 0.7284),
 ('cashier|NOUN', 0.7275),
 ('waiting_staff|NOUN', 0.7267),
 ('line_cook|NOUN', 0.7256),
 ('entire_restaurant|NOUN', 0.7256),
 ('

In [18]:
s2v.most_similar(["clean|ADJ"], n=50)

[('clean|NOUN', 0.8294),
 ('spotless|ADJ', 0.8173),
 ('Clean|GPE', 0.8056),
 ('Clean|ADJ', 0.8026),
 ('clean|VERB', 0.7935),
 ('cleaned|VERB', 0.7819),
 ('tidy|ADJ', 0.7745),
 ('squeaky|ADJ', 0.7742),
 ('clean|ADV', 0.767),
 ('dirty|ADJ', 0.765),
 ('sanitary|ADJ', 0.7515),
 ('cleaner|ADV', 0.7466),
 ('oiled|ADJ', 0.7354),
 ('scrub|VERB', 0.7346),
 ('cleaning|VERB', 0.7341),
 ('dry|ADJ', 0.7309),
 ('grimy|ADJ', 0.7303),
 ('little_dirty|NOUN', 0.729),
 ('scrubbing|VERB', 0.7277),
 ('cleaner|NOUN', 0.7274),
 ('pristine|ADJ', 0.7273),
 ('Clean|VERB', 0.7267),
 ('spotlessly|ADV', 0.7228),
 ('cleans|VERB', 0.7201),
 ('cleaning|NOUN', 0.715),
 ('washing|NOUN', 0.7113),
 ('oiled|VERB', 0.706),
 ('messy|ADJ', 0.7045),
 ('tidy|ADV', 0.7043),
 ('dry|ADV', 0.7022),
 ('CLEAN|ORG', 0.7016),
 ('wash|VERB', 0.7007),
 ('good_scrubbing|NOUN', 0.7),
 ('hygenic|ADJ', 0.6999),
 ('manky|ADJ', 0.6996),
 ('disinfected|VERB', 0.6983),
 ('Cleans|VERB', 0.6965),
 ('slick|ADJ', 0.6955),
 ('hygienic|ADJ', 0.6953),