In [51]:
import pandas as pd

ngrams = pd.read_csv('keywords.csv')

In [54]:
ngrams.head()

Unnamed: 0,0,1
0,great food great,4.036238e-10
1,good food good,4.858313e-10
2,food great service,6.220824e-10
3,good food great,6.725268e-10
4,food great food,6.960949e-10


In [3]:
ngrams.columns = ['n-gram', 'relevance']

In [56]:
ngrams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   n-gram     10000 non-null  object 
 1   relevance  10000 non-null  float64
dtypes: float64(1), object(1)
memory usage: 156.4+ KB


In [57]:
stopwords = ['good', 'great', 'amazing', 'love', 'nice', 'excellent', 'awesome','restaurant', 'google']

In [58]:
from collections import OrderedDict
def RemoveStop(row):
  tokens = row.split()
  tokens = list(OrderedDict.fromkeys(tokens))
  return ' '.join([t for t in tokens if t not in stopwords])


In [59]:
ngrams['n-gram'] = ngrams['n-gram'].apply(RemoveStop)

In [60]:
ngrams.head(10)

Unnamed: 0,n-gram,relevance
0,food,4.036238e-10
1,food,4.858313e-10
2,food service,6.220824e-10
3,food,6.725268e-10
4,food,6.960949e-10
5,food,8.671508e-10
6,service food,8.909668e-10
7,service food,9.574637e-10
8,food service,1.007647e-09
9,food,1.063216e-09


In [61]:
# Go thru each row of ngrams. If we haven't seen the string before, append to the list
unique_ngrams = {}

def sameTokens(s1, s2):

  return (sorted(s1.split()) == sorted(s2.split()))

print(sameTokens("food service", "service food"))

def filterUnique(n, r):
  # If n-gram with same tokens already exists, exit
  for k in unique_ngrams.keys():
    if sameTokens(n, k):
      return
  # Otherwise, add n-gram to list of unique n-grams
  unique_ngrams[n] = r


ngrams.apply(lambda x : filterUnique(x['n-gram'], x['relevance']), axis=1)
# remove random empty n-gram
unique_ngrams.pop('')

True


5.4801104909919256e-08

**NOTE: This is the list of most relevant n-grams with all the fluff removed. Still TBD how we will group these n-grams together. Also how we will detect topics in sentences from reviews...**

In [62]:
for k in list(unique_ngrams.keys()):
  print(k)

food
food service
service
place food
customer service
place
food friendly
staff food
service place
food prices
delicious food
food fast service
food friendly service
food fast
back food
time food
service delicious food
food friendly staff
friendly service
food customer
staff
customer service food
fast food place
food atmosphere
friendly staff
pretty food
fast service
people food
mexican food
food original
fresh food
pizza
prices
quality food
atmosphere
eat food
service delicious
food clean
staff service
food drinks
people
fast friendly service
service friendly staff
place delicious food
time service
experience food
order food
staff place
time
food quick service
service prices
super food
burgers
delicious place
food beer
menu food
drinks
back service
food quick
service original
tacos food
sacramento food
burgers food
food tastes
back place
location food
food foods
service pizza
wait food
burger food
food wonderful
friendly place
atmosphere service
food portions
recommend food
service fr

# Testing Gensim for identifying topics in sentences

In [52]:
reviews = pd.read_json('data/ys-reviews-with-categories.json')

restaurants = reviews.loc[reviews.category == "restaurant"]

In [53]:
restaurants['text'].head(10)

175    Decent Chinese Food.\n\nThe Hunan items are my...
176    We've been coming here for well over 15 years....
177    Great food, decent prices.  You get A lot of f...
178    Another great meal. Great service dine in or d...
179    Great authentic sezchuan style food. Family st...
180    Thanked me for coming in, very friendly, ready...
181         Great food and service at a reasonable price
182        Friendly service and tasty, inexpensive food.
183    Great prices for quality food. Service is A+ too.
184    Great customer service! Food is not that great...
Name: text, dtype: object

In [68]:
import nltk
nltk.sent_tokenize(data)

['It is always enjoyable at Zocalo.',
 'I usually have the burrito but decided to have the camarone tacos, good choice.',
 'The table also had the tacos Americana, very good as well.',
 'Considering COVID restrictions service was good and we all felt safe.',
 'A good time for sure.']

In [69]:
import gensim.downloader as api
# Credit: https://nlp.stanford.edu/projects/glove/
key_vec = api.load('glove-wiki-gigaword-100')

In [70]:
# Classify if a phrase is similar to the words in a list
# comp is a list of words related to a category, e.g. restaurant-related
def is_it_similar(phrase, comp ,threshold):
  words = phrase.split(' ')
  for w in words:
    for c in comp:
      try:
        sim_score = key_vec.similarity(w, c)
      except KeyError:
        sim_score = 0
      if sim_score > threshold:
        return True
  return False

In [76]:
sentences = nltk.sent_tokenize(data)
for s in sentences:
  print(is_it_similar(s, ['food'], threshold=0.5))

False
True
True
True
True


In [83]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

stop_words = stopwords.words("english")
    
def removed_stopwords(text: str):
    # make lower case and remove all punctuations
    text = text.lower().translate(str.maketrans('', '', punctuation))
    tokens = word_tokenize(text)
    # filter stopwords    
    return [word for word in tokens if word not in stop_words]

corpus = removed_stopwords("It is always enjoyable at Zocalo.  I usually have the burrito but decided to have the camarone tacos, good choice.  The table also had the tacos Americana, very good as well.  Considering COVID restrictions service was good and we all felt safe.  A good time for sure.")

Observation: GloVE does not seem to be associate food names with the word 'food' very well. Instead, it seems to return words often used with food. When someone mentions a specific food, they are less likely to explicitly say "food"

In [88]:
for w in corpus:
  try:
    sim_score = key_vec.similarity(w, 'food')
  except KeyError:
    sim_score = 0
  print(w, ' ', sim_score)
print(key_vec.similarity('chicken', 'food'))

always   0.40951824
enjoyable   0.17299005
zocalo   -0.008293062
usually   0.47720778
burrito   0.12126965
decided   0.3734138
camarone   0
tacos   0.22753528
good   0.502181
choice   0.47371292
table   0.45104885
also   0.47900337
tacos   0.22753528
americana   0.055896163
good   0.502181
well   0.5782533
considering   0.33139908
covid   0
restrictions   0.41900745
service   0.44641882
good   0.502181
felt   0.3108178
safe   0.5820462
good   0.502181
time   0.45150104
sure   0.4863005
0.5649143


In [124]:

key_vec.most_similar(positive=['staff', 'service','waitress'], negative=['officer'], topn=10)

[('job', 0.6257901191711426),
 ('working', 0.606554388999939),
 ('employees', 0.5995060801506042),
 ('worker', 0.5991743206977844),
 ('dining', 0.5929757952690125),
 ('services', 0.5781890749931335),
 ('breakfast', 0.5769687294960022),
 ('guest', 0.5760219097137451),
 ('guests', 0.575421154499054),
 ('hostess', 0.57537841796875)]

# Testing Doc2Vec embeddings

In [125]:
# Credit: https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Sample corpus
documents = ["The food is ok at this spot however the customer service could use some improvement. \
The employees are to busy chatting with each other while customers are waiting for their \
food that is just sitting there ready to go. Hopefully the management sees this and teaches \
their employees to do better.",
"Thanked me for coming in, very friendly, ready with the food in 15 min. \
for takeout and it was delicious! Great food and friendly service, I'd recommend to anybody.",
"I tried this place right when they opened and everything was perfect. \
Crisp crust, fresh toppings. Lots of menu items to choose from other than pizza.\
I’ve been back a handful of times and every order has been on point. Really nice spot.",
"Pizza and wings were both delicious. Best pizza I’ve had in a while. \
We will ABSOLUTELY be returning even though it is not nearby. \
I left a Yelp review and am leaving one here too because I feel that strongly. Dang good.",
"This location has an entirely new staff committed to excellent service and product. \
Kudos to management for seeing the need for change in this location. \
I highly recommend you try this location again.",
"The food is decent, but staff are talking a lot among themselves, leading to delays.",
"I love how wonderful the staff and customer service are at this restaurant.\
I'm never disappointed when I leave!"
]

documents_df=pd.DataFrame(documents,columns=['documents'])

# remove special characters and stop words
stop_words_l=stopwords.words('english')
documents_df['documents_cleaned']=documents_df.documents.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_l) )

tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[i]) for i, doc in enumerate(documents_df.documents_cleaned)]
model_d2v = Doc2Vec(vector_size=100,alpha=0.025, min_count=1)
  
model_d2v.build_vocab(tagged_data)

for epoch in range(100):
    model_d2v.train(tagged_data,
                total_examples=model_d2v.corpus_count,
                epochs=model_d2v.epochs)
    
document_embeddings=np.zeros((documents_df.shape[0],100))

for i in range(len(document_embeddings)):
    document_embeddings[i]=model_d2v.docvecs[i]
    
    
pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)

# doc id is the index of document in corpus that we use as reference
# which documents in the corpus are most similar to the given document?
def most_similar(doc_id,similarity_matrix,matrix):
    print (f'Document: {documents_df.iloc[doc_id]["documents"]}')
    print ('\n')
    print ('Similar Documents:')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix:
        if ix==doc_id:
            continue
        print('\n')
        print (f'Document: {documents_df.iloc[ix]["documents"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oliviashen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oliviashen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  document_embeddings[i]=model_d2v.docvecs[i]


In [105]:
most_similar(5,pairwise_similarities,'Cosine Similarity')
most_similar(5,pairwise_differences,'Euclidean Distance')

Document: The food is decent, but staff are talking a lot among themselves, leading to delays.


Similar Documents:


Document: I love how wonderful the staff and customer service are at this restaurant.I'm never disappointed when I leave!
Cosine Similarity : 0.43725193967340836


Document: This location has an entirely new staff committed to excellent service and product. Kudos to management for seeing the need for change in this location. I highly recommend you try this location again.
Cosine Similarity : 0.41302499293439804


Document: Thanked me for coming in, very friendly, ready with the food in 15 min. for takeout and it was delicious! Great food and friendly service, I'd recommend to anybody.
Cosine Similarity : 0.4038652428632752


Document: Pizza and wings were both delicious. Best pizza I’ve had in a while. We will ABSOLUTELY be returning even though it is not nearby. I left a Yelp review and am leaving one here too because I feel that strongly. Dang good.
Cosine Similarity 

# Testing Sense2Vec Embeddings

In [126]:
%pip install sense2vec

Collecting sense2vec
  Downloading sense2vec-2.0.2-py2.py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 kB[0m [31m606.6 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting spacy<4.0.0,>=3.0.0 (from sense2vec)
  Obtaining dependency information for spacy<4.0.0,>=3.0.0 from https://files.pythonhosted.org/packages/ca/f3/609bb7512cad1f02af13daa23aa433b931da34c502211f29fd47dceff624/spacy-3.7.2-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading spacy-3.7.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting wasabi<1.2.0,>=0.8.1 (from sense2vec)
  Obtaining dependency information for wasabi<1.2.0,>=0.8.1 from https://files.pythonhosted.org/packages/8f/69/26cbf0bad11703241cb84d5324d868097f7a8faf2f1888354dac8883f3fc/wasabi-1.1.2-py3-none-any.whl.metadata
  Downloading wasabi-1.1.2-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.0 (from sense2vec)
  Obtaining dependency information for srsly<3.0.0,>=2.4

**The Sense2Vec pre-trained Reddit model is much better with food-related vocabulary**

Go to this link to download a zip archive of the Reddit Sense2Vec model: https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz

In [5]:
from sense2vec import Sense2Vec
s2v = Sense2Vec().from_disk("s2v_old")
most_similar = s2v.most_similar("food|NOUN", n=10)

In [6]:
s2v.most_similar("food|NOUN", n=100)
s2v.most_similar("restaurant|NOUN", n=50)
s2v.most_similar("waiter|NOUN", n=50)
# if more than one key is provided, the average of the vectors is used
s2v.most_similar(["mexican_food|NOUN", "waiter|NOUN"], n=50)
s2v.most_similar("Chipotle|ORG", n=10)
s2v.most_similar(["clean|ADJ", "waiter|NOUN", "staff|NOUN"], n=50)

[('kitchen_staff|NOUN', 0.8127),
 ('wait_staff|NOUN', 0.8055),
 ('busboy|NOUN', 0.7949),
 ('waitress|NOUN', 0.7932),
 ('bus_boy|NOUN', 0.7879),
 ('restaurant|NOUN', 0.7863),
 ('waitstaff|NOUN', 0.7852),
 ('head_chef|NOUN', 0.7841),
 ('hostess|NOUN', 0.7827),
 ('restaurant_manager|NOUN', 0.7825),
 ('FOH_manager|NOUN', 0.7781),
 ('other_staff|NOUN', 0.7697),
 ('waiter/waitress|NOUN', 0.7692),
 ('manager|NOUN', 0.7631),
 ('restaraunt|NOUN', 0.7619),
 ('housekeeping_staff|NOUN', 0.7619),
 ('bartender|NOUN', 0.7608),
 ('whole_staff|NOUN', 0.7579),
 ('waiting_staff|NOUN', 0.7573),
 ('bar_staff|NOUN', 0.7545),
 ('receptionist|NOUN', 0.754),
 ('kitchen_manager|NOUN', 0.7515),
 ('waitresses|NOUN', 0.751),
 ('bartenders|NOUN', 0.7501),
 ('cook|NOUN', 0.7495),
 ('cleaning_staff|NOUN', 0.7484),
 ('barista|NOUN', 0.7472),
 ('restaurant_staff|NOUN', 0.7467),
 ('service_staff|NOUN', 0.7463),
 ('house_staff|NOUN', 0.7455),
 ('cooks|NOUN', 0.7453),
 ('hotel_staff|NOUN', 0.7443),
 ('dish_pit|NOUN', 0.74

In [18]:
s2v.most_similar(["clean|ADJ"], n=50)

[('clean|NOUN', 0.8294),
 ('spotless|ADJ', 0.8173),
 ('Clean|GPE', 0.8056),
 ('Clean|ADJ', 0.8026),
 ('clean|VERB', 0.7935),
 ('cleaned|VERB', 0.7819),
 ('tidy|ADJ', 0.7745),
 ('squeaky|ADJ', 0.7742),
 ('clean|ADV', 0.767),
 ('dirty|ADJ', 0.765),
 ('sanitary|ADJ', 0.7515),
 ('cleaner|ADV', 0.7466),
 ('oiled|ADJ', 0.7354),
 ('scrub|VERB', 0.7346),
 ('cleaning|VERB', 0.7341),
 ('dry|ADJ', 0.7309),
 ('grimy|ADJ', 0.7303),
 ('little_dirty|NOUN', 0.729),
 ('scrubbing|VERB', 0.7277),
 ('cleaner|NOUN', 0.7274),
 ('pristine|ADJ', 0.7273),
 ('Clean|VERB', 0.7267),
 ('spotlessly|ADV', 0.7228),
 ('cleans|VERB', 0.7201),
 ('cleaning|NOUN', 0.715),
 ('washing|NOUN', 0.7113),
 ('oiled|VERB', 0.706),
 ('messy|ADJ', 0.7045),
 ('tidy|ADV', 0.7043),
 ('dry|ADV', 0.7022),
 ('CLEAN|ORG', 0.7016),
 ('wash|VERB', 0.7007),
 ('good_scrubbing|NOUN', 0.7),
 ('hygenic|ADJ', 0.6999),
 ('manky|ADJ', 0.6996),
 ('disinfected|VERB', 0.6983),
 ('Cleans|VERB', 0.6965),
 ('slick|ADJ', 0.6955),
 ('hygienic|ADJ', 0.6953),

In [15]:
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
import spacy
from sense2vec import Sense2VecComponent

nlp = spacy.load('en_core_web_sm', disable=['ner'])
'''
s2v = nlp.add_pipe("sense2vec")
s2v.from_disk("s2v_old")
'''

'\ns2v = nlp.add_pipe("sense2vec")\ns2v.from_disk("s2v_old")\n'

In [46]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'sense2vec']

In [7]:
#from spacy.attrs import ORTH
# you can add special cases to the tokenizer if there are phrases that are semantically significant
#nlp.tokenizer.add_special_case("customer service", [{ORTH: "customer service"}])
'''doc = nlp("The food is ok at this spot however the customer service could use some improvement. \
The employees are to busy chatting with each other while customers are waiting for their \
food that is just sitting there ready to go. Hopefully the management sees this and teaches \
their employees to do better.")'''
doc = nlp("food")

In [8]:
from spacy import displacy
displacy.render(doc, style="dep")
# The builtin dependency model seems to think "customer service" is a propper noun

In [9]:
nouns = [token.text for token in doc if (token.pos_ == "NOUN" or token.pos_ == "PROPN")]
print(nouns)


['food']


In [4]:
vector = doc[9]._.s2v_most_similar(20)
vector

[(('costumer', 'NOUN'), 0.8794),
 (('customers', 'NOUN'), 0.8788),
 (('single customer', 'NOUN'), 0.8412),
 (('other customer', 'NOUN'), 0.8343),
 (('sales person', 'NOUN'), 0.8323),
 (('other customers', 'NOUN'), 0.8258),
 (('sales rep', 'NOUN'), 0.8191),
 (('CSR', 'ORG'), 0.819),
 (('customer service rep', 'NOUN'), 0.818),
 (('Customers', 'NOUN'), 0.8147),
 (('good customer', 'NOUN'), 0.814),
 (('many customers', 'NOUN'), 0.8118),
 (('salesperson', 'NOUN'), 0.8025),
 (('customer service', 'NOUN'), 0.8022),
 (('potential customer', 'NOUN'), 0.7982),
 (('retail employee', 'NOUN'), 0.7955),
 (('employee', 'NOUN'), 0.7898),
 (('costumers', 'NOUN'), 0.7849),
 (('good customer service', 'NOUN'), 0.7834),
 (('most customers', 'NOUN'), 0.7831)]

In [6]:
doc._.s2v_phrases

[food,
 spot,
 customer service,
 improvement,
 employees,
 customers,
 food,
 that,
 management,
 this,
 employees]

You can measure the similarity between tokens in the document

In [8]:
doc[9:11]._.s2v_similarity(doc[1])

0.35423326

Use `sents` attribute of document to get an iterator over the sentences. https://spacy.io/api/dependencyparser#assigned-attributes

In [10]:
itr = doc.sents
for i in itr:
  i._.s2v_similarity()

The food is ok at this spot however the customer service could use some improvement.
The employees are to busy chatting with each other while customers are waiting for their food that is just sitting there ready to go.
Hopefully the management sees this and teaches their employees to do better.


# Topic Detection with spaCy and sense2vec

In [1]:
import spacy
from spacy import Language
from sense2vec import Sense2Vec
s2v = Sense2Vec().from_disk("s2v_old")

In [2]:
# Create a pipe that converts lemmas to lower case:
@Language.component("lower_case_lemmas")
def lower_case_lemmas(doc) :
    for token in doc :
        token.lemma_ = token.lemma_.lower()
    return doc
# Initialize default spaCy pipeline
nlp = spacy.load('en_core_web_sm', disable=['ner'])
# lower_case_lemmas to pipeline
nlp.add_pipe(factory_name="lower_case_lemmas", after="tagger")
# Sanity check to make sure we have the right pipeline order
print(nlp.pipe_names)

['tok2vec', 'tagger', 'lower_case_lemmas', 'parser', 'attribute_ruler', 'lemmatizer']


In [29]:
# Perform spaCy pipeline tasks on a document
doc = nlp("The food is ok at this spot however the customer service could use some improvement. \
The employees are to busy chatting with each other while customers are waiting for their \
food that is just sitting there ready to go. Hopefully the management sees this and teaches \
their employees to do better. I love hotdogs and tacos.")

In [8]:
# Detect if a topic defined by topic_list is present in each sentence of a spaCy doc
def topicDetectionPrint(doc, topic_list : list[str]):
  for sentence in doc.sents:
    print("-"*50)
    for token in sentence:
      # Construct string to pass to Sense2Vec
      s = token.lemma_ + "|" + token.pos_
      # Only consider tokens that Sense2Vec model knows
      # Filter for nouns and adjectives
      if s in s2v and token.pos_ in ["NOUN", "ADJ"]:
        # calculate cosine similarity
        # Under the hood, Sense2Vec creates an average vector from each token in topic_list
        sim_score = s2v.similarity(s, topic_list)
        print(token.text, " | Similarity to topic:", sim_score)

### Test detecting Service topic

Sense2Vec is able to create an average token embedding (i.e. vector representation of a token) from a list of tokens. This is how we define the location of our topics in the embedding space. We are using an embedding space trained on Reddit comments, which performs well on food-related tokens. It also captures the informal syntax and vocab of internet comments, which are similar to restaurant reviews in Google Maps.

In [47]:
print("Topic: Service")
topicDetectionPrint(doc, ["waiter|NOUN", "staff|NOUN", "service|NOUN"])

Topic: Service
--------------------------------------------------
food  | Similarity to topic: 0.61955065
spot  | Similarity to topic: 0.44700202
customer  | Similarity to topic: 0.82073253
service  | Similarity to topic: 0.80309266
improvement  | Similarity to topic: 0.3148422
--------------------------------------------------
employees  | Similarity to topic: 0.7616242
customers  | Similarity to topic: 0.82073253
food  | Similarity to topic: 0.61955065
--------------------------------------------------
management  | Similarity to topic: 0.7012463
employees  | Similarity to topic: 0.7616242
--------------------------------------------------
hotdogs  | Similarity to topic: 0.502859
tacos  | Similarity to topic: 0.38666075


- The tokens related to service tend to have cosine similarity above 0.7

### Test detecting Food topic

In [48]:
print("Topic: Food")
topicDetectionPrint(doc, ["food|NOUN", "pizza|NOUN", "meal|NOUN", "taco|NOUN"])

Topic: Food
--------------------------------------------------
food  | Similarity to topic: 0.8265475
spot  | Similarity to topic: 0.36757043
customer  | Similarity to topic: 0.5351522
service  | Similarity to topic: 0.40503314
improvement  | Similarity to topic: 0.17086305
--------------------------------------------------
employees  | Similarity to topic: 0.3918952
customers  | Similarity to topic: 0.5351522
food  | Similarity to topic: 0.8265475
--------------------------------------------------
management  | Similarity to topic: 0.28171706
employees  | Similarity to topic: 0.3918952
--------------------------------------------------
hotdogs  | Similarity to topic: 0.8435375
tacos  | Similarity to topic: 0.8664133


# Testing on multiple reviews

In [15]:
import pandas as pd

reviews = pd.read_json('data/ys-reviews-with-categories.json')

restaurants_short = reviews.loc[reviews.category == "restaurant"].head(1000).text.to_list()

In [16]:
docs_short = list(nlp.pipe(restaurants_short))

In [33]:
for i, doc in enumerate(docs_short):
  print("Review #", i)
  topicDetectionPrint(doc, ["price|NOUN", "worth|NOUN", "value|NOUN"])
  print()

Review # 0
--------------------------------------------------
Decent  | Similarity to topic: 0.46931952
--------------------------------------------------
items  | Similarity to topic: 0.63807464
favorite  | Similarity to topic: 0.2774298

Review # 1
--------------------------------------------------
years  | Similarity to topic: 0.5123558
--------------------------------------------------
management  | Similarity to topic: 0.40426862
favorite  | Similarity to topic: 0.27789596
Chinese  | Similarity to topic: 0.30783686
food  | Similarity to topic: 0.4750182
spot  | Similarity to topic: 0.43356362
--------------------------------------------------
favorites  | Similarity to topic: 0.2774298
chicken  | Similarity to topic: 0.26935804
--------------------------------------------------
A+  | Similarity to topic: 0.28057608
customer  | Similarity to topic: 0.565086
service  | Similarity to topic: 0.53812504
great  | Similarity to topic: 0.3983354
food  | Similarity to topic: 0.4750182
grea

In [13]:
doc2 = nlp("restaurant cashier waiter service")
topicDetectionPrint(doc2, ["waiter|NOUN", "staff|NOUN", "service|NOUN", "employee|NOUN", "job|NOUN"])

--------------------------------------------------
restaurant  | Similarity to topic: 0.7706513
cashier  | Similarity to topic: 0.7453462
service  | Similarity to topic: 0.768464


### Run these:

In [1]:
import pandas as pd
import spacy
from spacy import Language
from sense2vec import Sense2Vec
s2v = Sense2Vec().from_disk("s2v_old")
# Create a pipe that converts lemmas to lower case:
@Language.component("lower_case_lemmas")
def lower_case_lemmas(doc) :
    for token in doc :
        token.lemma_ = token.lemma_.lower()
    return doc
# Initialize default spaCy pipeline
nlp = spacy.load('en_core_web_sm', disable=['ner'])
# lower_case_lemmas to pipeline
nlp.add_pipe(factory_name="lower_case_lemmas", after="tagger")
# Sanity check to make sure we have the right pipeline order
print(nlp.pipe_names)

['tok2vec', 'tagger', 'lower_case_lemmas', 'parser', 'attribute_ruler', 'lemmatizer']


**ONLY DO THIS ONCE!** Save documents (i.e. the processed review text) to file. Takes ~45 minutes

In [2]:
from spacy.tokens import DocBin

reviews = pd.read_json('data/ys-reviews-restaurants.json', orient='records')

docs = list(nlp.pipe(reviews['text'].to_list()))

doc_bin = DocBin(docs=docs, store_user_data=True, attrs=["ORTH", "TAG", "DEP", "LEMMA", "MORPH", "POS"])
doc_bin.to_disk("data/restaurant-reviews.spacy")

If you already generated `restaurant-reviws.spacy`, simply read the file to generate list of docs in memory. Takes ~ 4 minutes. NOTE: The docbin doesn't save sentence boundaries :(

In [9]:
from spacy.tokens import DocBin
doc_bin = DocBin().from_disk("data/restaurant-reviews.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

In [4]:
# Detect if a topic defined by topic_list is present in a sentence (span from spaCy doc)
# If a doc has n sentences, return a list of n booleans, where each index represent a topic present or not
# pos is a list of parts of speech to consider from doc
# thresh is a threshold for cosine similarity. If similarity > threshold, topic is present
def topicDetection(sentence, topic_list : list[str], pos : list[str], thresh) -> list[int]:
    indices = []
    for i, token in enumerate(sentence):
      # Construct string to pass to Sense2Vec
      s = token.lemma_ + "|" + token.pos_
      # Only consider tokens that Sense2Vec model knows
      # Filter for nouns and adjectives
      if (s in s2v and token.pos_ in pos) and (s2v.similarity(s, topic_list) > thresh):
        indices.append(i)
    return indices
    

In [4]:
# Operates like TopicDetection, except looks or matches to each string in topics_list seperately
# Instead of averaging their vector representations
def seperateTopicsDetection(sentence, topics_list : list[str], thresh, exclude_pos = []) -> list[int]:
    indices = []
    for i, token in enumerate(sentence):
      # Skip token if explicitly told to ignore part of speech
      if token.pos_ in exclude_pos:
        continue
      # Construct string to pass to Sense2Vec
      s = token.lemma_ + "|" + token.pos_
      # Only consider tokens that Sense2Vec model knows
      # Filter for nouns and adjectives
      if s in s2v:
        # Add to indices list if token matches at least one topic from topic_list
        for topic in topics_list:
          if s2v.similarity(s, topics_list) > thresh:
            indices.append(i)
            break
    return indices

## Detect Food topic

In [6]:
food = ["food|NOUN", "pizza|NOUN", "meal|NOUN", "taco|NOUN", "chinese|ADJ", "mexican|ADJ", "sushi|NOUN", "bone|NOUN", "drink|NOUN", "pho|NOUN", "curry|NOUN", "coffee|NOUN", "teriyaki|NOUN"]
food_hits = []
for i, doc in enumerate(docs):
  for j, sentence in enumerate(doc.sents):
    for k in topicDetection(sentence, food, ["NOUN", "ADJ"], 0.6):
      # for each token where the food topic is detected
      # record lemma, doc index, sentence index, and token index
      food_hits.append([sentence[k].lemma_ , i, j, k])

In [7]:
food_hits = pd.DataFrame(data=food_hits, columns=['lemma', 'doc_index', 'sentence_index', 'token_index'])
food_hits.to_json('data/topics/food-hits-restaurant-reviews.json', orient='records')

In [10]:
# How many reviews mention food
food_hits.doc_index.nunique()

471795

In [11]:
# Total number of reviews
reviews.shape

(678759, 3)

There are 471,795 reviews out of 678,759 that mention food. That's ~2/3 of reviews.

## Detect Service Topic

In [5]:
service = ["waiter|NOUN", "staff|NOUN", "service|NOUN", "employee|NOUN"]
service_hits = []
for i, doc in enumerate(docs):
  for j, sentence in enumerate(doc.sents):
    for k in topicDetection(sentence, service, ["NOUN", "ADJ"], 0.7):
      # for each token where the food topic is detected
      # record lemma, doc index, sentence index, and token index
      service_hits.append([sentence[k].lemma_ , i, j, k])

Bug: the word "restaurant" is very similar to the service topic, but shouldn't be included. Filter out instances of restaurant.

In [21]:
remove = ["restaurant", "restraunt", "restaraunt"]
service_hits = pd.DataFrame(data=service_hits, columns=['lemma', 'doc_index', 'sentence_index', 'token_index'])
# Remove "restaurant" or any typos from service hits
service_hits = service_hits[~((service_hits['lemma'] == "restaurant") | (service_hits['lemma'] == "restraunt") | (service_hits['lemma'] == "restaraunt"))]
service_hits.to_json('data/topics/service-hits-restaurant-reviews.json', orient='records')

## Detect Location Topic

In [5]:
location = ["crowded|ADJ", "atmosphere|NOUN", "quiet|ADJ", "interior|NOUN", "music|NOUN", "environment|NOUN", "space|NOUN", "vibe|NOUN", "location|NOUN"]
location_hits = []
for i, doc in enumerate(docs):
  for j, sentence in enumerate(doc.sents):
    for k in seperateTopicsDetection(sentence, location, 0.67):
      # for each token where the food topic is detected
      # record lemma, doc index, sentence index, and token index
      location_hits.append([sentence[k].lemma_ , i, j, k])

In [6]:
location_hits = pd.DataFrame(data=location_hits, columns=['lemma', 'doc_index', 'sentence_index', 'token_index'])
location_hits = location_hits[~((location_hits['lemma'] == "especially"))]
location_hits.to_json('data/topics/location-hits-restaurant-reviews.json', orient='records')

## Detect Clean Topic

In [7]:
clean = ["clean|ADJ", "dirty|ADJ", "fly|NOUN", "cockroach|NOUN", "filthy|ADJ", "spotless|ADJ"]
clean_hits = []
for i, doc in enumerate(docs):
  for j, sentence in enumerate(doc.sents):
    for k in seperateTopicsDetection(sentence, clean, 0.7):
      # for each token where the food topic is detected
      # record lemma, doc index, sentence index, and token index
      clean_hits.append([sentence[k].lemma_ , i, j, k])

In [8]:
clean_hits = pd.DataFrame(data=clean_hits, columns=['lemma', 'doc_index', 'sentence_index', 'token_index'])
# clean_hits = clean_hits[~((location_hits['lemma'] == "especially"))]
clean_hits.to_json('data/topics/clean-hits-restaurant-reviews.json', orient='records')

## Detect Price Topic

In [9]:
price = ["cheap|ADJ", "expensive|ADJ", "price|NOUN", "worth|NOUN", "payment|NOUN", "tip|NOUN"]
price_hits = []
for i, doc in enumerate(docs):
  for j, sentence in enumerate(doc.sents):
    # exclude verbs like "pay" or "buy"
    for k in seperateTopicsDetection(sentence, price, 0.7, ["VERB"]):
      # for each token where the food topic is detected
      # record lemma, doc index, sentence index, and token index
      price_hits.append([sentence[k].lemma_ , i, j, k])

In [11]:
price_hits = pd.DataFrame(data=price_hits, columns=['lemma', 'doc_index', 'sentence_index', 'token_index'])
# clean_hits = clean_hits[~((location_hits['lemma'] == "especially"))]
price_hits.to_json('data/topics/price-hits-restaurant-reviews.json', orient='records')

## Misc Cells

In [61]:
reviews = pd.read_json('data/ys-reviews-with-categories.json')

restaurants = reviews.loc[reviews.category == "restaurant"]
restaurants.drop(columns=['category'], inplace=True)
restaurants.to_json('data/ys-reviews-restaurants.json', orient='records')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  restaurants.drop(columns=['category'], inplace=True)


In [48]:
food = ["food|NOUN", "pizza|NOUN", "meal|NOUN", "taco|NOUN", "chinese|ADJ", "mexican|ADJ", "sushi|NOUN", "bone|NOUN", "drink|NOUN", "pho|NOUN", "curry|NOUN", "coffee|NOUN", "teriyaki|NOUN"]
doc_list = []
for i, doc in enumerate(docs):
  sentence_list = []
  for sentence in doc.sents:
    sentence_list.append(topicDetection(sentence, food, ["NOUN", "ADJ"], 0.6))
  doc_list.append(sentence_list)

In [49]:
for doc in doc_list:
  print(doc)

[[], []]
[[], [14], [8], [5]]
[[1], [5], [1, 3]]
[[2], [], []]
[[4], [], [3]]
[[12, 18, 22], [1]]
[[1]]
[[3, 6]]
[[4], []]
[[], [0], [], [5]]
[[], [], []]
[[], [2, 4]]
[[], [1]]
[[1], [15], []]
[[1, 4], []]
[[]]
[[], [9, 10], [], [], []]
[[1, 2, 6, 9, 12]]
[[], [14]]
[[], [2], [0, 2, 9]]
[[8], [2]]
[[5, 8, 10, 17], [18], []]
[[]]
[[1]]
[[], [], [], []]
[[]]
[[]]
[[]]
[[5, 12], [1, 2, 9, 10, 12], [], [], []]
[[], [2], [], [6], [], [], [], [], [1]]
[[3], [], [], []]
[[4], [4]]
[[4], []]
[[]]
[[]]
[[0]]
[[], []]
[[], [14], [8], [5]]
[[1], [5], [1, 3]]
[[2], [], []]
[[4], [], [3]]
[[12, 18, 22], [1]]
[[1]]
[[3, 6]]
[[4], []]
[[], [0], [], [5]]
[[], [], []]
[[], [2, 4]]
[[], [1]]
[[1], [15], []]
[[1, 4], []]
[[]]
[[], [9, 10], [], [], []]
[[1, 2, 6, 9, 12]]
[[], [14]]
[[], [2], [0, 2, 9]]
[[8], [2]]
[[5, 8, 10, 17], [18], []]
[[]]
[[1]]
[[], [], [], []]
[[]]
[[]]
[[]]
[[5, 12], [1, 2, 9, 10, 12], [], [], []]
[[], [2], [], [6], [], [], [], [], [1]]
[[3], [], [], []]
[[4], [4]]
[[4], []]
[[]]

In [51]:
# docs is a list of Doc objects
for doc_idx, doc in enumerate(docs):
  # sent is an iterable of sentences in doc (i.e. a span of doc)
  print("Document", doc_idx)
  for sent_idx, sent in enumerate(doc.sents):
    # for that sentence's index and document, look in doc list for hits
    for tok_idx in doc_list[doc_idx][sent_idx]:
      # print the token in sent that triggered a topic hit
      print(sent[tok_idx].lemma_)
  print()
      

Document 0

Document 1
food
chicken
food

Document 2
food
food
dinner
lunch

Document 3
meal

Document 4
food
restaurant

Document 5
food
takeout
delicious
food

Document 6
food

Document 7
tasty
food

Document 8
food

Document 9
food
soup

Document 10

Document 11
fatty
chewy

Document 12
fry

Document 13
food
food

Document 14
food
crispy

Document 15

Document 16
chicken
fry

Document 17
chicken
sandwich
tender
food
kfc

Document 18
delicious

Document 19
fry
chicken
fry
flavor

Document 20
burger
food

Document 21
burger
fry
drink
drink
drink

Document 22

Document 23
milkshake

Document 24

Document 25

Document 26

Document 27

Document 28
food
food
slice
pizza
pizza
slice
cheese

Document 29
mexican
food
costco

Document 30
food

Document 31
food
delicious

Document 32
bite

Document 33

Document 34

Document 35
lunch

Document 36

Document 37
food
chicken
food

Document 38
food
food
dinner
lunch

Document 39
meal

Document 40
food
restaurant

Document 41
food
takeout
delicious
