In [21]:
import json
import os
import pandas as pd
import nltk
import gensim as gs
import pickle
import pyLDAvis.gensim
import math

In [2]:
# Reading in the preprocessed reviews from Task 1
restaurant_review = pd.read_csv("../restaurant_reviews_processed.csv", sep=":")
restaurant_review["categories"] = restaurant_review["categories"].apply(lambda x: set(eval(x)))
restaurant_review["review"] = restaurant_review["review"].apply(eval)

In [3]:
#Manually created list of 15 cuisines from the categories. Chose these from the ones with the most reviews.
cuisines = {"American (New)", "Mexican", "Italian", "Steakhouses", "Japanese", "Chinese", "Sushi Bars", "Seafood", "Fast Food", "Thai" ,"Asian Fusion", "Mediterranean", "Barbeque", "French", "Cafes"}

In Task 1 we saw that using _lift_ to caculate the relevance of a word when categorised by cuisine generated at least one topic that was distinctly representative of the cuisine. Hence I have decided to use the same method to describe a cuisine. Instead of using a separate model for each cuisine where 1 topic describes the cuisine I have attempted to generate 15 topics for a set of reviews that pertain to 15 cuisines and I am hoping that I will be able to get 1 topic per cuisine.

In [15]:
#Creating a sample set of reviews by sampling 10000 rows from each cuisine resulting in a set of 15 * 10000 rows.
rows_by_cuisine = []
for c in cuisines:
    rows_by_cuisine.append(restaurant_review[restaurant_review["categories"].map(lambda x: c in x)].sample(10000))

training_set = pd.concat(rows_by_cuisine)
training_set

Unnamed: 0.1,Unnamed: 0,user_id,stars_review,date,review,business_id,categories,city,review_count,name,state,stars_restaurant
60328,99907,KkEFlQAmrKf6BtPVi6EU7g,4,2011-01-05,"[long, airport, felt, throat, scratchy, needed...",FV16IeXJp2W6pnghTz2FAw,"{Chinese, Vietnamese, Restaurants}",Las Vegas,1314,Pho Kim Long,NV,3.5
294912,476274,6dM4_zDchbQnfoAH4txrdQ,1,2012-10-20,"[hour, cold, sauce, fortune, cookies, called, ...",ZWcZ77EQZqNVwRSfBubX7A,"{Chinese, Restaurants}",North Las Vegas,25,China A Go Go,NV,3.0
22175,35725,8r_J4mpyVNXThn90uGp_MA,5,2010-07-20,"[judge, book, cover, momma, told, growin, hold...",LGLOtR2e-v74MpB51k8hxQ,"{Buffets, Chinese, Restaurants}",Phoenix,21,Lim's Chinese Buffet,AZ,3.5
91132,150638,lLY75QmKzN-qxmey_AzcyQ,1,2013-10-06,"[springroll, veggies, thing, thai, volcano, be...",LqfCXOcyNPzDtBgy5M9VFA,"{Chinese, Restaurants}",Phoenix,42,Abacus Inn,AZ,4.0
577185,923325,d5-mxnQioBZ65ZNe6WqR4A,5,2013-12-26,"[excellent, chinese, cousins, mt.charleston, t...",1PQNIw4DgWTy_18cH5hVCA,"{Chinese, Restaurants}",Las Vegas,38,China One,NV,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
154492,256620,Xd_gK-S3bep9iJAJKZonSg,4,2011-07-07,"[honest, sick, buffets, vegas, addition, wicke...",bB1bVvOMkfvWFVbNmRl_VA,"{Steakhouses, Restaurants, Japanese, Sushi Bars}",Las Vegas,284,Hikari,NV,3.5
516966,831877,Nv_vtVcSctrUS9SGLzMh8A,5,2010-10-16,"[ramen, vegas, stationed, japan, finally, oishi]",DjOxXobyGDwWt89q4z1twg,"{Ramen, Japanese, Restaurants}",Las Vegas,1127,Monta Ramen,NV,4.0
33905,55927,bYZN1gwIQRS_miEPqeV9og,3,2011-12-29,"[times, eaten, forced, loud, techno, music]",5vgmIwBB48fwo_WC8vGiXA,"{Restaurants, Japanese, Sushi Bars}",Chandler,80,Ninja Japanese Restaurant,AZ,3.0
9326,14407,B-4_1ATtqCuFlKk1c24b2w,3,2014-01-10,"[star, seated, server, prompt, rating, based, ...",AkOruz5CrCxUmXe1p_WoRg,"{Italian, Asian Fusion, Japanese, Restaurants}",Phoenix,494,Cherryblossom Noodle Cafe,AZ,4.0


In [16]:
def create_lda_model(reviews, topic_count):
    dictionary = gs.corpora.Dictionary(reviews)
    corpus = [dictionary.doc2bow(text) for text in reviews]
    ldamodel = gs.models.ldamodel.LdaModel(corpus, num_topics = topic_count, id2word=dictionary, passes=10, alpha="auto")
    return (ldamodel, corpus, dictionary)

In [17]:
(model, corpus, dictionary) = create_lda_model(training_set["review"], 15)

In [18]:
lda_d = pyLDAvis.gensim.prepare(model, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_d)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


The topics were not segregated by cuisines as I had hoped. So I am now using simple word probabilities on the reviews after filtering them by the cuisine and using the pyLDAvis relevance metric to rank the words in the reviews

In [65]:
dict_b = gs.corpora.Dictionary(restaurant_review["review"])
total_word_cnt = restaurant_review["review"].apply(len).sum()

In [123]:
def get_wrd_prob_for_cuisine(cuisine):
    rows = restaurant_review[restaurant_review["categories"].map(lambda x: cuisine in x)]
    d = gs.corpora.Dictionary(rows["review"])
    d.filter_extremes(no_above=0.5, no_below=math.floor(0.05 * len(rows)))   # filter words present in more than 50% docs and less than 5% of reviews
    total_words_in_cuisine = rows["review"].apply(len).sum()
    return pd.Series({d[tokenid]: d.cfs[tokenid]/total_words_in_cuisine for tokenid in d.keys()})

In [124]:
res_thai = get_wrd_prob_for_cuisine("Thai")
res_thai.sort_values(ascending=False)

curry           0.012350
rice            0.009387
spicy           0.008782
lunch           0.008225
dish            0.007159
                  ...   
decor           0.001518
kind            0.001505
things          0.001500
inside          0.001488
disappointed    0.001423
Length: 101, dtype: float64

As you can see the words are fairly related to the thai cuisine. However, we can further improve the relevant word selection by using _lift_ as measure of the importance of a word in the topic. This will help reduce the importance of words that are common in all the reviews.

In [125]:
def calculate_and_sort_by_lift(word_prob):
    for word in word_prob.index:
        marginal_p_of_word = dict_b.cfs[dict_b.token2id[word]]/total_word_cnt
        word_prob[word] = word_prob[word]/marginal_p_of_word
        word_prob.sort_values(ascending=False, inplace=True)

In [126]:
calculate_and_sort_by_lift(res_thai)
res_thai.head(30)

panang         34.287433
curry          18.192718
yellow         10.891345
basil           9.708976
coconut         8.796578
spice           8.217093
level           7.380260
tofu            7.232515
noodles         6.889217
noodle          6.324116
spring          5.304330
heat            5.281234
spicy           5.043486
rice            4.126125
asian           3.452444
soup            3.444576
dish            3.314082
dishes          3.186487
authentic       2.992105
fried           2.910058
green           2.787787
lunch           2.265344
restaurants     2.102224
beef            2.085345
crispy          2.068942
rolls           2.051999
shrimp          1.943970
portion         1.879022
portions        1.727536
sweet           1.716124
dtype: float64

Now the words are far more related to the thai cuisine. Thus using lift as the relevance metric for a word in a topic is better able to characterise cuisines.

In [133]:
word_prob_by_cuisine = {}
for c in cuisines:
    p = get_wrd_prob_for_cuisine(c)
    calculate_and_sort_by_lift(p)
    word_prob_by_cuisine[c] = p.head(20)   #Taking the top 20 words as representative of a cuisine

In [134]:
word_prob_by_cuisine["Mexican"]

asada         10.659338
carne         10.325261
enchiladas    10.067042
mexican        9.729862
salsa          9.587581
burritos       9.566158
taco           9.493130
tortillas      9.378383
margaritas     9.039198
burrito        8.995167
tacos          8.817761
guacamole      8.679086
tortilla       7.375968
margarita      6.988893
chips          5.785699
beans          5.693767
authentic      2.966590
green          2.554212
rice           1.916101
fast           1.597518
dtype: float64

In [136]:
df = pd.DataFrame(word_prob_by_cuisine)
df = df.applymap(lambda x: 0 if math.isnan(x) else x)
df

Unnamed: 0,Asian Fusion,Japanese,Cafes,Seafood,Chinese,Mediterranean,Fast Food,French,American (New),Sushi Bars,Thai,Barbeque,Steakhouses,Mexican,Italian
appetizer,0.000000,0.0,0.000000,1.749961,0.000000,0.000000,0.000000,0.000000,1.425332,0.0,0.000000,0.0,0.0,0.000000,1.646645
asada,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,10.659338,0.000000
asian,6.352898,0.0,0.000000,0.000000,5.643795,0.000000,0.000000,0.000000,0.000000,0.0,3.452444,0.0,0.0,0.000000,0.000000
atmosphere,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.346902,0.0,0.000000,0.0,0.0,0.000000,0.000000
authentic,0.000000,0.0,0.000000,0.000000,2.860458,0.000000,0.000000,0.000000,0.000000,0.0,2.992105,0.0,0.0,2.966590,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wine,0.000000,0.0,0.000000,1.692548,0.000000,0.000000,0.000000,2.962192,1.608627,0.0,0.000000,0.0,0.0,0.000000,2.832224
wonderful,0.000000,0.0,0.000000,0.000000,0.000000,1.476369,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,1.483534
work,0.000000,0.0,1.495198,0.000000,0.000000,0.000000,1.637722,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000
wrong,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.817043,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000


In [138]:
df_t = df.T
df_t

Unnamed: 0,appetizer,asada,asian,atmosphere,authentic,ayce,bacon,basil,beans,beautiful,...,tortilla,tortillas,tuna,view,waiter,wine,wonderful,work,wrong,yellow
Asian Fusion,0.0,0.0,6.352898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.57419,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Japanese,0.0,0.0,0.0,0.0,0.0,9.221005,0.0,0.0,0.0,0.0,...,0.0,0.0,5.577363,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cafes,0.0,0.0,0.0,0.0,0.0,0.0,2.073344,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.495198,0.0,0.0
Seafood,1.749961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.667978,1.692548,0.0,0.0,0.0,0.0
Chinese,0.0,0.0,5.643795,0.0,2.860458,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mediterranean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.476369,0.0,0.0,0.0
Fast Food,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.637722,1.817043,0.0
French,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.761837,...,0.0,0.0,0.0,7.609901,0.0,2.962192,0.0,0.0,0.0,0.0
American (New),1.425332,0.0,0.0,1.346902,0.0,0.0,1.548749,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.608627,0.0,0.0,0.0,0.0
Sushi Bars,0.0,0.0,0.0,0.0,0.0,14.844523,0.0,0.0,0.0,0.0,...,0.0,0.0,6.814647,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
sim_matrix = df_t.dot(df)
sim_matrix

Unnamed: 0,Asian Fusion,Japanese,Cafes,Seafood,Chinese,Mediterranean,Fast Food,French,American (New),Sushi Bars,Thai,Barbeque,Steakhouses,Mexican,Italian
Asian Fusion,437.301543,78.025834,0.0,6.241891,179.110775,4.245693,0.0,0.0,2.200825,95.283338,97.590449,17.682139,0.0,5.353411,2.44892
Japanese,78.025834,1355.356318,0.0,9.434196,34.040413,0.0,0.0,0.0,0.0,1137.136238,19.955982,0.0,0.0,0.0,0.0
Cafes,0.0,0.0,268.123433,0.0,0.0,0.0,5.782179,31.153549,3.21109,0.0,0.0,6.176061,0.0,0.0,0.0
Seafood,6.241891,9.434196,0.0,257.15862,9.375725,0.0,0.0,26.330418,9.172054,12.114159,0.0,0.0,77.874508,0.0,16.395422
Chinese,179.110775,34.040413,0.0,9.375725,550.539918,6.057276,0.0,0.0,0.0,33.299152,200.035016,25.420642,0.0,16.123448,3.163924
Mediterranean,4.245693,0.0,0.0,0.0,6.057276,5252.934042,8.107938,0.0,2.624017,3.109648,6.270162,5.575972,0.0,2.911754,9.291808
Fast Food,0.0,0.0,5.782179,0.0,0.0,8.107938,981.972618,0.0,6.506736,0.0,0.0,3.382149,0.0,9.661602,0.0
French,0.0,0.0,31.153549,26.330418,0.0,0.0,0.0,2186.714289,8.993193,0.0,0.0,0.0,75.879186,0.0,8.389592
American (New),2.200825,0.0,3.21109,9.172054,0.0,2.624017,6.506736,8.993193,43.035569,7.212331,0.0,0.0,0.0,0.0,10.655624
Sushi Bars,95.283338,1137.136238,0.0,12.114159,33.299152,3.109648,0.0,0.0,7.212331,1383.548222,20.706764,3.848922,0.0,3.920968,0.0


Next lets try and compare cuisines based on other tags commonly occurring with the cuisine tags