In [91]:
import json
import pandas as pd
import nltk
import boto3
import numpy as np
import seaborn as sns

In [16]:
dataset_folder = "yelp_dataset_challenge_academic_dataset"
business_datafile = "yelp_academic_dataset_business.json"
review_datafile = "yelp_academic_dataset_review.json"
user_datafile = "yelp_academic_dataset_user.json"

Reading in the buisness data

In [3]:
business_data = pd.read_json(f"../{dataset_folder}/{business_datafile}", lines=True, typ="frame", orient="columns")
business_data.drop(["hours", "open", "full_address", "neighborhoods", "longitude", "latitude", "attributes", "type", "review_count"], axis=1, inplace=True)
business_data.set_index("business_id", inplace=True, verify_integrity=True)
business_data["categories"] = business_data["categories"].apply(lambda x: set(x))

Filtering out Chinese restaurants

In [4]:
ch_rest_data = business_data[business_data["categories"].map(lambda x: "Restaurants" in x and "Chinese" in x)]

In [7]:
ch_rest_data

Unnamed: 0_level_0,categories,city,name,state,stars
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RgDg-k9S5YD_BaxMckifkg,"{Restaurants, Chinese}",De Forest,Chang Jiang Chinese Kitchen,WI,4.0
UgjVZTSOaYoEvws_lAP_Dw,"{Restaurants, Chinese}",Mc Farland,Main Moon Chinese Restaurant,WI,3.5
MKsb2VpLB-0UBODcInDsSw,"{Restaurants, Chinese}",Middleton,China Wok Buffet,WI,2.5
KW6HejC-67KSL9J8Cz1dSw,"{Restaurants, Chinese}",Middleton,Chin's Asia Fresh,WI,3.0
wl-7A4jC0f27MOEmW-XTbQ,"{Restaurants, Chinese}",Middleton,Grand China Restaurant,WI,3.0
...,...,...,...,...,...
MJLe4pP-0ZOFHuFt0CIoRw,"{Restaurants, Chinese}",Las Vegas,A & K Chinese Restaurant,NV,4.0
8T4aPvtpugzUxJc-DVrLHg,"{Restaurants, Chinese}",Las Vegas,Dumpling King,NV,4.5
4Db74Jsyp81bei99fo-3Xg,"{Mongolian, Restaurants, Chinese, Buffets}",Madison,World Buffet,WI,2.5
kiA-L7r9uHh2bqUPxuCzNQ,"{Restaurants, Thai, Asian Fusion, Chinese}",Las Vegas,Oriental House,NV,4.5


Fetching the reviews on the filtered restaurants

In [5]:
review_data = pd.read_json(f"../{dataset_folder}/{review_datafile}", lines=True, typ="frame", orient="columns")
review_data.drop(["date", "type"], axis=1, inplace=True)
review_data.set_index("review_id", inplace=True)

In [11]:
complete_data = review_data.join(ch_rest_data, how="inner", on="business_id", lsuffix="_rev", rsuffix="_res")

Calculating the total votes received by the review

In [9]:
complete_data["tot_votes"] = complete_data["votes"].map(lambda obj: obj["funny"] + obj["useful"] + obj["cool"])
complete_data.drop("votes", inplace=True, axis=1)

Converting the reviews texts into sentences so we can find the ones containing our dishes

In [13]:
def sent_tokenise_reviews(review_txt):
    regex_tk = nltk.tokenize.RegexpTokenizer(pattern=r"[.?!;,\n]+" ,gaps=True, discard_empty=True)
    sentences = [s.strip() for s in regex_tk.tokenize(review_txt)]
    
    return sentences

In [14]:
complete_data["review"] = complete_data["text"].map(sent_tokenise_reviews)
complete_data.drop("text", axis=1, inplace=True)

We also want to factor in the popularity of the user in our score for the review.

In [19]:
user_data = pd.read_json(f"../{dataset_folder}/{user_datafile}", lines=True, typ="frame", orient="columns")
user_data.drop(["yelping_since", "votes", "average_stars", "type", "compliments", "elite", "name"], axis=1, inplace=True)
user_data.set_index("user_id", inplace=True)

Joining this data with our review and retaurant table

In [22]:
complete_data = complete_data.join(user_data, how="inner", on="user_id")

ValueError: columns overlap but no suffix specified: Index(['review_count', 'friends', 'fans'], dtype='object')

This is what our final processed dataset looks like

In [23]:
complete_data

Unnamed: 0_level_0,user_id,stars_rev,business_id,tot_votes,categories,city,name,state,stars_res,review,review_count,friends,fans
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ZYaS2P5EmK9DANxGTV48Tw,u5xcw6LCnnMhddoxkRIgUA,5,RgDg-k9S5YD_BaxMckifkg,0,"{Restaurants, Chinese}",De Forest,Chang Jiang Chinese Kitchen,WI,4.0,[I really like both Chinese restaurants in tow...,21,"[Xiprzd4TvJcZnVHGPdDh1A, vNMHIyT76krWH3LXevQ96A]",1
uOLM0vvnFdp468ofLnszTA,kj18hvJRPLepZPNL7ySKpg,3,RgDg-k9S5YD_BaxMckifkg,0,"{Restaurants, Chinese}",De Forest,Chang Jiang Chinese Kitchen,WI,4.0,"[Above average takeout with friendly staff, Th...",75,"[3T66OaLFT0xAnqW4IGNwrQ, dFSg9ZUwdvXO1c1ZACgZz...",3
8EjIXDrV5QjmjU52PePGkw,kj18hvJRPLepZPNL7ySKpg,2,unXHmjewHGeWQwJr74Sy9g,0,"{Restaurants, Chinese}",Madison,China Inn,WI,3.0,[Uninspired Chinese food from very nice people...,75,"[3T66OaLFT0xAnqW4IGNwrQ, dFSg9ZUwdvXO1c1ZACgZz...",3
v4jw2p9EdIIU5EyNUR7eJA,kj18hvJRPLepZPNL7ySKpg,4,unXHmjewHGeWQwJr74Sy9g,1,"{Restaurants, Chinese}",Madison,China Inn,WI,3.0,[China Inn seems to have improved since I last...,75,"[3T66OaLFT0xAnqW4IGNwrQ, dFSg9ZUwdvXO1c1ZACgZz...",3
1yaTswITBDOQXCttxw7lwA,kj18hvJRPLepZPNL7ySKpg,4,DR4xcr3eJ0vfYRomvx036w,1,"{Restaurants, Chinese}",Madison,Wah Kee Wonton Noodle,WI,3.5,"[This is where we go for comfort food, As you ...",75,"[3T66OaLFT0xAnqW4IGNwrQ, dFSg9ZUwdvXO1c1ZACgZz...",3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
AX5EBWZQ8hnv-EAXb6LaGQ,yKXsbgzpI3710TNuXZUdmQ,5,kiA-L7r9uHh2bqUPxuCzNQ,1,"{Restaurants, Thai, Asian Fusion, Chinese}",Las Vegas,Oriental House,NV,4.5,"[My girlfriend came here with her mom, and tho...",1,[WYLynFCS994Vsb8T6OE7FA],0
MGZ5hFzc0zx0doMNZ77FOw,nVLjR4wlwRqy2pDevDyczw,5,kiA-L7r9uHh2bqUPxuCzNQ,1,"{Restaurants, Thai, Asian Fusion, Chinese}",Las Vegas,Oriental House,NV,4.5,[I have been craving pad Thai lately and tryin...,1,[],0
5O8hdZBAxKqxaJwzr2baTw,C0wuzY2sMy2wFw5AKNf0Tw,4,kiA-L7r9uHh2bqUPxuCzNQ,2,"{Restaurants, Thai, Asian Fusion, Chinese}",Las Vegas,Oriental House,NV,4.5,[We VegaS-ites are lucky and if you are anywhe...,1,[],0
GCNMuWjMTPuscr2NmZ947Q,pMXXX6AbUXgivkju9S7YDQ,5,kiA-L7r9uHh2bqUPxuCzNQ,3,"{Restaurants, Thai, Asian Fusion, Chinese}",Las Vegas,Oriental House,NV,4.5,[Stopped by on 7-12-2014 by chance after simpl...,25,[],0


Now we'll load the list of chinese dish names that were extracted in Task 3

In [28]:
dishes = pd.read_csv("chinese_phrases.csv")
dishes

Unnamed: 0,Dish,confidence
0,dim sum,1.000000
1,sweet and sour sauce,0.999881
2,fried rice,0.999524
3,sesame oil,0.999523
4,el topo,0.999448
...,...,...
112947,a and,0.000000
112948,a a,0.000000
112949,a 's,0.000000
112950,'s review,0.000000


This list is quite long. Since its already sorted by confidence lets select the top 50 dishes

In [30]:
dishes = dishes.head(50)


Now we need to find all the sentences in the reviews that contains the names of these dishes and run them through a sentiment analyser. For this excercise I decided to the __AWS Comprehend__ service that provides a sentiment analyser out of the box. We'll calculate the review sentiment for each dish, restaurant pair so that we can figure out which dish is good in which restaurant

Fetching the sentences for each review which contains the dishes we are interested in. Writing these out to a file since its a long operation and I can easily load a data from disk when I need it

In [45]:
with open("./output/dish_reviews.csv", "w", encoding="utf-8") as f:
    f.write("dish,review_id,business_id,sentences\n")
    for dish in dishes["Dish"]:
        for idx, row in complete_data.iterrows():
            rel_sentences = ""
            for sentence in row["review"]:
                if sentence.find(dish) > -1:
                    rel_sentences = rel_sentences + sentence + "|"
            if rel_sentences != "":
                f.write(dish + ',' + idx + ',' + row['business_id'] + ',' +rel_sentences + '\n')

Reading the data we wrote back into a dataframe

In [49]:
dish_and_reviews = pd.read_csv("./output/dish_reviews.csv")
dish_and_reviews["sentences"] = dish_and_reviews["sentences"].map(lambda x: x[:-1].split('|'))

In [55]:
dish_and_reviews

Unnamed: 0,dish,review_id,business_id,sentences
0,dim sum,5f5Ep2WJ-CeNGXCDsq5VEQ,MvWxzyY3zIFH1fZL53E1yw,[Madison has a place that serves up great dim ...
1,dim sum,Bgm7vT6X9PFBxzYHVvfNdQ,jbKVbSz51F8IcewsiRQu4A,"[sea bass) but I've only been for dim sum, One..."
2,dim sum,pSNvA6hVY0n994nOH0EaNg,9Qt1pt0pk2VWz0chdGk-jw,"[*Review for dim sum only, part a la carte dim..."
3,dim sum,_njEQDYNFeS6fiyFJX4C7w,9Qt1pt0pk2VWz0chdGk-jw,[says Phoenix Palace acquired a master dim sum...
4,dim sum,JQ5-oEFiAiWAEwiXeN3VRg,yOYFhiTjT-SM4spKtDk92w,[Ordering cooked-to-order dim sum off of a men...
...,...,...,...,...
17536,napa cabbage,i21Wt5HTbHk-z0oVpNaKgg,jgWAgT7LWPTf5wju9Q-kiw,[- Steamed dumplings with pork and napa cabbage]
17537,napa cabbage,1aMB8TMkf5EpnQ6NApClxQ,jgWAgT7LWPTf5wju9Q-kiw,[Tried ones with pork and napa cabbage and ste...
17538,napa cabbage,7Ipe2TNwTl6gkcHHV3vdBA,8HCC8NuJmdHnEg2XC00NPg,[napa cabbage]
17539,napa cabbage,PTlhdVvULEapTD46DfSGLw,2XmS09dl3GT_WknqS9hTVA,[napa cabbage]


Now that we have the sentences split properly, we'll call the AWS comprehend APIs to give us the sentiment for each dish and review combination. Since the API returns 4 scores for each line, one for each type of sentiment(positive, negative, mixed, neutral) and we want a single metric to be able to compare and sort the dishes, I decided to take a weighted sum of the salient score for each line. Mixed scores were ignored since the sentiment was probably not obvious. Positive scores are added whereas negative scores are subtracted. Neutral scores were scaled down and added. The final score that will be generated will therefore be higher if its a positive review and lower if negative. These scores can then be used to sort the dishes.

In [64]:
comprehend = boto3.client(service_name='comprehend', region_name='ap-southeast-1')

def get_sentiment_score_aws(sentence_list):
    score = 0
    for sentence in sentence_list:
        result = comprehend.detect_sentiment(Text=sentence, LanguageCode='en')
        sentiment_info = result["SentimentScore"]
        if result["Sentiment"] == "POSITIVE":
            score = score + sentiment_info["Positive"]
        elif result["Sentiment"] == "NEGATIVE":
            score = score + sentiment_info["Negative"] * -1
        elif result["Sentiment"] == "NEUTRAL":
            score = score + sentiment_info["Neutral"] * 0.3
        else:
            score = score + 0
            #ignore the cases where the sentiment is mixed
        
    return score

In [65]:
dish_and_reviews["sentiment_score"] = dish_and_reviews["sentences"].map(get_sentiment_score_aws)

In [66]:
dish_and_reviews

Unnamed: 0,dish,review_id,business_id,sentences,sentiment_score
0,dim sum,5f5Ep2WJ-CeNGXCDsq5VEQ,MvWxzyY3zIFH1fZL53E1yw,[Madison has a place that serves up great dim ...,0.259460
1,dim sum,Bgm7vT6X9PFBxzYHVvfNdQ,jbKVbSz51F8IcewsiRQu4A,"[sea bass) but I've only been for dim sum, One...",0.458380
2,dim sum,pSNvA6hVY0n994nOH0EaNg,9Qt1pt0pk2VWz0chdGk-jw,"[*Review for dim sum only, part a la carte dim...",0.807433
3,dim sum,_njEQDYNFeS6fiyFJX4C7w,9Qt1pt0pk2VWz0chdGk-jw,[says Phoenix Palace acquired a master dim sum...,0.995324
4,dim sum,JQ5-oEFiAiWAEwiXeN3VRg,yOYFhiTjT-SM4spKtDk92w,[Ordering cooked-to-order dim sum off of a men...,0.786310
...,...,...,...,...,...
17536,napa cabbage,i21Wt5HTbHk-z0oVpNaKgg,jgWAgT7LWPTf5wju9Q-kiw,[- Steamed dumplings with pork and napa cabbage],0.297442
17537,napa cabbage,1aMB8TMkf5EpnQ6NApClxQ,jgWAgT7LWPTf5wju9Q-kiw,[Tried ones with pork and napa cabbage and ste...,0.252151
17538,napa cabbage,7Ipe2TNwTl6gkcHHV3vdBA,8HCC8NuJmdHnEg2XC00NPg,[napa cabbage],0.297876
17539,napa cabbage,PTlhdVvULEapTD46DfSGLw,2XmS09dl3GT_WknqS9hTVA,[napa cabbage],0.297876


The now that we have the scores from the reviews we also would like to incorporate the information of the user who provided the review and also the upvotes on the review. The intuition here is that users who have a large number of followers are probably food connoisseurs who are regular users of the platform and hence their reviews should be given more weightage. Also, reviews which have garnered a lot of upvotes are also likely of high quality. In order to bring this information in we'll first have to join with our *complete_data* table that has that information

In [71]:
reviews_with_scores = complete_data.merge(dish_and_reviews, how="inner", on="review_id")
reviews_with_scores

Unnamed: 0,review_id,user_id,stars_rev,business_id_x,tot_votes,categories,city,name,state,stars_res,review,review_count,friends,fans,dish,business_id_y,sentences,sentiment_score
0,ZYaS2P5EmK9DANxGTV48Tw,u5xcw6LCnnMhddoxkRIgUA,5,RgDg-k9S5YD_BaxMckifkg,0,"{Restaurants, Chinese}",De Forest,Chang Jiang Chinese Kitchen,WI,4.0,[I really like both Chinese restaurants in tow...,21,"[Xiprzd4TvJcZnVHGPdDh1A, vNMHIyT76krWH3LXevQ96A]",1,crab rangoon,RgDg-k9S5YD_BaxMckifkg,[This one has outstanding crab rangoon],0.992379
1,nFPe7kazeEVjPP7zgoK4Og,iOPfu26Siat3sx-8SM63OQ,5,UgjVZTSOaYoEvws_lAP_Dw,1,"{Restaurants, Chinese}",Mc Farland,Main Moon Chinese Restaurant,WI,3.5,"[Main Moon has the best lunch in McFarland, bu...",3,[],0,fried rice,UgjVZTSOaYoEvws_lAP_Dw,[Their fried rice is legit],0.562663
2,_Ev8oZvNjOeZLpWlu66QDg,hJbhbqg-x3TFzhvqqwl5RA,4,MKsb2VpLB-0UBODcInDsSw,1,"{Restaurants, Chinese}",Middleton,China Wok Buffet,WI,2.5,[The china wok buffet is the best Chinese buff...,6,"[tBctER6R_riVNVBAQBCOYA, tF1qQ5Ik8tb1xtXcU-eW_A]",0,deep fried,MKsb2VpLB-0UBODcInDsSw,[deep fried shrimp],0.268454
3,_Ev8oZvNjOeZLpWlu66QDg,hJbhbqg-x3TFzhvqqwl5RA,4,MKsb2VpLB-0UBODcInDsSw,1,"{Restaurants, Chinese}",Middleton,China Wok Buffet,WI,2.5,[The china wok buffet is the best Chinese buff...,6,"[tBctER6R_riVNVBAQBCOYA, tF1qQ5Ik8tb1xtXcU-eW_A]",0,egg roll,MKsb2VpLB-0UBODcInDsSw,[egg rolls],0.212986
4,pV8vej4lot9N8wSRCBE7Yw,R04P35fYVS4HASyBuMcElg,4,yXBo7Wgos9B-KzkYSMrNJw,9,"{Restaurants, Chinese}",Madison,DumplingHaus,WI,3.5,[The Wife and I decided to try DumplingHaus at...,114,"[k71n8bELfnhMeC63xhUxzw, TAOoRCZFOyySXWfOSaLQi...",7,green onion,yXBo7Wgos9B-KzkYSMrNJw,[green onions],0.297165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17536,UYkuV1_oxMCWImvNofyfgA,r1cO2JxQr8heDbkwNBnmIw,4,MJLe4pP-0ZOFHuFt0CIoRw,4,"{Restaurants, Chinese}",Las Vegas,A & K Chinese Restaurant,NV,4.0,"[Simply, some of the best Chinese food in Las ...",9,"[_y7Hgs3-uT4uWGMRvCTExg, Kx4v-rKmNLZB0xdwfoAsS...",0,fried rice,MJLe4pP-0ZOFHuFt0CIoRw,[We also had fried rice and a crispy noodle wi...,0.253102
17537,ANiVL-pZQuC8Pl1KtOwCpA,n7SG66zQG_w-0y2W4jURIg,5,kiA-L7r9uHh2bqUPxuCzNQ,6,"{Restaurants, Thai, Asian Fusion, Chinese}",Las Vegas,Oriental House,NV,4.5,[I decided to give the Oriental House a try be...,64,"[5663s_8d6zqCXh4CS8a0LA, 67SeYDp8jKWsg2desVQk1...",1,fried rice,kiA-L7r9uHh2bqUPxuCzNQ,[Hawaiian fried rice],0.297853
17538,ANiVL-pZQuC8Pl1KtOwCpA,n7SG66zQG_w-0y2W4jURIg,5,kiA-L7r9uHh2bqUPxuCzNQ,6,"{Restaurants, Thai, Asian Fusion, Chinese}",Las Vegas,Oriental House,NV,4.5,[I decided to give the Oriental House a try be...,64,"[5663s_8d6zqCXh4CS8a0LA, 67SeYDp8jKWsg2desVQk1...",1,noodle soup,kiA-L7r9uHh2bqUPxuCzNQ,[Everything from Lemon Grass Chicken to Chaba ...,0.290666
17539,eY0JzbLw-v8FFqVJ59IF5Q,Dr5hwEis0kRqHSZyNd00Kg,4,kiA-L7r9uHh2bqUPxuCzNQ,7,"{Restaurants, Thai, Asian Fusion, Chinese}",Las Vegas,Oriental House,NV,4.5,[My friends and I decided to try this place ou...,3,[],0,iced tea,kiA-L7r9uHh2bqUPxuCzNQ,[and was even so generous enough to give us fr...,0.991410


Now we want to scale the sentiment scores by the log(upvotes + fans). HAd to add the 2 to ensure that if the sum of fans and tot_votes is zero then the score won't change

In [76]:
reviews_with_scores["sentiment_score"] = reviews_with_scores["sentiment_score"] * np.log2(2 + reviews_with_scores["fans"] + reviews_with_scores["tot_votes"])
reviews_with_scores["sentiment_score"]

0        0.992379
1        0.562663
2        0.268454
3        0.212986
4        1.214653
           ...   
17536    0.587684
17537    0.893559
17538    0.871999
17539    2.974230
17540    0.574219
Name: sentiment_score, Length: 17541, dtype: float64

Now we are ready to analyse the sentiments. We can:
* __Group by dishes__ and figure out which dishes are the most popular for the cuisine.
* __Group by restaurants__ and extract the dishes that are most popular for a given restaurant
  
Once we have both the above we can recommend restaurants to people who desire to try a new cuisine based on which dishes are popular and where we coud have them

### Grouping by dishes

In [96]:
gb_dishes = reviews_with_scores.loc[:, ["dish", "sentiment_score"]].groupby("dish")
popular_dishes = gb_dishes.mean()
popular_dishes = popular_dishes.sort_values(by="sentiment_score", ascending=False)

In [101]:
popular_dishes.head(10)

Unnamed: 0_level_0,sentiment_score
dish,Unnamed: 1_level_1
filet mignon,2.25704
pork belly,1.864442
oyster sauce,1.759486
hot pot,1.599117
napa cabbage,1.411926
san gabriel,1.214227
sea bass,1.171565
iced tea,1.104736
peking duck,1.07482
papaya salad,1.05483


### Grouping by restaurant and dishes

In [100]:
gb_restaurants = reviews_with_scores.loc[:, ["name", "dish", "sentiment_score"]].groupby(["name", "dish"])
best_dishes_rest = gb_restaurants.mean()
best_dishes_rest

Unnamed: 0_level_0,Unnamed: 1_level_0,sentiment_score
name,dish,Unnamed: 2_level_1
1 Eastern Super Buffet,chicken wings,3.389887
1 Eastern Super Buffet,crab legs,-1.771453
1 Eastern Super Buffet,crab rangoon,0.982680
1 Eastern Super Buffet,dim sum,1.068423
1 Eastern Super Buffet,egg roll,-5.446799
...,...,...
Zushi Japanese Bistro,soy sauce,2.494965
iKitchen,fried rice,0.299289
iKitchen,green onion,-0.255580
iKitchen,noodle soup,0.490711
