## Notebook is aiming to create a list of restaurants that have 5 or more reviews
## Notebook also labels individual reviews that are very low compared to the restaurant's average

In [1]:
#import a bunch of libraries at the very start of the project, will modify as things go on
import os
import pandas as pd
from dotenv import load_dotenv
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re
from collections import Counter
from nltk import ngrams
from wordcloud import WordCloud
import matplotlib as mpl
import matplotlib.pyplot as plt
import spacy
from spacy import displacy
import json

%matplotlib inline

loader = False

analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\trevo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
path_business = r"D:\yelp_data\yelp_academic_dataset_business.json"
# business_df = pd.read_json(path_business)
# business_df.head()

In [3]:
#Load in the Yelp overview of the businesses in the dataset
data_business = open(path_business)
read_data_business = data_business.read()
business_df = pd.read_json(read_data_business, lines = True)
business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
2,bvN78flM8NLprQ1a1y5dRg,The Reclaimory,4720 Hawthorne Ave,Portland,OR,97214,45.511907,-122.613693,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Antiques, Fashion, Used, Vintage & Consignment...","{'Thursday': '11:0-18:0', 'Friday': '11:0-18:0..."
3,oaepsyvc0J17qwi8cfrOWg,Great Clips,2566 Enterprise Rd,Orange City,FL,32763,28.914482,-81.295979,3.0,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Beauty & Spas, Hair Salons",
4,PE9uqAjdw0E4-8mjGl3wVA,Crossfit Terminus,1046 Memorial Dr SE,Atlanta,GA,30316,33.747027,-84.353424,4.0,14,1,"{'GoodForKids': 'False', 'BusinessParking': '{...","Gyms, Active Life, Interval Training Gyms, Fit...","{'Monday': '16:0-19:0', 'Tuesday': '16:0-19:0'..."


In [4]:
#Create a column to indicate whether or not a business is a restaurant
#business_df['restaurant_marker'] = business_df.apply(lambda x: 1 if 'Restaurants' in x['categories']  else 0,axis = 1)
business_df.dropna(inplace=True)
business_df['restaurant_marker'] = business_df.apply(lambda x: 1 if ("Restaurants" in x['categories'])  else 0,axis = 1)
business_df = business_df[business_df['restaurant_marker'] == 1]
business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,restaurant_marker
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",1
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",1
5,D4JtQNTI4X3KcbzacDJsMw,Bob Likes Thai Food,3755 Main St,Vancouver,BC,V5V,49.251342,-123.101333,3.5,169,1,"{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...",1
12,HPA_qyMEddpAEtFof02ixg,Mr G's Pizza & Subs,474 Lowell St,Peabody,MA,01960,42.541155,-70.973438,4.0,39,1,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","Food, Pizza, Restaurants","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",1
13,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,Orlando,FL,32806,28.513265,-81.374707,4.5,135,1,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18...",1


In [5]:
#Create a set of those businesses that are restaurants
business_list = set(business_df.business_id.unique())
print(business_df.shape)

(42646, 15)


In [6]:
path_review = r"D:\yelp_data\yelp_academic_dataset_review.json"
column_names = {"user_id":'',"stars":int(),"text":'',"business_id":''}
review_df = pd.DataFrame(column_names, index = [])
review_df["stars"] = review_df["stars"].astype("int8")


In [7]:
#Load in the reviews dataset, identifying whether or not a review is for a restaurant
review_holder = pd.read_json(path_review,lines = True, orient ="records",chunksize=100000) #, nrows = 4000000
for i in review_holder:
    i = i[["user_id","stars","text","business_id"]]
    i.dropna(inplace=True)
    i['restaurant_marker'] = business_df.apply(lambda x: 1 if (x['business_id'] in business_list)  else 0,axis = 1)
    i = i[i['restaurant_marker'] == 1]
    i["stars"] = i["stars"].astype("int8")
    review_df = review_df.append(i, ignore_index = True)
review_df.shape

(42646, 5)

In [8]:
review_df.head()

Unnamed: 0,user_id,stars,text,business_id,restaurant_marker
0,ak0TdVmGKo4pwqdJSTLwWw,4,Apparently Prides Osteria had a rough summer a...,buF9druCkbuXLX526sGELQ,1.0
1,YoVfDbnISlW0f7abNQACIg,4,This store is pretty good. Not as great as Wal...,RA4V8pr014UyUbDvI-LW2A,1.0
2,RNm_RWkcd02Li2mKPRe7Eg,1,"This place used to be a cool, chill place. Now...",xGXzsc-hzam-VArK6eTvtw,1.0
3,hn0ZbitvmlHnF--KJGJ6_A,4,I have been here twice and have had really goo...,TA1KUSCu8GkWP9w0rmElxw,1.0
4,B7YSV6r1ePAXc69FkDDuZw,1,I wish I could give them zero stars. The call ...,wZgUAuDuEGPEzKK-PsngKQ,1.0


In [9]:
#Count the number of reviews left for each restaurant, limit to those with more than 5 reviews
counts = review_df["business_id"].value_counts()
counts = counts[counts > 5]
counts

bZiIIUcpgxh8mpKMDhdqbA    327
H_RM2u1WWGU1HkKZrYq2Ow    230
jREzLrIEkc4jQKLfYMJ0gg    180
oz882XuZCxajKo64Opgq_Q    153
VPqWLp9kMiZEbctCebIZUA    148
                         ... 
AswNIkgOxjuPXLiqum7kMg      6
_8Hejg5Q-_izIhLvq2ocnw      6
LAhRM37ofCq5f4nCM20aCA      6
SGCyt7yM6ge2Ejz4WjC-aQ      6
G-FI19sD78LEJXppn9FMDw      6
Name: business_id, Length: 1839, dtype: int64

In [10]:
review_df['five_plus'] = review_df.apply(lambda x: 1 if (x['business_id'] in counts.index)  else 0,axis = 1)

In [11]:
#Keep only those restaurants with 5 or more reviews
review_df = review_df[review_df['five_plus'] == 1]
review_df.drop(['restaurant_marker', 'five_plus'],axis = 1, inplace = True)


In [12]:
#Merge the reviews with the business DF
review_df = pd.merge(review_df,business_df, on = 'business_id')
#review_df.head()
#business_df.head()

In [13]:
#Drop un-needed columns
review_df.drop(['address', 'city','state','postal_code','latitude','longitude','is_open','attributes','categories','hours','restaurant_marker'],axis = 1, inplace = True)
review_df.head()

Unnamed: 0,user_id,stars_x,text,business_id,name,stars_y,review_count
0,ak0TdVmGKo4pwqdJSTLwWw,4,Apparently Prides Osteria had a rough summer a...,buF9druCkbuXLX526sGELQ,Prides Osteria,3.5,83
1,xUCX4GhBpeWxZB0l2lmt_w,5,This is as close to dining in Italy as you'll ...,buF9druCkbuXLX526sGELQ,Prides Osteria,3.5,83
2,Kgz8xohZN1r2e-qN5GQ2pQ,5,A gem tucked away in Beverly. The atmosphere i...,buF9druCkbuXLX526sGELQ,Prides Osteria,3.5,83
3,jpiv6qQOv6FD8NbLfLFxTA,5,"The food is excellent, although the wait for a...",buF9druCkbuXLX526sGELQ,Prides Osteria,3.5,83
4,7NuAk35FPwEZ1HwVIV5vsw,2,Meh. Just wasn't comfortable with the minimali...,buF9druCkbuXLX526sGELQ,Prides Osteria,3.5,83


In [14]:
#Save this review DF
review_df.to_feather('formatted_reviews.ftr')

In [15]:
second_df = pd.read_feather('formatted_reviews.ftr')

In [16]:
#Label those reviews that are 3 or more stars less than the restaurant's average
second_df['label'] = second_df.apply(lambda x: 1 if x['stars_y'] - x['stars_x'] >= 3 else 0,  axis = 1)
second_df.head()
second_df.shape

(23826, 8)

In [17]:
second_df.tail()

Unnamed: 0,user_id,stars_x,text,business_id,name,stars_y,review_count,label
23821,i0jnz-ODkqqqmcxU-166nw,5,I went here today at 6pm not expecting many pi...,06T7cI-8vpENSHmPfm3H8w,The Pie Hole,4.0,109,0
23822,4AFrxm9c1j20ybQ9-stx6w,1,"I really, really, really wanted to like this p...",06T7cI-8vpENSHmPfm3H8w,The Pie Hole,4.0,109,1
23823,0YCQvW-XjgHiV_nNymlC7A,3,Bought a 9 inch raspberry cream pie.\n\nIt was...,06T7cI-8vpENSHmPfm3H8w,The Pie Hole,4.0,109,0
23824,V2OZsbSi9zgTAKpjY5bXSQ,2,"Being a Burnaby resident, I was so excited to ...",06T7cI-8vpENSHmPfm3H8w,The Pie Hole,4.0,109,0
23825,jIZNiaRnLYCCqn-PA2b7lA,5,I have been eating these pies for about 2 year...,06T7cI-8vpENSHmPfm3H8w,The Pie Hole,4.0,109,0


In [18]:
second_df['label'].value_counts()

0    23134
1      692
Name: label, dtype: int64