#### Data from [Yelp Dataset JSON](https://www.yelp.com/dataset)

In [1]:
# Import dependencies
import pandas as pd
import json

# Import functions
%run -i functions.py


In [2]:
# Create review dataframe
# The file is newline-delimited JSON: each line is parsed as one JSON object.
# The context manager closes the file even if a line fails to parse.
with open('../../Data/archive/yelp_academic_dataset_review.json', encoding = 'utf8') as data_file:
    data = [json.loads(line) for line in data_file]

# Build the dataframe from the parsed records in a single step
review_df = pd.DataFrame(data)

In [3]:
# Create business dataframe
# The file is newline-delimited JSON: each line is parsed as one JSON object.
# The context manager closes the file even if a line fails to parse.
with open('../../Data/archive/yelp_academic_dataset_business.json', encoding='utf8') as data_file:
    data = [json.loads(line) for line in data_file]

# Build the dataframe from the parsed records in a single step
business_df = pd.DataFrame(data)

In [4]:
# review_df.head()
# business_df.head()

In [5]:
# review_df.columns
# business_df.columns

#### Transform Review Dataframe

In [6]:
# Keep only reviews dated on or after midnight, 1 June 2019.
# NOTE(review): assumes 'date' holds 'YYYY-MM-DD HH:MM:SS' strings, so that
# string comparison matches chronological order - confirm.
review_df = review_df.loc[review_df['date'].ge('2019-06-01 00:00:00')]

In [7]:
# Order reviews from oldest to newest
review_df = review_df.sort_values('date', ascending = True)

In [8]:
# Preview the earliest remaining reviews
review_df.head(n = 5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
5220767,YNt_oiBLUSbmVgrEzgVhqQ,m6pbhQsplRN__dWGfSDiuw,wP2ok4O0GsR9td7Iiv1zKA,5.0,0,0,0,My friend and I were searching for a quick and...,2019-06-01 00:00:27
1878123,_AnR2n91AwsB0XPlkVNuFQ,qDmEz_StNWi9ZF17h3olRg,VQcCL9PiNL_wkGf-uF3fjg,5.0,0,0,0,This place was awesome! From the wait staff to...,2019-06-01 00:00:37
4166158,pO6s7ZVlCAGGRBQwPkTD5Q,vzymudSlj6Gpk_jjHrz1gA,DpgizymrlpkAc-dAnFBWFQ,5.0,1,0,2,I never write a review but wanted to make sure...,2019-06-01 00:00:40
3201198,l2Gx9_IttTrkIH370QdjEQ,YSW26aHwfMTy2KscDq-ODQ,gP_oWJykA2RocIs_GurKWQ,5.0,0,0,0,This place is so good. We had never been there...,2019-06-01 00:00:44
3896993,rq7PtoOMU2ELcsJtYLmjDw,9gZ4R3EHF__2S9gSRHdr3A,sKDqswbR_bwxLueSOHqqNA,1.0,0,0,0,Horrible service. We ordered at 6 pm through t...,2019-06-01 00:01:00


In [9]:
# Keep only reviews rated below 3 stars (3-star reviews are dropped as well).
# reset_index() keeps the previous row labels in a new 'index' column.
bad_review_df = review_df.loc[review_df['stars'].lt(3.0)].reset_index(drop = False)

In [10]:
# Drop unnecessary columns
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
bad_review_df = bad_review_df.drop(
    columns = ['review_id', 'user_id', 'useful', 'funny', 'cool'],
    errors = 'ignore',
)

In [11]:
# Spot-check one row by position; the list indexer returns a one-row DataFrame
bad_review_df.iloc[[10000]]

Unnamed: 0,index,business_id,stars,text,date
10000,3240710,4ppN9-rsEyh-nkbDISeJcg,1.0,Everything went great with the oil change up u...,2019-06-15 20:30:02


In [12]:
# Take only the review text column
reviews = bad_review_df.loc[:, 'text']


In [13]:
# Read one review in full (row label 12)
reviews.loc[12]

'Their food is trash everything is bad. No fresh food. Last time I had made me sick. Will never go there and will not recommend anyone to eat there.'

#### Transform Business Data

In [14]:
# Drop unnecessary columns
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
business_df = business_df.drop(
    columns = ['stars', 'review_count', 'is_open', 'attributes', 'hours'],
    errors = 'ignore',
)

In [15]:
# Preview the remaining business columns
business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,categories
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,"Doctors, Traditional Chinese Medicine, Naturop..."
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,"Shipping Centers, Local Services, Notaries, Ma..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,"Department Stores, Shopping, Fashion, Home & G..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,"Brewpubs, Breweries, Food"


In [16]:
# Replace the categories column with its first comma-separated entry.
# Splitting once (n = 1) and taking element 0 gives the same first entry as
# expand = True, without building a temporary DataFrame with one column per category.
business_df['categories'] = business_df['categories'].str.split(',', n = 1).str[0]

In [17]:
# Filter out rows with less than 5 category instances
# business_df = business_df.groupby('categories').filter(lambda x : len(x)>4)

In [22]:
# Keep only businesses whose category is 'Restaurants' or 'Food'
# Note: this probably filters out too many businesses, but it works for
# an initial pass through
business_df = business_df[business_df['categories'].isin(['Restaurants', 'Food'])]

In [23]:
# Count how many businesses remain in each category after filtering
business_df.loc[:, 'categories'].value_counts()

Restaurants    15290
Food            6783
Name: categories, dtype: int64

In [24]:
# Preview the filtered businesses
business_df.head(n = 5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,categories
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,Restaurants
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,Food
23,9OG5YkX1g2GReZM0AskizA,Romano's Macaroni Grill,5505 S Virginia St,Reno,NV,89502,39.476117,-119.789339,Restaurants
27,tMkwHmWFUEXrC9ZduonpTg,The Green Pheasant,215 1st Ave S,Nashville,TN,37201,36.159886,-86.773197,Restaurants
36,2xVsWBNFwZOxIOdd9Mwnww,Cheeseburger In Paradise,116 N Pottstown Pike,Exton,PA,19341,40.029962,-75.630607,Restaurants


In [None]:
# Connect businesses to reviews through business_id