# Preprocessing the Yelp dataset

In [72]:
import os

import pandas as pd

In [81]:
# Directory holding the Yelp dataset dump. Set the YELP_DATA_DIR environment
# variable to point at your own copy; the fallback is the location this
# notebook was originally written against.
yelp_data_dir = os.environ.get(
    'YELP_DATA_DIR', '/Users/zhiyuwang/Desktop/Desktop/tmp/yelp_dataset'
)
business_json_path = os.path.join(yelp_data_dir, 'business.json')
review_json_path = os.path.join(yelp_data_dir, 'review.json')

# The file is read as line-delimited JSON (one record per line), hence lines=True.
df_b = pd.read_json(business_json_path, lines=True)

**Clean the business.json file**
* Include only businesses that are currently open
* Keep only restaurant/food businesses (category contains `Restaurants` or `Food`)

In [82]:
# Two successive filters on the business frame:
#   1. is_open == 1 keeps open businesses (1 = open, 0 = closed);
#   2. the category string must mention "Restaurants" or "Food"
#      (case-insensitive; rows without categories are dropped).
df_b = (
    df_b
    .loc[lambda frame: frame['is_open'] == 1]
    .loc[lambda frame: frame['categories'].str.contains('Restaurants|Food', case=False, na=False)]
)

In [83]:
# Show every column name of the business frame, one per line.
for column_name in list(df_b.columns):
    print(column_name)

address
attributes
business_id
categories
city
hours
is_open
latitude
longitude
name
postal_code
review_count
stars
state


**Drop some irrelevant columns**
* Since we are mainly interested in the geographic information of the restaurants, `hours`, `is_open`, and `review_count` can be dropped.
* We need to keep `business_id` so we can merge with `review.json`.

In [84]:
# Columns that are not needed for the geographic analysis.
drop_columns = ['hours', 'is_open', 'review_count']
# Note: running this cell twice raises a KeyError, because the columns are
# already gone from df_b after the first run.
df_b = df_b.drop(columns=drop_columns)

**Not all cities have enough records, so we will mainly focus on the top 4 cities with the most restaurants.**

In [85]:
# The four cities with the most remaining businesses (value_counts sorts descending).
df_b['city'].value_counts().head(4)

Toronto      6847
Las Vegas    5621
Phoenix      3604
Montréal     3453
Name: city, dtype: int64

In [86]:
# Case-insensitive substring match on the city name; rows with a missing city
# are excluded. Because this is a substring match, any city whose name merely
# contains "Las Vegas" is kept as well.
vegas_business = df_b.loc[
    df_b['city'].str.contains('Las Vegas', case=False, na=False)
]

In [87]:
# Number of review records to read per chunk.
size = 1_000_000
# With chunksize set, read_json yields DataFrames of up to `size` rows each
# instead of returning a single frame; the chunks are consumed by the loop below.
review = pd.read_json(review_json_path, lines=True, chunksize=size)

In [89]:
# Stream the review file chunk by chunk and keep only the reviews that belong
# to the selected businesses.
# NOTE: `review` is consumed by this loop; re-create it (re-run the cell that
# builds it) before running this cell again, otherwise no chunks are produced
# and pd.concat has nothing to concatenate.
chunk_list = []
# Review columns that are not needed downstream. user_id is intentionally NOT
# listed, so it stays in the merged result exactly as before.
drop_columns = ['review_id', 'funny', 'cool']
for chunk_review in review:
    chunk_review = chunk_review.drop(columns=drop_columns)
    # The business frame also has a `stars` column; rename the review rating so
    # the two do not collide in the merge.
    chunk_review = chunk_review.rename(columns={'stars': 'review_stars'})
    # Inner join: keep only reviews whose business_id appears in vegas_business.
    chunk_merged = pd.merge(vegas_business, chunk_review, on='business_id', how='inner')
    # Report against the actual chunk length; the last chunk can be shorter than `size`.
    print(f"{chunk_merged.shape[0]} out of {len(chunk_review):,} related reviews")
    chunk_list.append(chunk_merged)

# Each merged chunk is indexed from 0, so renumber rows to get a unique index.
df = pd.concat(chunk_list, join='outer', axis=0, ignore_index=True)

173120 out of 1,000,000 related reviews
176413 out of 1,000,000 related reviews
174232 out of 1,000,000 related reviews
155655 out of 1,000,000 related reviews
180634 out of 1,000,000 related reviews
167825 out of 1,000,000 related reviews
124741 out of 1,000,000 related reviews


In [90]:
# Persist the merged business/review table to the working directory.
# index=False keeps the DataFrame index out of the file.
csv_name = "yelp_reviews_food_categories.csv"
df.to_csv(path_or_buf=csv_name, index=False)