# Yelp Data

Based on the tutorial "Converting Yelp Dataset to CSV using Pandas": https://towardsdatascience.com/converting-yelp-dataset-to-csv-using-pandas-2a4c8f03bd88

In [None]:
import numpy as np
import pandas as pd
import json

In [None]:
# Input locations of the Yelp academic dataset dumps; both files are read
# below as line-delimited JSON (one record per line).
review_json_path = 'data/yelp_academic_dataset_review.json'
business_json_path = './data/yelp_academic_dataset_business.json'

In [None]:
# Load the business file into a DataFrame, parsing one JSON record per line.
df_b = pd.read_json(path_or_buf=business_json_path, lines=True)

In [None]:
# Display the (rows, columns) shape of the business table.
df_b.shape

In [None]:
# Preview the first rows of the business table.
df_b.head()

In [None]:
# Only the first 100,000 reviews are used in this assignment; the rest of the
# review file is never read.
from itertools import islice

# Decode as UTF-8 explicitly so the review text is read the same way on every
# platform instead of depending on the locale's default encoding.
with open(review_json_path, encoding='utf-8') as fl:
    # NOTE: despite its name, `users` holds one parsed review record per line
    # of the review file.
    users = [json.loads(line) for line in islice(fl, 100000)]
df_rev = pd.DataFrame(users)

In [None]:
# Display the (rows, columns) shape of the sampled review table.
df_rev.shape

In [None]:
# Preview the first rows of the sampled review table.
df_rev.head()

In [None]:
# Inner-join businesses with the sampled reviews on business_id; rows without
# a match on the other side are dropped. Any other column name present in both
# frames receives pandas' default _x/_y suffixes.
df_merged = df_b.merge(df_rev, how='inner', on='business_id')

In [None]:
# Display the (rows, columns) shape of the merged business/review table.
df_merged.shape

In [None]:
# Preview the first rows of the merged business/review table.
df_merged.head()

In [None]:
# Persist the merged table as CSV in the working directory. The DataFrame
# index is written as the first column because `index` is left at its default.
df_merged.to_csv(path_or_buf='yelp.csv')

In [None]:
# Keep only businesses flagged as open (is_open: 1 = open, 0 = closed).
df_b = df_b.loc[df_b['is_open'].eq(1)]

In [None]:
# Remove the listed columns from the business table.
drop_columns = ['hours','is_open','review_count']
df_b = df_b.drop(columns=drop_columns)

In [None]:
# Select businesses whose categories mention any RV-related term.
# The pattern is a regex alternation matched case-insensitively; rows with
# missing categories count as non-matches (na=False).
rv_pattern = 'RV Repair|RV Dealers|RV Rental|RV Parks|Campgrounds'
is_rv_business = df_b['categories'].str.contains(rv_pattern, case=False, na=False)
business_RV = df_b[is_rv_business]

In [None]:
# Split each categories string on ', ' into a list, then explode so that every
# individual category of a business gets its own row.
category_lists = df_b['categories'].str.split(', ')
df_explode = df_b.assign(categories=category_lists).explode('categories')

In [None]:
# Show the ten most frequent individual categories.
df_explode['categories'].value_counts().head(10)

In [None]:
# Show how often each category containing the substring "RV" occurs
# (case-sensitive match; missing categories are treated as non-matches).
has_rv = df_explode['categories'].str.contains('RV', case=True, na=False)
df_explode[has_rv]['categories'].value_counts()

In [None]:
# Number of review rows requested per chunk.
size = 1_000_000

# Explicit dtypes requested for the review columns.
review_dtypes = {
    'review_id': str,
    'user_id': str,
    'business_id': str,
    'stars': int,
    'date': str,
    'text': str,
    'useful': int,
    'funny': int,
    'cool': int,
}

# With chunksize set, read_json returns an iterable reader that yields the
# file piece by piece instead of loading it all at once.
review = pd.read_json(
    review_json_path,
    lines=True,
    dtype=review_dtypes,
    chunksize=size,
)

In [None]:
# The review reader yields the file in several chunks; each chunk is trimmed
# and filtered as it arrives so only the relevant rows are kept.
chunk_list = []
for chunk_review in review:
    # Drop columns that aren't needed
    chunk_review = chunk_review.drop(['review_id','useful','funny','cool'], axis=1)
    # Rename the review rating to avoid a conflict with the business's overall star rating
    chunk_review = chunk_review.rename(columns={'stars': 'review_stars'})
    # Inner merge with the RV business subset so only reviews of those businesses remain
    chunk_merged = pd.merge(business_RV, chunk_review, on='business_id', how='inner')
    # Report progress against the number of reviews actually in this chunk:
    # a chunk (typically the last one) may hold fewer rows than the configured
    # chunk size, so the configured size would overstate the denominator.
    print(f"{chunk_merged.shape[0]} out of {len(chunk_review):,} related reviews")
    chunk_list.append(chunk_merged)
# After trimming down the review file, concatenate all relevant data back into one dataframe
df = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

In [None]:
# Display the shape of df_rev (the 100,000-review sample loaded earlier).
# NOTE(review): this follows the cell that builds `df` from the chunked merge,
# so `df.shape` may have been intended here — confirm.
df_rev.shape