# Capstone III - Loading Yelp Datasets

In [1]:
import pandas as pd

# Load Data

### Load Yelp Datasets from JSON into pandas DataFrames

#### Load Businesses dataset

In [20]:
# Location of the Yelp business records; the file is read as line-delimited JSON
business_json_path = 'yelp_dataset/yelp_academic_dataset_business.json'

# Parse each line of the file as one JSON record into a single DataFrame
df_bsn = pd.read_json(path_or_buf=business_json_path, lines=True)

#### Load Review dataset

In [26]:
# Location of the Yelp review records (read later as line-delimited JSON)
review_json_path = 'yelp_dataset/yelp_academic_dataset_review.json'

In [27]:
# Number of review lines to read per chunk (1,000,000)
size = 1_000_000

# Column data types for the review records, following the Yelp documentation
review_dtypes = {
    'review_id': str,
    'user_id': str,
    'business_id': str,
    'stars': int,
    'date': str,
    'text': str,
    'useful': int,
    'funny': int,
    'cool': int,
}

# With `chunksize` set, read_json returns a reader that yields the file in
# chunks of `size` lines rather than loading it as one DataFrame
review = pd.read_json(
    review_json_path,
    lines=True,
    dtype=review_dtypes,
    chunksize=size,
)

# Convert Data

In [21]:
# Show the first five businesses, transposed so each column reads as a row
df_bsn.head(5).transpose()

Unnamed: 0,0,1,2,3,4
business_id,6iYb2HFDywm3zjuRg0shjw,tCbdrRPZA0oiIYSmHG3J0w,bvN78flM8NLprQ1a1y5dRg,oaepsyvc0J17qwi8cfrOWg,PE9uqAjdw0E4-8mjGl3wVA
name,Oskar Blues Taproom,Flying Elephants at PDX,The Reclaimory,Great Clips,Crossfit Terminus
address,921 Pearl St,7000 NE Airport Way,4720 Hawthorne Ave,2566 Enterprise Rd,1046 Memorial Dr SE
city,Boulder,Portland,Portland,Orange City,Atlanta
state,CO,OR,OR,FL,GA
postal_code,80302,97218,97214,32763,30316
latitude,40.0175,45.5889,45.5119,28.9145,33.747
longitude,-105.283,-122.593,-122.614,-81.296,-84.3534
stars,4,4,4.5,3,4
review_count,86,126,13,8,14


In [25]:
# Summarize the business frame: column dtypes, non-null counts and memory usage
df_bsn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160585 entries, 0 to 160584
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   160585 non-null  object 
 1   name          160585 non-null  object 
 2   address       160585 non-null  object 
 3   city          160585 non-null  object 
 4   state         160585 non-null  object 
 5   postal_code   160585 non-null  object 
 6   latitude      160585 non-null  float64
 7   longitude     160585 non-null  float64
 8   stars         160585 non-null  float64
 9   review_count  160585 non-null  int64  
 10  is_open       160585 non-null  int64  
 11  attributes    145593 non-null  object 
 12  categories    160470 non-null  object 
 13  hours         133244 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 17.2+ MB


In [22]:
# Flag businesses whose category string mentions 'Restaurants';
# rows with a missing `categories` value count as non-matches (na=False)
is_restaurant = df_bsn['categories'].str.contains('Restaurants', na=False)

# Keep only the restaurant rows (original index labels are preserved)
bsn_rstnt = df_bsn.loc[is_restaurant]

In [23]:
# Summarize the restaurants-only frame to confirm how many rows the filter kept
bsn_rstnt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50763 entries, 0 to 160584
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   50763 non-null  object 
 1   name          50763 non-null  object 
 2   address       50763 non-null  object 
 3   city          50763 non-null  object 
 4   state         50763 non-null  object 
 5   postal_code   50763 non-null  object 
 6   latitude      50763 non-null  float64
 7   longitude     50763 non-null  float64
 8   stars         50763 non-null  float64
 9   review_count  50763 non-null  int64  
 10  is_open       50763 non-null  int64  
 11  attributes    50314 non-null  object 
 12  categories    50763 non-null  object 
 13  hours         42875 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 5.8+ MB


### Merge Review.json and Business.json files

In [30]:
# Merge every chunk of reviews with the restaurants-only business frame and
# collect the per-chunk results. `review` is the chunked reader built from the
# review JSON file and `bsn_rstnt` is the filtered business DataFrame.
# NOTE(review): the chunked reader is presumably single-pass — re-running this
# cell without re-creating `review` would yield no chunks (TODO confirm).
chunk_list = []

for chunk_review in review:

    # Drop columns that aren't needed, and rename the review's star rating so
    # it does not collide with the business's overall `stars` column
    chunk_review = (
        chunk_review
        .drop(columns=['review_id', 'useful'])
        .rename(columns={'stars': 'review_stars'})
    )

    # Inner merge so only reviews that belong to a restaurant business remain
    chunk_merged = pd.merge(bsn_rstnt, chunk_review, on='business_id', how='inner')

    # Report progress against the number of reviews actually read in this
    # chunk; the final chunk can be shorter than `size`, so `size` would
    # overstate the denominator
    print(f"{chunk_merged.shape[0]} out of {len(chunk_review):,} related reviews")
    chunk_list.append(chunk_merged)

# Fail with an explicit message rather than pandas' generic concat error when
# the reader produced no chunks at all
if not chunk_list:
    raise ValueError(
        "No review chunks were read; re-create the `review` reader and run this cell again"
    )

# After trimming down the review file, concatenate all relevant data back to one dataframe
df = pd.concat(chunk_list, ignore_index=True)

644786 out of 1,000,000 related reviews
651022 out of 1,000,000 related reviews
643531 out of 1,000,000 related reviews
631384 out of 1,000,000 related reviews
642933 out of 1,000,000 related reviews
649318 out of 1,000,000 related reviews
651717 out of 1,000,000 related reviews
406437 out of 1,000,000 related reviews


### Convert the merged dataset into a CSV file

In [31]:
# Output file for the merged restaurant reviews
csv_name = "yelp_Restaurant_reviews.csv"

# Write the merged frame to disk without the row index column
df.to_csv(path_or_buf=csv_name, index=False)