## Importing Libraries

In [47]:
import json
import ijson
import pandas as pd
from bisect import bisect_left
import re

## Business Dataset

In [84]:
# Load the Yelp business dataset; the file holds one JSON object per line.
businesses = pd.read_json('business.json', lines=True)

# Keep only businesses whose category string mentions 'Restaurants'.
# na=False makes a missing `categories` value count as "not a restaurant".
# The previous `str.contains(...) == False` test evaluated to False for missing
# values, so such rows were never dropped and leaked into the result.
# NOTE(review): the notebook previously reported 59853 restaurants; that figure
# may shrink now that rows without categories are excluded -- re-check.
is_restaurant = businesses['categories'].str.contains('Restaurants', na=False)
restaurants = businesses.loc[
    is_restaurant,
    ['business_id', 'name', 'city', 'state', 'review_count', 'stars'],
]
restaurants.to_csv('restaurants.csv', index=False)

## Review Dataset

Helper functions that speed up finding the reviews that belong to restaurants

In [85]:
def encode_business_id(string_id):
    """Encode a business ID as a string made only of decimal digits.

    Each character is replaced by its Unicode code point, zero-padded to seven
    digits (the width of the largest code point, 1114111). Because every
    character occupies the same width, the encoding is unambiguous: two
    different IDs can never produce the same result, and encoded strings sort
    in the same order as the original IDs.

    The earlier scheme copied digits verbatim and wrote other characters as
    variable-width code points, so distinct IDs could collide (for example
    '1d' and 'n0' both became '1100'), which could match a review to the
    wrong business.

    Args:
        string_id: The business ID to encode.

    Returns:
        The encoded ID; an empty string for an empty input.
    """
    return ''.join(format(ord(char), '07d') for char in string_id)

# Encoded IDs of every restaurant, kept in ascending order so that
# binary_search can be used for membership tests.
restaurant_business_ids = sorted(map(encode_business_id, restaurants['business_id']))

def binary_search(sorted_list, target):
    """Locate `target` in an ascending-sorted list.

    Args:
        sorted_list: A list already sorted in ascending order.
        target: The value to look for.

    Returns:
        The index of the leftmost occurrence of `target`, or None when the
        value is not present.
    """
    position = bisect_left(sorted_list, target)
    # bisect_left yields the insertion point; it is a hit only when that slot
    # exists and actually holds the target.
    is_hit = position < len(sorted_list) and sorted_list[position] == target
    return position if is_hit else None

Export the reviews of all restaurants in the full dataset to CSV (estimated time: ~14 hours)

In [None]:
# reviews: 'review_id', 'user_id', 'business_id', 'stars', 'date'
review_columns = ['review_id', 'user_id', 'business_id', 'stars', 'date']
chunk_size = 10000
total_review_lines = 6685900  # presumably the number of reviews in review.json; only used for progress output

# Exact, constant-time membership test on the raw business IDs.
restaurant_id_set = set(restaurants['business_id'])


def write_review_chunk(rows):
    """Append the buffered review rows to restaurant_reviews.csv.

    The file layout is unchanged from earlier runs: no header row, and the
    first column is the row's position within its chunk.
    NOTE(review): because the file is opened in append mode, re-running this
    cell adds the rows again; delete the file first for a clean export.
    """
    pd.DataFrame(rows, columns=review_columns).to_csv(
        'restaurant_reviews.csv', mode='a', header=False)


# Rows are buffered in a plain list; growing a DataFrame one row at a time
# copies the whole frame on each step, and DataFrame.append no longer exists
# in pandas 2.x.
chunk_rows = []
num_checked = 0  # lines of review.json read so far (not just the matches)
with open('review.json', 'r', errors = 'ignore') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline
        num_checked += 1
        review = json.loads(line)
        if review['business_id'] not in restaurant_id_set:
            continue
        chunk_rows.append({column: review[column] for column in review_columns})
        if len(chunk_rows) >= chunk_size:
            write_review_chunk(chunk_rows)
            chunk_rows = []
            # Fraction of the input file processed so far.
            print(str(num_checked / total_review_lines))

# Write whatever is left after the last full chunk; previously these rows
# were silently dropped when the loop ended.
if chunk_rows:
    write_review_chunk(chunk_rows)

In [None]:
# Trial run: collect, in memory, the reviews of the first 1000 restaurants only.
# NOTE: this rebinds `restaurant_business_ids` to raw (unencoded) IDs; any code
# that expects the sorted, encoded list has to rebuild it first.
restaurant_business_ids = list(restaurants['business_id'])[:1000]
sample_id_set = set(restaurant_business_ids)  # hashed copy: avoids scanning the list for every review
review_columns = ['review_id', 'user_id', 'business_id', 'stars', 'date']

# Matching rows are gathered in a list and turned into a DataFrame once at the
# end; appending to a DataFrame row by row copies the frame every time, and
# DataFrame.append no longer exists in pandas 2.x.
review_rows = []
num_checked = 0
with open('review.json', 'r', errors = 'ignore') as file:
    for line in file:
        review = json.loads(line)
        num_checked += 1
        if num_checked % 10000 == 0:
            # Dividing by 66859 (presumably total reviews / 100) yields a percentage.
            print(str(num_checked / 66859)[:4] + '% done, ' + str(len(review_rows)) + ' reviews in dataframe')
        if review['business_id'] in sample_id_set:
            review_rows.append({column: review[column] for column in review_columns})

reviews_df = pd.DataFrame(review_rows, columns = review_columns)

0.14% done, 1146 reviews in dataframe
0.29% done, 2261 reviews in dataframe
0.44% done, 3386 reviews in dataframe
0.59% done, 4517 reviews in dataframe
0.74% done, 5632 reviews in dataframe
0.89% done, 6707 reviews in dataframe
1.04% done, 7814 reviews in dataframe
1.19% done, 8971 reviews in dataframe
1.34% done, 10070 reviews in dataframe
1.49% done, 11218 reviews in dataframe
1.64% done, 12346 reviews in dataframe
1.79% done, 13450 reviews in dataframe
1.94% done, 14502 reviews in dataframe
2.09% done, 15623 reviews in dataframe
2.24% done, 16726 reviews in dataframe
2.39% done, 17822 reviews in dataframe
2.54% done, 18944 reviews in dataframe
2.69% done, 20065 reviews in dataframe
2.84% done, 21174 reviews in dataframe
2.99% done, 22322 reviews in dataframe
3.14% done, 23426 reviews in dataframe
3.29% done, 24531 reviews in dataframe
3.44% done, 25572 reviews in dataframe
3.58% done, 26636 reviews in dataframe
3.73% done, 27755 reviews in dataframe
3.88% done, 28856 reviews in data

In [None]:
# users: 'user_id', 'review_count', 'elite', 'yelping_since', 'average_stars'