## Importing Libraries

In [127]:
import json
import pandas as pd
import time
%matplotlib inline
import matplotlib.pyplot as plt

## Business Dataset

In [115]:
businesses = pd.read_json('business.json', lines = True)
restaurants = businesses.drop(businesses[businesses.categories.str.contains('Restaurants') == False].index) # 59853 Restaurants
restaurants = restaurants[['business_id', 'name', 'city', 'state', 'review_count', 'stars']]
restaurants.to_csv('restaurants.csv', index = False)

## Review Dataset

Exporting full dataset to csv (estimated time: ~ 6 hours)

In [109]:
# reviews: 'review_id', 'user_id', 'business_id', 'stars', 'date'
header_df = pd.DataFrame(columns=['review_id', 'user_id', 'business_id', 'stars', 'date'])
header_df.to_csv('restaurant_reviews.csv')
restaurant_business_ids = list(restaurants['business_id'])

chunk_size = 100000
num_checked = 0
num_reviews = 0
i = 0
chunk_dict = {}

with open('review.json', 'r', errors = 'ignore') as file:
    for line in file:
        if num_checked % chunk_size == 0:
            chunk_df = pd.DataFrame.from_dict(chunk_dict, orient = 'index')
            chunk_df.to_csv('restaurant_reviews.csv', mode = 'a', header = False, index = False)
            chunk_dict = {}
            chunk_df.drop(chunk_df.index, inplace=True)
            i = 0
            print(str(num_checked / 66859)[:4] + '% done, ' + str(num_reviews) + ' reviews in dataframe at ' + time.ctime())
        review = json.loads(line)
        num_checked += 1
        if review['business_id'] in restaurant_business_ids:
            i += 1
            num_reviews += 1
            chunk_dict[i] = {'review_id': review['review_id'], 
                             'user_id': review['user_id'],
                             'business_id': review['business_id'],
                             'stars': review['stars'],
                             'date': review['date']}

0.0% done, 0 reviews in dataframe at Tue Feb 12 02:21:05 2019
1.49% done, 63693 reviews in dataframe at Tue Feb 12 02:24:25 2019
2.99% done, 127232 reviews in dataframe at Tue Feb 12 02:27:25 2019
4.48% done, 190826 reviews in dataframe at Tue Feb 12 02:30:23 2019
5.98% done, 254308 reviews in dataframe at Tue Feb 12 02:33:32 2019
7.47% done, 317776 reviews in dataframe at Tue Feb 12 02:37:44 2019
8.97% done, 379203 reviews in dataframe at Tue Feb 12 02:41:46 2019
10.4% done, 440632 reviews in dataframe at Tue Feb 12 02:46:01 2019
11.9% done, 504964 reviews in dataframe at Tue Feb 12 02:49:22 2019
13.4% done, 569422 reviews in dataframe at Tue Feb 12 02:52:41 2019
14.9% done, 633903 reviews in dataframe at Tue Feb 12 02:55:56 2019
16.4% done, 698307 reviews in dataframe at Tue Feb 12 02:59:12 2019
17.9% done, 762679 reviews in dataframe at Tue Feb 12 03:02:30 2019
19.4% done, 823016 reviews in dataframe at Tue Feb 12 03:06:07 2019
20.9% done, 884756 reviews in dataframe at Tue Feb 12 0

Taking sample of restaurants and getting all the respective reviews

In [98]:
restaurant_business_ids = list(restaurants['business_id'])[:100]
reviews_df = pd.DataFrame(columns = ['review_id', 'user_id', 'business_id', 'stars', 'date'])
num_checked = 0
with open('review.json', 'r', errors = 'ignore') as file:
    for line in file:
        review = json.loads(line)
        num_checked += 1
        if num_checked % 10000 == 0:
            print(str(num_checked / 66859)[:4] + '% done, ' + str(len(reviews_df.index)) + ' reviews in dataframe')
        if review['business_id'] in restaurant_business_ids:
            reviews_df = reviews_df.append({'review_id': review['review_id'], 
                                            'user_id': review['user_id'],
                                            'business_id': review['business_id'],
                                            'stars': review['stars'],
                                            'date': review['date']}, ignore_index = True)

0.14% done, 123 reviews in dataframe
0.29% done, 235 reviews in dataframe
0.44% done, 344 reviews in dataframe
0.59% done, 449 reviews in dataframe
0.74% done, 578 reviews in dataframe
0.89% done, 672 reviews in dataframe
1.04% done, 779 reviews in dataframe
1.19% done, 888 reviews in dataframe
1.34% done, 1013 reviews in dataframe
1.49% done, 1118 reviews in dataframe
1.64% done, 1220 reviews in dataframe
1.79% done, 1335 reviews in dataframe
1.94% done, 1436 reviews in dataframe
2.09% done, 1548 reviews in dataframe
2.24% done, 1645 reviews in dataframe
2.39% done, 1747 reviews in dataframe
2.54% done, 1846 reviews in dataframe
2.69% done, 1966 reviews in dataframe
2.84% done, 2063 reviews in dataframe
2.99% done, 2187 reviews in dataframe
3.14% done, 2285 reviews in dataframe
3.29% done, 2397 reviews in dataframe
3.44% done, 2486 reviews in dataframe
3.58% done, 2597 reviews in dataframe
3.73% done, 2739 reviews in dataframe
3.88% done, 2843 reviews in dataframe
4.03% done, 2957 rev

## User Dataset

In [113]:
# users: 'user_id', 'review_count', 'elite', 'yelping_since', 'average_stars'

users_df = pd.DataFrame(columns = ['user_id', 'review_count', 'elite', 'yelping_since', 'avergae_stars'])
users_df.to_csv('users.csv')

chunk_size = 100000
num_users = 0
i = 0
chunk_dict = {}

with open('user.json', 'r', errors = 'ignore') as file:
    for line in file:
        if num_users % chunk_size == 0:
            chunk_df = pd.DataFrame.from_dict(chunk_dict, orient = 'index')
            chunk_df.to_csv('users.csv', mode = 'a', header = False, index = False)
            chunk_dict = {}
            chunk_df.drop(chunk_df.index, inplace=True)
            i = 0
            print(str(num_users / 66859)[:4] + '% done, ' + str(num_users) + ' users in dataframe at ' + time.ctime())
        user = json.loads(line)
        i += 1
        num_users += 1
        chunk_dict[i] = {'user_id': user['user_id'], 
                         'review_count': user['review_count'],
                         'elite': user['elite'],
                         'yelping_since': user['yelping_since'],
                         'average_stars': user['average_stars']}

0.0% done, 0 users in dataframe at Tue Feb 12 14:40:53 2019
1.49% done, 100000 users in dataframe at Tue Feb 12 14:40:59 2019
2.99% done, 200000 users in dataframe at Tue Feb 12 14:41:05 2019
4.48% done, 300000 users in dataframe at Tue Feb 12 14:41:10 2019
5.98% done, 400000 users in dataframe at Tue Feb 12 14:41:17 2019
7.47% done, 500000 users in dataframe at Tue Feb 12 14:41:21 2019
8.97% done, 600000 users in dataframe at Tue Feb 12 14:41:26 2019
10.4% done, 700000 users in dataframe at Tue Feb 12 14:41:30 2019
11.9% done, 800000 users in dataframe at Tue Feb 12 14:41:35 2019
13.4% done, 900000 users in dataframe at Tue Feb 12 14:41:40 2019
14.9% done, 1000000 users in dataframe at Tue Feb 12 14:41:44 2019
16.4% done, 1100000 users in dataframe at Tue Feb 12 14:41:48 2019
17.9% done, 1200000 users in dataframe at Tue Feb 12 14:41:54 2019
19.4% done, 1300000 users in dataframe at Tue Feb 12 14:41:58 2019
20.9% done, 1400000 users in dataframe at Tue Feb 12 14:42:03 2019
22.4% done,

# Analysis

In [135]:
sample_restaurants = restaurants.sample(100)

In [147]:
sample_restaurant_business_ids = list(sample_restaurants['business_id'])
sample_reviews_df = pd.DataFrame(columns = ['review_id', 'user_id', 'business_id', 'stars', 'date'])
num_checked = 0


reviews = pd.read_csv('restaurant_reviews.csv')
for index, review in reviews.iterrows():
    if index % 100000 == 0:
        print(str(index / 41538)[:4] + '% done, ' + str(len(sample_reviews_df.index)) + ' reviews in dataframe at ' + time.ctime())
    if review.business_id in sample_restaurant_business_ids:
        sample_reviews_df = sample_reviews_df.append(review)

0.0% done, 0 reviews in dataframe at Tue Feb 12 15:20:22 2019
2.40% done, 34 reviews in dataframe at Tue Feb 12 15:20:33 2019
4.81% done, 60 reviews in dataframe at Tue Feb 12 15:20:43 2019
7.22% done, 91 reviews in dataframe at Tue Feb 12 15:20:54 2019
9.62% done, 126 reviews in dataframe at Tue Feb 12 15:21:04 2019
12.0% done, 157 reviews in dataframe at Tue Feb 12 15:21:17 2019
14.4% done, 193 reviews in dataframe at Tue Feb 12 15:21:35 2019
16.8% done, 222 reviews in dataframe at Tue Feb 12 15:21:48 2019
19.2% done, 267 reviews in dataframe at Tue Feb 12 15:22:01 2019
21.6% done, 352 reviews in dataframe at Tue Feb 12 15:22:13 2019
24.0% done, 481 reviews in dataframe at Tue Feb 12 15:22:26 2019


In [None]:
sample_reviews_df