In [1]:
import json
from itertools import islice

import pandas as pd

In [2]:
# Path to the complete reviews dump, exactly as it was received from Yelp
all_reviews_file = "../raw-data/yelp_academic_dataset_review.json"

# Path to a small sample holding only the first 10 lines of the raw reviews
# file; used for quick testing
first_10_file = "../preprocessed-data/first_10_reviews.json"

In [3]:
def get_full_df(json_file_name, n_rows=None):
    """ Returns a pandas.DataFrame with all data from the first <n_rows> rows of
    <json_file_name> if <n_rows> is defined, or all data from the whole file if
    <n_rows> is not defined.

    json_file_name: path to a file holding one JSON object per line
    n_rows: maximum number of lines to read, or None to read every line. If the
        file holds fewer than <n_rows> lines, all of its lines are returned
        instead of raising StopIteration.

    Returns a pandas.DataFrame with one row per line read.
    Raises ValueError (from json.loads) if a line that is read is not valid JSON. """
    # A negative count yields an empty frame, exactly as range(n_rows) used to
    limit = None if n_rows is None else max(0, n_rows)
    with open(json_file_name, 'r') as f:
        # islice stops after <limit> lines or at end-of-file, whichever comes
        # first; a limit of None means "iterate over the whole file"
        df = pd.DataFrame([json.loads(line) for line in islice(f, limit)])
    return df

In [4]:
def _get_dict_from_line(line):
    """ Helper function for get_text_and_stars_df() that parses one line of JSON
    holding a full review and returns a dictionary with just its 'text' and
    'stars' entries. Raises KeyError if either key is missing from the review. """
    review_dict = json.loads(line)
    return {'text': review_dict['text'], 'stars': review_dict['stars']}

def get_text_and_stars_df(json_file_name, n_rows=None):
    """ Returns a pandas.DataFrame with just the review text and number of stars
    for each review from the first <n_rows> rows of <json_file_name> if <n_rows>
    is defined, or from the whole file if <n_rows> is not defined.

    json_file_name: path to a file holding one JSON review object per line
    n_rows: maximum number of lines to read, or None to read every line. If the
        file holds fewer than <n_rows> lines, all of its lines are returned
        instead of raising StopIteration.

    Returns a pandas.DataFrame with a 'text' and a 'stars' column and one row
    per line read. """
    # A negative count yields an empty frame, exactly as range(n_rows) used to
    limit = None if n_rows is None else max(0, n_rows)
    with open(json_file_name, 'r') as f:
        # islice stops after <limit> lines or at end-of-file, whichever comes
        # first; a limit of None means "iterate over the whole file"
        df = pd.DataFrame([_get_dict_from_line(line) for line in islice(f, limit)])
    return df

# Test dataframe generators

In [5]:
# Smoke test: read every line of the 10-line sample file, keeping all fields
df = get_full_df(json_file_name=first_10_file)
df.head(n=10)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,type,useful,user_id
0,2aFiy99vNLklCx3T_tGS9A,0,2011-10-10,0,NxL8SIC5yqOdnlXCg18IBg,5,If you enjoy service by someone who is as comp...,review,0,KpkOkG6RIf4Ra25Lhhxf1A
1,2aFiy99vNLklCx3T_tGS9A,0,2010-12-29,0,pXbbIgOXvLuTi_SPs1hQEQ,5,After being on the phone with Verizon Wireless...,review,1,bQ7fQq1otn9hKX-gXRsrgA
2,2aFiy99vNLklCx3T_tGS9A,0,2011-04-29,0,wslW2Lu4NYylb1jEapAGsw,5,Great service! Corey is very service oriented....,review,0,r1NUhdNmL6yU9Bn-Yx6FTw
3,2LfIuF3_sX6uwe-IR-P0jQ,1,2014-07-14,0,GP6YEearUWrzPtQYSF1vVg,5,Highly recommended. Went in yesterday looking ...,review,0,aW3ix1KNZAvoM8q-WghA3Q
4,2LfIuF3_sX6uwe-IR-P0jQ,0,2014-01-15,0,25RlYGq2s5qShi-pn3ufVA,4,I walked in here looking for a specific piece ...,review,0,YOo-Cip8HqvKp_p9nEGphw
5,2LfIuF3_sX6uwe-IR-P0jQ,1,2013-04-28,0,Uf1Ki1yyH_JDKhLvn2e4FQ,5,What a great place! Modern on Melrose has amaz...,review,2,bgl3j8yJcRO-00NkUYsXGQ
6,2LfIuF3_sX6uwe-IR-P0jQ,0,2014-10-12,0,oFmVZh-La7SuvpHrH_Al4Q,4,A hidden gem! Found a beautiful buffet for a g...,review,0,CWKF9de-nskLYEqDDCfubg
7,2LfIuF3_sX6uwe-IR-P0jQ,0,2012-09-18,0,bRvdVt88MJ_YMTlLbjDLxQ,5,This place is a great for those vintage/mid ce...,review,2,GJ7PTY7huYORFKKg3db3Gw
8,2LfIuF3_sX6uwe-IR-P0jQ,0,2015-10-11,0,zNUSxqflZKgKD1NQH3jdFA,5,This is the place to go for all your Mid Centu...,review,0,rxqp9eXZj1jYTn0UIsm3Hg
9,2LfIuF3_sX6uwe-IR-P0jQ,0,2015-04-05,0,LkP1l7sZIwOV6IKNLqQp_A,5,"Great items at a good price. Helpful, easy to...",review,0,UU0nHQtHPMAfLidk8tOHTg


In [6]:
# Smoke test: read only the first 5 lines of the sample file, keeping all fields
df = get_full_df(first_10_file, n_rows=5)
df.head(n=10)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,type,useful,user_id
0,2aFiy99vNLklCx3T_tGS9A,0,2011-10-10,0,NxL8SIC5yqOdnlXCg18IBg,5,If you enjoy service by someone who is as comp...,review,0,KpkOkG6RIf4Ra25Lhhxf1A
1,2aFiy99vNLklCx3T_tGS9A,0,2010-12-29,0,pXbbIgOXvLuTi_SPs1hQEQ,5,After being on the phone with Verizon Wireless...,review,1,bQ7fQq1otn9hKX-gXRsrgA
2,2aFiy99vNLklCx3T_tGS9A,0,2011-04-29,0,wslW2Lu4NYylb1jEapAGsw,5,Great service! Corey is very service oriented....,review,0,r1NUhdNmL6yU9Bn-Yx6FTw
3,2LfIuF3_sX6uwe-IR-P0jQ,1,2014-07-14,0,GP6YEearUWrzPtQYSF1vVg,5,Highly recommended. Went in yesterday looking ...,review,0,aW3ix1KNZAvoM8q-WghA3Q
4,2LfIuF3_sX6uwe-IR-P0jQ,0,2014-01-15,0,25RlYGq2s5qShi-pn3ufVA,4,I walked in here looking for a specific piece ...,review,0,YOo-Cip8HqvKp_p9nEGphw


In [7]:
# Smoke test: read every line of the sample file, keeping only text and stars
df = get_text_and_stars_df(json_file_name=first_10_file)
df.head(n=10)

Unnamed: 0,stars,text
0,5,If you enjoy service by someone who is as comp...
1,5,After being on the phone with Verizon Wireless...
2,5,Great service! Corey is very service oriented....
3,5,Highly recommended. Went in yesterday looking ...
4,4,I walked in here looking for a specific piece ...
5,5,What a great place! Modern on Melrose has amaz...
6,4,A hidden gem! Found a beautiful buffet for a g...
7,5,This place is a great for those vintage/mid ce...
8,5,This is the place to go for all your Mid Centu...
9,5,"Great items at a good price. Helpful, easy to..."


In [8]:
# Smoke test: read only the first 5 lines of the sample file, keeping only text and stars
df = get_text_and_stars_df(first_10_file, n_rows=5)
df.head(n=10)

Unnamed: 0,stars,text
0,5,If you enjoy service by someone who is as comp...
1,5,After being on the phone with Verizon Wireless...
2,5,Great service! Corey is very service oriented....
3,5,Highly recommended. Went in yesterday looking ...
4,4,I walked in here looking for a specific piece ...


# Measure read-in time for different amounts of lines read in

In [9]:
# Time get_full_df() on the full reviews file for increasing numbers of lines.
# Longer list of sizes, currently disabled (the recorded results at the bottom
# of this cell include its extra n = 500000 case):
# n_vals = [10000, 50000, 100000, 500000]
n_vals = [10000, 50000, 100000]
for n in n_vals:
    print('n = {}'.format(n))
    # -r1 limits %timeit to a single repeat for each value of n
    %timeit -r1 get_full_df(all_reviews_file, n)

# Results recorded for the longer list of sizes above.
# NOTE(review): these report "best of 3", so they presumably come from an
# earlier run made without the -r1 flag -- confirm before comparing timings.
# n = 10000
# 1 loop, best of 3: 212 ms per loop
# n = 50000
# 1 loop, best of 3: 1.03 s per loop
# n = 100000
# 1 loop, best of 3: 2.1 s per loop
# n = 500000
# 1 loop, best of 3: 19.7 s per loop

n = 10000
1 loop, best of 1: 365 ms per loop
n = 50000
1 loop, best of 1: 1.11 s per loop
n = 100000
1 loop, best of 1: 2.29 s per loop


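When reading in all the data from each line, my read-in time breaks from linear scaling with $n$ between $n=100,\!000$ and $n=500,\!000$: in the earlier run recorded in the comments of the cell above, a 5x increase in rows raised the read-in time from 2.1 s to 19.7 s, roughly a 9x increase.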

In [10]:
# Time get_text_and_stars_df() on the full reviews file for increasing numbers
# of lines. Longer list of sizes, currently disabled (the recorded results at
# the bottom of this cell include its extra n = 1000000 case):
# n_vals = [10000, 50000, 100000, 500000, 1000000]
n_vals = [10000, 50000, 100000, 500000]
for n in n_vals:
    print('n = {}'.format(n))
    # -r1 limits %timeit to a single repeat for each value of n
    %timeit -r1 get_text_and_stars_df(all_reviews_file, n)

# Results recorded for the longer list of sizes above (presumably from an
# earlier run of this cell):
# n = 10000
# 1 loop, best of 1: 193 ms per loop
# n = 50000
# 1 loop, best of 1: 947 ms per loop
# n = 100000
# 1 loop, best of 1: 2.13 s per loop
# n = 500000
# 1 loop, best of 1: 9.55 s per loop
# n = 1000000
# 1 loop, best of 1: 47.4 s per loop

n = 10000
10 loops, best of 1: 191 ms per loop
n = 50000
1 loop, best of 1: 922 ms per loop
n = 100000
1 loop, best of 1: 1.94 s per loop
n = 500000
1 loop, best of 1: 9.56 s per loop


If I just store the text and stars data from each line, my read-in time breaks from linear scaling with $n$ between $n=500,\!000$ and $n=1,\!000,\!000$. I think this means I'm hitting memory limitations on my computer after 500,000 lines. Since there are 4,153,150 rows of data, memory limitations will play a major role in analyzing these data. This means we'll need to use methods that are smart about what is stored, and when, to avoid overloading memory.

In [11]:
# Load text and stars for the first 500,000 reviews, then report how much memory
# the resulting dataframe really occupies ('deep' asks pandas to inspect the
# object-dtype text column rather than estimate it)
df = get_text_and_stars_df(all_reviews_file, n_rows=500000)
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 2 columns):
stars    500000 non-null int64
text     500000 non-null object
dtypes: int64(1), object(1)
memory usage: 336.7 MB


With 500,000 rows, the dataframe is 336.7 MB. Scaling that linearly to all 4,153,150 rows, the full dataframe would be about 2.8 GB.