In [1]:
import json
import pandas as pd
import random

In [2]:
# Maps each US state / Canadian province code that we keep to its region name.
# Written region-first so that codes sharing a region are visibly grouped; the
# comprehension flattens the pairs into the {state_code: region} lookup.
US_CANADA_REGION_BY_STATE_DICT = {
    state_code: region
    for region, state_codes in (
        ('Phoenix', ('AZ',)),
        ('Las Vegas', ('NV',)),
        ('Toronto', ('ON',)),
        ('Charlotte', ('NC', 'SC')),
        ('Cleveland', ('OH',)),
        ('Pittsburgh', ('PA',)),
        ('Montreal', ('QC', 'NY', 'VT')),
        ('Madison', ('WI',)),
        ('Champaign', ('IL',)),
    )
    for state_code in state_codes
}

In [3]:
# Raw data files as received from Yelp.
# The functions in this notebook read them line by line and parse each line with json.loads.
ALL_REVIEWS_FILE    = '../raw-data/yelp_academic_dataset_review.json'
ALL_BUSINESSES_FILE = '../raw-data/yelp_academic_dataset_business.json'

# Paths for the train and test data to be generated.
# The reviews files are the outputs passed to split_reviews_90_10.
# NOTE(review): nothing in the cells visible here writes the businesses files - confirm where they are produced.
REVIEWS_TRAIN_FILE    = '../preprocessed-data/reviews_train.json'
REVIEWS_TEST_FILE     = '../preprocessed-data/reviews_test.json'
BUSINESSES_TRAIN_FILE = '../preprocessed-data/businesses_train.json'
BUSINESSES_TEST_FILE  = '../preprocessed-data/businesses_test.json'

In [4]:
def get_df(json_file_name, max_rows=None, city=None, select_keys=None, us_canada_only=True):
    """Load a line-delimited JSON file into a DataFrame indexed by 'business_id'.

    The file is read as UTF-8, one JSON object per line.

    Parameters
    ----------
    json_file_name : str
        Path to the raw data file.
    max_rows : int or None
        Maximum number of rows to return; None (default) means no limit.
        Zero or a negative value yields an empty frame.
    city : str or None
        If set, keep only rows whose 'city' value equals it (a missing 'city'
        is treated as the empty string). None (default) keeps every city.
    select_keys : iterable of str or None
        If set, keep only these keys of each row. It must include
        'business_id', because that column becomes the index.
    us_canada_only : bool
        If true (default), keep only rows whose 'state' is a key of
        US_CANADA_REGION_BY_STATE_DICT.

    Returns
    -------
    pandas.DataFrame
        The selected rows, indexed by 'business_id'. When no row is selected
        the frame is empty but still carries an index named 'business_id'.

    Raises
    ------
    KeyError
        If a row lacks 'state' while us_canada_only is true, if a kept row
        lacks one of select_keys, or if the kept rows have no 'business_id'.
    """
    records = []
    with open(json_file_name, 'r', encoding='utf-8') as f:
        for line in f:
            # Checked before parsing so that max_rows=0 really returns no rows
            if (max_rows is not None) and (len(records) >= max_rows):
                break
            row_dict = json.loads(line)
            if us_canada_only and row_dict['state'] not in US_CANADA_REGION_BY_STATE_DICT:
                continue
            if (city is not None) and (row_dict.get('city', '') != city):
                continue
            # Project onto the requested keys only for rows we actually keep
            if select_keys is not None:
                row_dict = {k: row_dict[k] for k in select_keys}
            records.append(row_dict)

    if not records:
        # A frame built from zero records has no 'business_id' column to promote,
        # so set_index would raise; build the empty, correctly indexed frame directly.
        empty_columns = [k for k in (select_keys or ()) if k != 'business_id']
        return pd.DataFrame(columns=empty_columns, index=pd.Index([], name='business_id'))

    return pd.DataFrame(records).set_index('business_id')

# Here's what we're going to do:

1. Load all the US/Canada businesses into a dataframe, so that each review's business ID can be looked up to decide whether to keep the review.
2. Loop over all the reviews data.
  - If the review is for a non-US/Canada business, skip to the next one.
  - If the review is for a US/Canada business, copy that line to the train data file with 90% chance or the test data file with 10% chance. Use a fixed random seed so the results are reproducible. 

In [5]:
def split_reviews_90_10(all_reviews_file, reviews_train_file, reviews_test_file, us_canada_business_id_list):
    """
    Split the reviews of US/Canada businesses into a train file (~90%) and a test file (~10%).

    The raw reviews file is streamed line by line (one JSON object per line). A review is kept
    only if its 'business_id' is in us_canada_business_id_list. Each kept line is copied verbatim
    to the test file with probability 1/10, and to the train file otherwise. The draw uses a
    fixed seed of 0, so the split is reproducible. A summary of the counts is printed at the end.

    Parameters
    ----------
    all_reviews_file : str
        Path to the raw reviews file (read as UTF-8).
    reviews_train_file, reviews_test_file : str
        Output paths (written as UTF-8); existing files are overwritten.
    us_canada_business_id_list : iterable of str
        IDs of the businesses whose reviews are kept (a list, a pandas Index, ...).

    Returns
    -------
    None
    """
    # Private generator seeded with 0: yields the same draws as random.seed(0) followed by
    # random.randint, without resetting the global random state as a side effect.
    rng = random.Random(0)

    # One-off conversion so every membership test below is a hash lookup,
    # whatever kind of iterable the caller passed in.
    us_canada_business_ids = set(us_canada_business_id_list)

    total_lines_read = 0
    train_lines_written = 0
    test_lines_written = 0

    # Open all 3 files
    with open(all_reviews_file, 'r', encoding='utf-8') as f_full, \
            open(reviews_train_file, 'w', encoding='utf-8') as f_train, \
            open(reviews_test_file, 'w', encoding='utf-8') as f_test:
        for line in f_full:
            total_lines_read += 1
            business_id = json.loads(line)['business_id']
            # Skip review if business is not in US/Canada
            if business_id not in us_canada_business_ids:
                continue
            # 1 outcome out of 10 goes to test, the other 9 go to train
            if rng.randint(1, 10) == 1:
                f_test.write(line)
                test_lines_written += 1
            else:
                f_train.write(line)
                train_lines_written += 1

    us_canada_lines = test_lines_written + train_lines_written
    # Guard the percentage: no kept review must not crash the summary with a division by zero
    test_percent = 100 * test_lines_written / us_canada_lines if us_canada_lines else 0.0
    print('Number of total reviews: {}'.format(total_lines_read))
    print('Number of US/Canada reviews: {}'.format(us_canada_lines))
    print('Number of train reviews: {}'.format(train_lines_written))
    print('Number of test reviews: {}  ({:.2f}%)'.format(test_lines_written, test_percent))

In [6]:
# Load the businesses file with get_df's defaults: every row that passes the
# US/Canada state filter, all columns, indexed by business_id.
businesses_df = get_df(ALL_BUSINESSES_FILE)
# Preview the first rows as the cell output
businesses_df.head()

Unnamed: 0_level_0,address,attributes,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state,type
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0DI8Dt2PJp07XkVvIElIcQ,"227 E Baseline Rd, Ste J2","[BikeParking: True, BusinessAcceptsBitcoin: Fa...","[Tobacco Shops, Nightlife, Vape Shops, Shopping]",Tempe,"[Monday 11:0-21:0, Tuesday 11:0-21:0, Wednesda...",0,33.378214,-111.936102,Innovative Vapors,,85283,17,4.5,AZ,business
LTlCaCGZE14GuaUXUGbamg,495 S Grand Central Pkwy,"[BusinessAcceptsBitcoin: False, BusinessAccept...","[Caterers, Grocery, Food, Event Planning & Ser...",Las Vegas,"[Monday 0:0-0:0, Tuesday 0:0-0:0, Wednesday 0:...",1,36.192284,-115.159272,Cut and Taste,,89106,9,5.0,NV,business
EDqCEAGXVGCH4FJXgqtjqg,979 Bloor Street W,"[Alcohol: none, Ambience: {'romantic': False, ...","[Restaurants, Pizza, Chicken Wings, Italian]",Toronto,"[Monday 11:0-2:0, Tuesday 11:0-2:0, Wednesday ...",1,43.661054,-79.429089,Pizza Pizza,Dufferin Grove,M6H 1L5,7,2.5,ON,business
cnGIivYRLxpF7tBVR_JwWA,7014 Steubenville Pike,"[AcceptsInsurance: False, BusinessAcceptsCredi...","[Hair Removal, Beauty & Spas, Blow Dry/Out Ser...",Oakdale,"[Tuesday 10:0-21:0, Wednesday 10:0-21:0, Thurs...",1,40.444544,-80.17454,Plush Salon and Spa,,15071,4,4.0,PA,business
cdk-qqJ71q6P7TJTww_DSA,321 Jarvis Street,"[BusinessAcceptsCreditCards: True, Restaurants...","[Hotels & Travel, Event Planning & Services, H...",Toronto,,1,43.659829,-79.375401,Comfort Inn,Downtown Core,M5B 2C2,8,3.0,ON,business


In [7]:
%%time
# Stream the raw reviews and copy those whose business_id appears in businesses_df.index
# to the train or test file; the function prints the resulting counts.
split_reviews_90_10(ALL_REVIEWS_FILE, REVIEWS_TRAIN_FILE, REVIEWS_TEST_FILE, businesses_df.index)

Number of total reviews: 4153150
Number of US/Canada reviews: 4071912
Number of train reviews: 3663890
Number of test reviews: 408022  (10.02%)
CPU times: user 1min 45s, sys: 10.6 s, total: 1min 55s
Wall time: 6min 50s
