In [1]:
# import libraries

from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

In [2]:
def read_chunks(file, cols=None, city=None, chunk_size=500):
    '''
    Read a JSON Lines dataset in chunks and return it as a single frame.

    Parameters
    ----------
    file : path or buffer
        Passed to ``pd.read_json`` with ``lines=True``.
    cols : label or list of labels, optional
        Column(s) to keep from every chunk; ``None`` keeps all columns.
    city : optional
        When truthy, only rows whose ``'city'`` column equals this value are
        kept. The filter runs before the column selection, so ``cols`` does
        not have to include ``'city'``.
    chunk_size : int
        Number of lines parsed per chunk.

    Returns
    -------
    The (filtered) chunks concatenated row-wise with a fresh 0..n-1 index.
    An input that yields no chunks at all gives an empty result instead of
    the ``ValueError`` that ``pd.concat([])`` would raise.

    Raises
    ------
    KeyError
        If ``city`` is given but a chunk has no ``'city'`` column, or if a
        label in ``cols`` is missing.
    '''
    chunk_list = []
    # The reader is used as a context manager so the underlying file handle is
    # released even when a chunk raises part-way through the file.
    with pd.read_json(path_or_buf=file, chunksize=chunk_size, lines=True) as reader:
        # str() keeps the progress-bar label valid for pathlib paths and buffers.
        for chunk in tqdm(reader, desc=str(file)):
            if city:
                chunk = chunk[chunk['city'] == city]
            chunk_list.append(chunk if cols is None else chunk[cols])

    if not chunk_list:
        # Nothing was read (e.g. an empty file): return an empty object of the
        # same kind a non-empty read would produce for this `cols` value.
        if cols is None:
            return pd.DataFrame()
        labels = [cols] if isinstance(cols, str) else list(cols)
        return pd.DataFrame(columns=labels)[cols]

    return pd.concat(chunk_list, ignore_index=True)

In [3]:
# choose city of interest
# The value must match the dataset's `city` field exactly (the filter is an
# equality test); it is also used as the prefix of every output CSV file name.

city_of_interest = 'St. Louis'

In [None]:
# read data

def _read_matching(file, column, allowed, chunk_size=50_000):
    '''
    Read a JSON Lines file in chunks, keeping only rows whose `column` value
    is contained in `allowed`.

    Each chunk is filtered as soon as it is parsed, so the whole file is never
    held in memory at once. The result has a fresh 0..n-1 index. If no row
    matches, an empty frame with the file's columns is returned.
    '''
    kept = []
    header = None
    with pd.read_json(path_or_buf=file, chunksize=chunk_size, lines=True) as reader:
        for chunk in tqdm(reader, desc=file):
            if header is None:
                # zero-row slice: remembers the columns for the no-match case
                header = chunk.iloc[:0]
            matches = chunk[chunk[column].isin(allowed)]
            if not matches.empty:
                kept.append(matches)
    if kept:
        return pd.concat(kept, ignore_index=True)
    return header if header is not None else pd.DataFrame(columns=[column])


# Businesses located in the chosen city define which rows are wanted everywhere else.
business_data = read_chunks('yelp_dataset/yelp_academic_dataset_business.json', city=city_of_interest)
business_ids = business_data['business_id'].unique()

# Check-ins, reviews and tips are kept only when they belong to one of those businesses.
checkin_data = _read_matching('yelp_dataset/yelp_academic_dataset_checkin.json', 'business_id', business_ids)

review_data = _read_matching('yelp_dataset/yelp_academic_dataset_review.json', 'business_id', business_ids)

tip_data = _read_matching('yelp_dataset/yelp_academic_dataset_tip.json', 'business_id', business_ids)

# Users are kept when they wrote at least one of the retained reviews or tips.
users_of_interest = set(review_data['user_id']).union(set(tip_data['user_id']))
user_data = _read_matching('yelp_dataset/yelp_academic_dataset_user.json', 'user_id', users_of_interest)

In [7]:
# save data in csv format in filtered_cities folder

output_dir = Path('filtered_cities')
# to_csv does not create missing directories, so make sure the folder exists first
output_dir.mkdir(parents=True, exist_ok=True)

frames_to_save = {
    'business': business_data,
    'checkin': checkin_data,
    'review': review_data,
    'tip': tip_data,
    'user': user_data,
}

# One file per table: filtered_cities/<city>_<table>.csv, written without the index column.
for table_name, frame in frames_to_save.items():
    frame.to_csv(output_dir / f'{city_of_interest}_{table_name}.csv', index=False)