# Get Data by Metro Area

## Load Packages

In [1]:
# Import packages to use later
import json
import os
import time

import nltk
import pandas as pd

## Investigate Cities

In [3]:
# Load the business table; the ID column is renamed to 'business_id', the
# name used when matching businesses against reviews later in the notebook.
businesses = pd.read_csv("YelpChallengeWMetros_Clean.csv", encoding='latin-1').rename(
    columns={'business': 'business_id'}
)

# Number of businesses per metro area, keyed by metro name.
d = dict(businesses['metroarea'].value_counts())

# Print the metros that have more than 1000 businesses, largest first.
ranked_metros = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
for city, count in ranked_metros:
    if count <= 1000:
        continue
    print(city, count)

Boston 36019
Portland 28301
Austin 24487
Orlando 21912
Atlanta 18092
Vancouver 17305
Columbus 11260
Boulder 3199


In [4]:
# Preview the first three rows of the business table.
businesses.head(3)

Unnamed: 0,business_id,name,categories,stars,review_count,address,city,state,longitude,latitude,metroarea
0,N3_Gs3DnX4k9SgpwJxdEfw,Lane Wells Jewelry Repair,"Shopping, Jewelry Repair, Appraisal Services, ...",5,30,"7801 N Lamar Blvd, Ste A140",Austin,TX,-97.711458,30.346169,Austin
1,NVfOn7TdnHbaGH97CVB_Qg,McKinley Chiropractic,"Chiropractors, Health & Medical",5,5,"5625 Eiger Rd, Ste 160",Austin,TX,-97.857409,30.244902,Austin
2,Xw8tuI30T-xihpzwBV-zJg,El Pollo Rey,"Food Trucks, Restaurants, Specialty Food, Food...",5,11,1725 E Riverside Dr,Austin,TX,-97.730141,30.243493,Austin


## Dataset for Each Metro

In [5]:
# Split the review file into one dataframe (and one CSV) per metro area.
#
# Metro names ordered by business count, largest first.
# NOTE(review): no count threshold is applied here, so small "outlier" metros
# are only excluded if the cleaned business file already dropped them - confirm.
metros = sorted(d, key=d.get, reverse=True)

# Metro name -> set of business IDs in that metro (sets give fast membership tests).
metro_businesses = {
    metro: set(businesses.loc[businesses['metroarea'] == metro, 'business_id'])
    for metro in metros
}

# Only reviews dated within [window_start, window_end) are kept.
window_start = pd.Timestamp('2010-01-01')
window_end = pd.Timestamp('2020-01-01')

# Collect the per-chunk slices in lists and concatenate once per metro at the
# end; calling pd.concat on the growing frame for every chunk re-copies all
# previously collected rows each time.
metro_chunks = {metro: [] for metro in metros}

num_total_reviews = 0
count = -1  # stays -1 if the reader yields no chunks, so the final print cannot fail

with pd.read_csv("yelp_academic_dataset_review.csv", chunksize=100000) as reader:
    for count, chunk in enumerate(reader):
        # The parsed 'datetime' column is kept, so it also ends up in the output CSVs.
        chunk['datetime'] = pd.to_datetime(chunk['date'])
        in_window = (chunk['datetime'] >= window_start) & (chunk['datetime'] < window_end)
        chunk = chunk[in_window]

        # Counts every date-filtered review, including those whose business
        # is not in any metro's ID set.
        num_total_reviews += len(chunk)

        for metro in metros:
            vals = chunk[chunk.business_id.isin(metro_businesses[metro])]
            metro_chunks[metro].append(vals)

        if count % 10 == 0:
            print("Chunk #", count)
    print("End: Chunk #", count)

# Build the final per-metro frames, releasing each list of slices as soon as it
# has been concatenated to limit peak memory.
metro_reviews = {}
for metro in metros:
    parts = metro_chunks.pop(metro)
    # pd.concat raises on an empty list, so fall back to an empty frame.
    metro_reviews[metro] = pd.concat(parts, axis=0) if parts else pd.DataFrame()
    del parts

print(num_total_reviews)

# Create the output directory up front so that the long extraction above is not
# lost to a missing-directory error when writing.
os.makedirs("small_reviews_urbcomp", exist_ok=True)
for metro in metros:
    metro_reviews[metro].to_csv("small_reviews_urbcomp/yelp_academic_dataset_reviews_" + metro + ".csv")

FileNotFoundError: [Errno 2] File b'yelp_academic_dataset_review.csv' does not exist: b'yelp_academic_dataset_review.csv'

## Checking Results

In [12]:
# Show the metro names in processing order (sorted by business count, descending).
print(metros)

['Boston', 'Portland', 'Austin', 'Orlando', 'Atlanta', 'Vancouver', 'Columbus', 'Boulder']


In [13]:
# Sanity check: number of reviews collected for each metro, in `metros` order.
lengths = [metro_reviews[name].shape[0] for name in metros]
print(lengths)
# Reviews assigned to some metro vs. all date-filtered reviews seen while reading.
print(sum(lengths))
print(num_total_reviews)

[1731248, 1367612, 1289078, 990606, 987780, 564826, 375874, 123229]
7430253
7522592
