In [1]:
import json
import os

import gmaps
import gmaps.datasets
import pandas as pd
# The Google Maps API key is read from the GOOGLE_MAPS_API_KEY environment variable so
# that the credential is never stored in the notebook. A missing variable raises KeyError.
gmaps.configure(api_key=os.environ["GOOGLE_MAPS_API_KEY"])

# TL;DR

The main finding of this notebook is that, because the dataset only includes businesses clustered around a handful of distinct city hubs, we can assign a region to every US/Canada business based solely on the state it is in. Here is the dictionary mapping each US/Canada state in the dataset to its region:

In [2]:
# Maps each US state / Canadian province abbreviation to the city hub ("region")
# that its businesses are assigned to.
US_CANADA_REGION_BY_STATE_DICT = dict(
    AZ='Phoenix',
    NV='Las Vegas',
    ON='Toronto',
    # Charlotte spans two states
    NC='Charlotte',
    SC='Charlotte',
    OH='Cleveland',
    PA='Pittsburgh',
    # Montreal spans one province and two states
    QC='Montreal',
    NY='Montreal',
    VT='Montreal',
    WI='Madison',
    IL='Champaign',
)

In [3]:
# Raw review and business data files as received from Yelp; both sit in the same directory.
RAW_DATA_DIR = '../raw-data'
all_reviews_file = RAW_DATA_DIR + '/yelp_academic_dataset_review.json'
all_businesses_file = RAW_DATA_DIR + '/yelp_academic_dataset_business.json'

In [4]:
def get_df(json_file_name, max_rows=None, city=None, select_keys=None, us_canada_only=True):
    """Load a file holding one JSON object per line into a DataFrame.

    Args:
        json_file_name: Path to the file to read.
        max_rows: Maximum number of rows to return. None means no limit;
            0 or a negative number yields an empty DataFrame.
        city: If set, keep only records whose 'city' value equals it exactly.
            A record without a 'city' key is treated as having city ''.
        select_keys: If set, keep only these keys of each record.
        us_canada_only: If true, keep only records whose 'state' value is a
            key of US_CANADA_REGION_BY_STATE_DICT.

    Returns:
        A pandas DataFrame with one row per record that passed the filters.

    Raises:
        KeyError: If us_canada_only is true and a record has no 'state' key,
            or if a kept record lacks one of select_keys.
    """
    records = []
    with open(json_file_name, 'r') as f:
        for line in f:
            # Check the limit before taking another row so that max_rows=0
            # really returns nothing instead of the first matching row.
            if max_rows is not None and len(records) >= max_rows:
                break
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            row_dict = json.loads(line)
            if us_canada_only and row_dict['state'] not in US_CANADA_REGION_BY_STATE_DICT:
                continue
            # The city is read from the full record, so filtering by city works
            # even when 'city' is not one of select_keys.
            if city is not None and row_dict.get('city', '') != city:
                continue
            if select_keys is not None:
                row_dict = {k: row_dict[k] for k in select_keys}
            records.append(row_dict)
    return pd.DataFrame(records)

In [5]:
# Load only the location-related columns for every business, with no US/Canada filter.
df = get_df(
    all_businesses_file,
    select_keys=['city', 'state', 'latitude', 'longitude'],
    us_canada_only=False,
)
df.head()

Unnamed: 0,city,latitude,longitude,state
0,Tempe,33.378214,-111.936102,AZ
1,Las Vegas,36.192284,-115.159272,NV
2,Toronto,43.661054,-79.429089,ON
3,Oakdale,40.444544,-80.17454,PA
4,Toronto,43.659829,-79.375401,ON


In [6]:
# List every state abbreviation with its number of business listings.
# value_counts() already orders the result from most listings to fewest.
states_series = df['state'].value_counts()

print('{} unique state abbrevs:'.format(states_series.size))

for state, n_businesses in states_series.items():
    print('{:50} {:6}'.format(state, n_businesses))

29 unique state abbrevs:
AZ                                                  43492
NV                                                  28214
ON                                                  24507
NC                                                  10177
OH                                                   9966
PA                                                   8091
QC                                                   6668
WI                                                   3899
EDH                                                  3539
BW                                                   2905
IL                                                   1556
SC                                                    498
MLN                                                   191
HLD                                                   172
FIF                                                    72
ELN                                                    36
WLN                                            

In [7]:
# Reload the same location columns, this time restricted to US/Canada businesses.
df = get_df(
    all_businesses_file,
    select_keys=['city', 'state', 'latitude', 'longitude'],
    us_canada_only=True,
)
df.head()

Unnamed: 0,city,latitude,longitude,state
0,Tempe,33.378214,-111.936102,AZ
1,Las Vegas,36.192284,-115.159272,NV
2,Toronto,43.661054,-79.429089,ON
3,Oakdale,40.444544,-80.17454,PA
4,Toronto,43.659829,-79.375401,ON


In [8]:
# List every state abbreviation with its number of business listings.
# value_counts() already orders the result from most listings to fewest.
states_series = df['state'].value_counts()

print('{} unique state abbrevs:'.format(states_series.size))

for state, n_businesses in states_series.items():
    print('{:50} {:6}'.format(state, n_businesses))

12 unique state abbrevs:
AZ                                                  43492
NV                                                  28214
ON                                                  24507
NC                                                  10177
OH                                                   9966
PA                                                   8091
QC                                                   6668
WI                                                   3899
IL                                                   1556
SC                                                    498
NY                                                     13
VT                                                      1


In [9]:
# (latitude, longitude) pairs for every business currently in df.
locations = [(lat, lng) for lat, lng in zip(df['latitude'], df['longitude'])]

# Heat Map of All US/Canada Businesses

In [10]:
# Draw one heatmap layer containing every location; the bare `m` renders the map.
m = gmaps.Map()
all_businesses_layer = gmaps.heatmap_layer(locations)
m.add_layer(all_businesses_layer)
m

Despite the variety of cities and states, it does look like there are only a few distinct city hubs included in the dataset as promised. Here are the hubs in US/Canada:
- Las Vegas
- Phoenix
- Madison
- Champaign
- Cleveland
- Toronto
- Montreal
- Pittsburgh
- Charlotte

It will be useful to assign a region to each business to clean things up. Let's see if we can do that just with state and city data.

In [11]:
# Keep only the businesses listed under NV and report how many there are.
df_sub = df.loc[df['state'] == 'NV']
print('{} Businesses'.format(df_sub.shape[0]))

# Pair the coordinate columns into (latitude, longitude) tuples for the heatmap layer.
locations_sub = [(lat, lng) for lat, lng in zip(df_sub['latitude'], df_sub['longitude'])]

# Put the heatmap on a fresh map; the bare `m_sub` renders it as the cell output.
m_sub = gmaps.Map()
m_sub.add_layer(gmaps.heatmap_layer(locations_sub))
m_sub

28214 Businesses


Looks like anything in NV belongs to the Las Vegas region

In [12]:
# Keep only the businesses listed under AZ and report how many there are.
df_sub = df.loc[df['state'] == 'AZ']
print('{} Businesses'.format(df_sub.shape[0]))

# Pair the coordinate columns into (latitude, longitude) tuples for the heatmap layer.
locations_sub = [(lat, lng) for lat, lng in zip(df_sub['latitude'], df_sub['longitude'])]

# Put the heatmap on a fresh map; the bare `m_sub` renders it as the cell output.
m_sub = gmaps.Map()
m_sub.add_layer(gmaps.heatmap_layer(locations_sub))
m_sub

43492 Businesses


Anything in AZ belongs to the Phoenix region

In [13]:
# Keep only the businesses listed under WI and report how many there are.
df_sub = df.loc[df['state'] == 'WI']
print('{} Businesses'.format(df_sub.shape[0]))

# Pair the coordinate columns into (latitude, longitude) tuples for the heatmap layer.
locations_sub = [(lat, lng) for lat, lng in zip(df_sub['latitude'], df_sub['longitude'])]

# Put the heatmap on a fresh map; the bare `m_sub` renders it as the cell output.
m_sub = gmaps.Map()
m_sub.add_layer(gmaps.heatmap_layer(locations_sub))
m_sub

3899 Businesses


Anything in WI belongs to the Madison region

In [14]:
# Keep only the businesses listed under IL and report how many there are.
df_sub = df.loc[df['state'] == 'IL']
print('{} Businesses'.format(df_sub.shape[0]))

# Pair the coordinate columns into (latitude, longitude) tuples for the heatmap layer.
locations_sub = [(lat, lng) for lat, lng in zip(df_sub['latitude'], df_sub['longitude'])]

# Put the heatmap on a fresh map; the bare `m_sub` renders it as the cell output.
m_sub = gmaps.Map()
m_sub.add_layer(gmaps.heatmap_layer(locations_sub))
m_sub

1556 Businesses


Anything in IL belongs to the Champaign region

In [15]:
# Keep only the businesses listed under OH and report how many there are.
df_sub = df.loc[df['state'] == 'OH']
print('{} Businesses'.format(df_sub.shape[0]))

# Pair the coordinate columns into (latitude, longitude) tuples for the heatmap layer.
locations_sub = [(lat, lng) for lat, lng in zip(df_sub['latitude'], df_sub['longitude'])]

# Put the heatmap on a fresh map; the bare `m_sub` renders it as the cell output.
m_sub = gmaps.Map()
m_sub.add_layer(gmaps.heatmap_layer(locations_sub))
m_sub

9966 Businesses


OH = Cleveland

In [16]:
# Keep only the businesses listed under ON and report how many there are.
df_sub = df.loc[df['state'] == 'ON']
print('{} Businesses'.format(df_sub.shape[0]))

# Pair the coordinate columns into (latitude, longitude) tuples for the heatmap layer.
locations_sub = [(lat, lng) for lat, lng in zip(df_sub['latitude'], df_sub['longitude'])]

# Put the heatmap on a fresh map; the bare `m_sub` renders it as the cell output.
m_sub = gmaps.Map()
m_sub.add_layer(gmaps.heatmap_layer(locations_sub))
m_sub

24507 Businesses


ON = Toronto

In [17]:
# Keep only the businesses listed under QC and report how many there are.
df_sub = df.loc[df['state'] == 'QC']
print('{} Businesses'.format(df_sub.shape[0]))

# Pair the coordinate columns into (latitude, longitude) tuples for the heatmap layer.
locations_sub = [(lat, lng) for lat, lng in zip(df_sub['latitude'], df_sub['longitude'])]

# Put the heatmap on a fresh map; the bare `m_sub` renders it as the cell output.
m_sub = gmaps.Map()
m_sub.add_layer(gmaps.heatmap_layer(locations_sub))
m_sub

6668 Businesses


QC = Montreal

In [18]:
# Keep only the businesses listed under PA and report how many there are.
df_sub = df.loc[df['state'] == 'PA']
print('{} Businesses'.format(df_sub.shape[0]))

# Pair the coordinate columns into (latitude, longitude) tuples for the heatmap layer.
locations_sub = [(lat, lng) for lat, lng in zip(df_sub['latitude'], df_sub['longitude'])]

# Put the heatmap on a fresh map; the bare `m_sub` renders it as the cell output.
m_sub = gmaps.Map()
m_sub.add_layer(gmaps.heatmap_layer(locations_sub))
m_sub

8091 Businesses


PA = Pittsburgh

In [19]:
# Keep only the businesses listed under NC and report how many there are.
df_sub = df.loc[df['state'] == 'NC']
print('{} Businesses'.format(df_sub.shape[0]))

# Pair the coordinate columns into (latitude, longitude) tuples for the heatmap layer.
locations_sub = [(lat, lng) for lat, lng in zip(df_sub['latitude'], df_sub['longitude'])]

# Put the heatmap on a fresh map; the bare `m_sub` renders it as the cell output.
m_sub = gmaps.Map()
m_sub.add_layer(gmaps.heatmap_layer(locations_sub))
m_sub

10177 Businesses


NC = Charlotte

In [20]:
# Keep only the businesses listed under SC and report how many there are.
df_sub = df.loc[df['state'] == 'SC']
print('{} Businesses'.format(df_sub.shape[0]))

# Pair the coordinate columns into (latitude, longitude) tuples for the heatmap layer.
locations_sub = [(lat, lng) for lat, lng in zip(df_sub['latitude'], df_sub['longitude'])]

# Put the heatmap on a fresh map; the bare `m_sub` renders it as the cell output.
m_sub = gmaps.Map()
m_sub.add_layer(gmaps.heatmap_layer(locations_sub))
m_sub

498 Businesses


SC also = Charlotte

In [21]:
# Keep only the businesses listed under NY and report how many there are.
df_sub = df.loc[df['state'] == 'NY']
print('{} Businesses'.format(df_sub.shape[0]))

# Pair the coordinate columns into (latitude, longitude) tuples for the heatmap layer.
locations_sub = [(lat, lng) for lat, lng in zip(df_sub['latitude'], df_sub['longitude'])]

# Put the heatmap on a fresh map; the bare `m_sub` renders it as the cell output.
m_sub = gmaps.Map()
m_sub.add_layer(gmaps.heatmap_layer(locations_sub))
m_sub

13 Businesses


NY = Montreal

In [22]:
# Keep only the businesses listed under VT and report how many there are.
df_sub = df.loc[df['state'] == 'VT']
print('{} Businesses'.format(df_sub.shape[0]))

# Pair the coordinate columns into (latitude, longitude) tuples for the heatmap layer.
locations_sub = [(lat, lng) for lat, lng in zip(df_sub['latitude'], df_sub['longitude'])]

# Put the heatmap on a fresh map; the bare `m_sub` renders it as the cell output.
m_sub = gmaps.Map()
m_sub.add_layer(gmaps.heatmap_layer(locations_sub))
m_sub

1 Businesses


VT also = Montreal

# Conclusion

Fortunately, each state only has businesses associated with one "region". All regions except Montreal and Charlotte have businesses in only one state. The Montreal region also includes 13 businesses in NY and 1 in VT, and the Charlotte region also includes 498 businesses in SC. Thus, based on state data alone, we can assign a region to each business.