In [2]:
import json
import pandas as pd
import cufflinks as cf
from plotly.graph_objs import Table
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot

In [36]:
# Switch plotly and cufflinks to offline rendering with the white theme.
init_notebook_mode(connected=True)
cf.set_config_file(theme='white')
cf.go_offline()
# One-shot iterator over the 'accent' colour scale; plotting cells pull
# colours from it with next().
colorscale = iter(cf.colors.get_scales('accent'))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [4]:
def print_head(f, size=1):
    """Print the first ``size`` lines of a text file as a single list.

    Args:
        f: Path of the file to preview.
        size: Maximum number of lines to show (default 1).

    If the file holds fewer than ``size`` lines, only the available lines
    are printed instead of raising ``StopIteration``.
    """
    with open(f, encoding='utf-8') as read_file:
        # range() comes first in zip() so no extra line is pulled from the
        # file once ``size`` lines have been collected.
        print([line for _, line in zip(range(size), read_file)])

In [5]:
# Preview the first line of the review file to see the record layout.
print_head('review.json')

['{"review_id":"VfBHSwC5Vz_pbFluy07i9Q","user_id":"cjpdDjZyprfyDG3RlkVG3w","business_id":"uYHaNptLzDLoV_JZ_MuzUA","stars":5,"date":"2016-07-12","text":"My girlfriend and I stayed here for 3 nights and loved it. The location of this hotel and very decent price makes this an amazing deal. When you walk out the front door Scott Monument and Princes street are right in front of you, Edinburgh Castle and the Royal Mile is a 2 minute walk via a close right around the corner, and there are so many hidden gems nearby including Calton Hill and the newly opened Arches that made this location incredible.\\n\\nThe hotel itself was also very nice with a reasonably priced bar, very considerate staff, and small but comfortable rooms with excellent bathrooms and showers. Only two minor complaints are no telephones in room for room service (not a huge deal for us) and no AC in the room, but they have huge windows which can be fully opened. The staff were incredible though, letting us borrow umbrellas f

In [6]:
# Preview the first line of the business file to see the record layout.
print_head('business.json')

['{"business_id": "YDf95gJZaq05wvo7hTQbbQ", "name": "Richmond Town Square", "neighborhood": "", "address": "691 Richmond Rd", "city": "Richmond Heights", "state": "OH", "postal_code": "44143", "latitude": 41.5417162, "longitude": -81.4931165, "stars": 2.0, "review_count": 17, "is_open": 1, "attributes": {"RestaurantsPriceRange2": 2, "BusinessParking": {"garage": false, "street": false, "validated": false, "lot": true, "valet": false}, "BikeParking": true, "WheelchairAccessible": true}, "categories": ["Shopping", "Shopping Centers"], "hours": {"Monday": "10:00-21:00", "Tuesday": "10:00-21:00", "Friday": "10:00-21:00", "Wednesday": "10:00-21:00", "Thursday": "10:00-21:00", "Sunday": "11:00-18:00", "Saturday": "10:00-21:00"}}\n']


In [7]:
def get_file_size(f):
    """Return the number of lines in the text file at path ``f``.

    The file is iterated line by line rather than read into memory at once.
    """
    with open(f, 'r', encoding='utf-8') as read_file:
        return sum(1 for _ in read_file)

In [8]:
# Report the line count (one record per line) of each raw input file.
for label, path in (('Total number of businesses:', 'business.json'),
                    ('Total number of reviews', 'review.json')):
    print(label, get_file_size(path))

Total number of businesses: 156639
Total number of reviews 4736897


## Load Data 

In [9]:
def extract_data(f):
    """Lazily yield one decoded JSON value per line of a JSON-lines file.

    Args:
        f: Path of a file holding one JSON document per line.

    Yields:
        The value returned by ``json.loads`` for each non-blank line.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(f, 'r', encoding='utf-8') as read_file:
        for line in read_file:
            # Blank lines (e.g. a stray newline at end of file) carry no
            # record and would make json.loads fail.
            if not line.strip():
                continue
            yield json.loads(line)

In [10]:
# Flatten every business record into one DataFrame.
# pandas >= 1.0 exposes json_normalize at the top level (the pd.io.json
# location is deprecated there); older releases only provide pd.io.json.
_json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
business_df = _json_normalize(list(extract_data('business.json')))

In [11]:
# (rows, columns) of the flattened business table.
business_df.shape

(156639, 101)

In [12]:
#edit city
def audit_cities(row, city_map):
    """Apply the first matching correction from ``city_map`` to a city name.

    Args:
        row: A single city value (despite the name, not a DataFrame row).
        city_map: Mapping of substring -> replacement text.

    Returns:
        ``row`` with every occurrence of the first key (in ``city_map``
        iteration order) found inside it replaced by that key's value.
        ``row`` is returned unchanged when no key matches or when it is not
        a string (e.g. ``None`` / NaN for a missing city).
    """
    # Missing values are not strings; a substring test on them would raise
    # TypeError, so pass them through untouched.
    if not isinstance(row, str):
        return row
    for wrong, right in city_map.items():
        if wrong in row:
            # Only the first matching key is applied.
            return row.replace(wrong, right)
    return row

In [13]:
# Load the manual city-name corrections; the context manager closes the
# file as soon as it has been parsed.
with open('businessEdits.json', 'r') as edits_file:
    city_map = json.load(edits_file)
# Lower-case and trim city names before applying the corrections
# (presumably the keys in businessEdits.json are lower-case -- confirm).
business_df['city'] = business_df['city'].str.lower().str.strip()
business_df['city'] = business_df['city'].apply(audit_cities, city_map=city_map)
# State codes are kept upper-case and trimmed.
business_df['state'] = business_df['state'].str.upper().str.strip()
#handle pittsburgh misspelling
business_df.loc[business_df.city == 'pittsburghh', 'city'] = 'pittsburgh'

## State and City Data 

In [14]:
# Number of businesses with a non-null state value.
business_df.state.count()

156639

In [15]:
# One-row summary table: number of distinct states/provinces, number of
# distinct cities, and the mean review count per business (rounded).
stat_table = Table(
    header={
        'values': ['Total State/Province Count', 'Total City Count', 'Avg # of Reviews']
    },
    cells={
        'values': [
            [len(business_df.state.unique())],
            [len(business_df.city.unique())],
            [round(business_df.review_count.mean(), 0)]
        ],
        # Table cells accept 'left', 'center' or 'right'; 'middle' is not a
        # valid value and is rejected by plotly versions that validate it.
        'align': 'center'
    }
)
iplot([stat_table])

In [41]:
# Number of businesses per state: index = state code, values = count.
state_counts = business_df.state.value_counts()
# Kept for any later cell that still refers to val_df.
val_df = state_counts.reset_index()
data = [
    {
        'type': 'choropleth',
        'colorscale': [[0, 'rgb(222,235,247)'], [1, 'rgb(8,81,156)']],
        'autocolorscale': False,
        # Codes and counts are read straight from the Series because the
        # column names produced by reset_index() differ between pandas
        # versions ('index'/'state' before 2.0, 'state'/'count' after).
        'locations': state_counts.index.astype(str).tolist(),
        'z': state_counts.astype(float).tolist(),
        'locationmode': 'USA-states',
        'text': state_counts.tolist(),
        'marker': {
            'line': {
                'color': 'rgb(255, 255, 255)',
                'width': 2
            }
        },
        'colorbar': {
            # z is the number of businesses per state, not a review count.
            'title': 'Business Count'
        }
    }
]
layout = {
    'title': 'Location of USA Businesses in Dataset',
    'geo': {
        'scope': 'usa',
        'projection': {
            'type': 'albers usa',
        }
    }
}
fig = {'data': data, 'layout': layout}
iplot(fig)

In [17]:
#top 10 state and city counts
# value_counts() sorts by frequency, so the first ten rows are the ten most
# common states / cities.
top10_state_counts = business_df.state.value_counts().head(10)
top10_city_counts = business_df.city.value_counts().head(10)

state_fig = top10_state_counts.iplot(kind='bar',
                                     asFigure=True,
                                     xTitle='State',
                                     yTitle='Count',
                                     color='blue')
city_fig = top10_city_counts.iplot(kind='bar',
                                   asFigure=True,
                                   xTitle='City',
                                   yTitle='Business Count',
                                   color='blue')
figs = [state_fig, city_fig]

# Show both bar charts as subplots of a single figure.
combined_fig = cf.subplots(figs,
                           subplot_titles=['Top 10 States by Business Count',
                                           'Top 10 Cities by Business Count'])
cf.iplot(combined_fig, legend=False)

In [37]:
#top 5 cities by business count in the top 3 states
# Count businesses (non-null addresses) per (city, state) in the three states.
top5_statecity = business_df.loc[business_df.state.isin(('AZ', 'NV', 'ON')),
                                 ['city', 'state', 'address']]\
                            .groupby(['city', 'state'], as_index=False).address.count()
# Keep the five largest cities within each state.
top5_statecity = top5_statecity.groupby(['state'], group_keys=False)\
                               .apply(lambda x: x.nlargest(5, 'address'))
# Colours are taken by position from the palette instead of with
# next(colorscale): draining that shared iterator made this cell raise
# StopIteration once the scale was used up and changed the colours on
# every re-run.
palette = list(cf.colors.get_scales('accent'))
figs = [
    top5_statecity.loc[top5_statecity.state == s, ['city', 'address']]
                  .iplot(kind='bar', x='city', y='address', asFigure=True,
                         color=palette[i % len(palette)])
    for i, s in enumerate(top5_statecity.state.unique())
]
# One subplot per state, stacked vertically.
cf.iplot(cf.subplots(figs,
                     subplot_titles=['Arizona', 'Nevada', 'Ontario'],
                     shape=(3, 1),
                     vertical_spacing=0.25),
         legend=False)

In [None]:
# TODO: average business counts by state (not implemented yet)

### Business Categories 

In [39]:
# Expand each business's category list into one column per position.
# The lists are used directly: joining them with ',' and splitting again
# tore apart any category name that itself contains a comma
# (e.g. 'Beer, Wine & Spirits') and turned an empty list into a bogus ''
# category.
category_lists = business_df.categories.apply(
    lambda cats: cats if isinstance(cats, list) else [])
category_cols = pd.DataFrame(category_lists.tolist(), index=business_df.index)
category_df = pd.concat([business_df[['city', 'state']], category_cols], axis=1)
# Long format: one row per (business, category); the positional 'variable'
# column produced by melt is not needed.
melted_category_df = category_df.melt(id_vars=['city', 'state'],
                                      value_vars=list(category_cols.columns),
                                      value_name='category')\
                                .drop('variable', axis=1)
# Shorter lists are padded with nulls; drop those padding rows.
melted_category_df = melted_category_df[melted_category_df.category.notnull()]

In [40]:
#top 20 categories
# value_counts() is sorted by descending frequency, so the first 20 rows are
# the 20 most common categories.
top_categories = melted_category_df.category.value_counts().head(20)
f = top_categories.iplot(kind='barh',
                         color='green',
                         title='Top Categories by Business Count',
                         xTitle='Business Count',
                         asFigure=True)
# Use a smaller font for the y-axis tick labels.
f['layout']['yaxis1']['tickfont']['size'] = 9
iplot(f)

### Star Ratings

### Review Counts 

In [None]:
# Reuse the review counts already loaded into business_df instead of parsing
# business.json a second time.
review_counts = business_df[['review_count']].rename(columns={'review_count': 'Review Count'})

In [None]:
# Summary statistics of the per-business review counts.
review_counts.describe()

In [None]:
# Keep only businesses whose review count lies below the 95th percentile,
# then plot the distribution of what remains.
counts = review_counts['Review Count']
cut_review_outliers = review_counts[counts < counts.quantile(.95)]
histogram_fig = cut_review_outliers.iplot(kind='histogram',
                                          asFigure=True,
                                          color='red',
                                          title='Review Count Frequency',
                                          xTitle='Review Count',
                                          yTitle='Frequency')
iplot(histogram_fig)

In [None]:
# Number of businesses per state.
business_df.state.value_counts()
# NOTE(review): unfinished draft kept for reference. It would not run as
# written: business_df.state.count() > 1 is a single boolean rather than a
# row mask, and selecting several columns needs a list
# ([['city', 'state', 'review_count']]).
# avg_reviewcounts_state = business_df[business_df.state.count() > 1]['city', 'state', 'review_count'].groupby('state').review_count.mean()
# iplot(avg_reviewcounts_state.iplot(
#     kind='bar', 
#     title='Average Review Counts by State', 
#     color='red', 
#     xTitle='State', 
#     yTitle='Avg. Review Count', 
#     asFigure=True))

In [None]:
# State codes were upper-cased during cleaning, so the lookup must use 'FL';
# comparing against 'fl' can never match and always gave an empty frame.
business_df.loc[business_df.state == 'FL']