https://docs.google.com/document/d/15GMxvJYAUO-b96c18QmfcF278IVVBBtCFk8nayDp9oY/edit

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import os
import scipy as sp
from scipy import stats

from tools.plt import color2d #from the 'srcole/tools' repo
from matplotlib import cm
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100

### Load dataframes

In [2]:
# Load cities info
df_cities = pd.read_csv('/gh/data2/yelp/city_pop.csv', index_col=0)
df_cities.head()

Unnamed: 0,city,state,population,total_food,latitude,longitude,total_scraped
0,New York,New York,8537673,54191,40.705445,-73.994293,1000
1,Los Angeles,California,3976322,41685,34.06159,-118.321381,1000
2,Chicago,Illinois,2704958,19315,41.905159,-87.677765,1000
3,Houston,Texas,2303482,15197,29.784854,-95.359955,1000
4,Phoenix,Arizona,1615017,11034,33.465086,-112.07016,1000


In [3]:
# Load restaurants
df_restaurants = pd.read_csv('/gh/data2/yelp/food_by_city/df_restaurants.csv', index_col=0)
df_restaurants.head()

Unnamed: 0,id,name,city,state,rating,review_count,cost,latitude,longitude,has_delivery,has_pickup,url
0,poquito-picante-brooklyn-2,Poquito Picante,New York,New York,4.5,40,2,40.685742,-73.981262,True,True,https://www.yelp.com/biz/poquito-picante-brook...
1,nourish-brooklyn-4,Nourish,New York,New York,4.0,65,2,40.67796,-73.96855,True,True,https://www.yelp.com/biz/nourish-brooklyn-4?ad...
2,taste-of-heaven-brooklyn,Taste of Heaven,New York,New York,5.0,19,2,40.71715,-73.94054,False,True,https://www.yelp.com/biz/taste-of-heaven-brook...
3,milk-and-cream-cereal-bar-new-york,Milk & Cream Cereal Bar,New York,New York,4.5,307,2,40.71958,-73.99654,False,False,https://www.yelp.com/biz/milk-and-cream-cereal...
4,the-bao-shoppe-new-york-2,The Bao Shoppe,New York,New York,4.0,99,1,40.714345,-73.990518,False,False,https://www.yelp.com/biz/the-bao-shoppe-new-yo...


In [4]:
# Load categories by restaurant
df_categories = pd.read_pickle('/gh/data2/yelp/food_by_city/df_categories_sparse.pkl')
df_categories.head()

Unnamed: 0,acaibowls,accessories,active,acupuncture,adultedu,advertising,aerialfitness,afghani,african,airport_shuttles,airportlounges,amateursportsteams,amusementparks,animalshelters,antiques,apartments,appliances,aquariums,arabian,arcades,archery,argentine,armenian,artclasses,artmuseums,arts,artsandcrafts,artschools,artsupplies,arttours,asianfusion,attractionfarms,auctionhouses,australian,austrian,auto,auto_detailing,autocustomization,autopartssupplies,autorepair,ayurveda,baby_gear,bagels,bakeries,bangladeshi,banks,barbers,barcrawl,bars,bartenders,...,truckrepair,turkish,tuscan,ukrainian,university_housing,unofficialyelpevents,usedbooks,uzbek,vacation_rentals,vapeshops,vegan,vegetarian,venezuelan,venues,vermouthbars,vet,videoandgames,videofilmproductions,videogamestores,vietnamese,vintage,vinyl_records,virtualrealitycenters,visitorcenters,vitaminssupplements,waffles,walkingtours,watches,waterdelivery,waterparks,waterpurification,waterstores,waxing,web_design,wedding_planning,weddingchappels,weightlosscenters,whiskeybars,wholesale_stores,wholesalers,wine_bars,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# These are used for the 'category' input to the search function
df_categories_info = pd.read_json('/gh/data2/yelp/categories.json')
df_categories_info.head()

Unnamed: 0,alias,country_blacklist,country_whitelist,parents,title
0,3dprinting,,,[localservices],3D Printing
1,abruzzese,,[IT],[italian],Abruzzese
2,absinthebars,,[CZ],[bars],Absinthe Bars
3,acaibowls,"[AR, PL, TR, MX, CL, IT]",,[food],Acai Bowls
4,accessories,,,[fashion],Accessories


# Cuisines by city

In [6]:
# New dataframe: For each cuisine, compute the average rating, average price, and # restaurants
all_cuisines = df_categories.keys()
cuisine_dict = {'cuisine': [],
                'avg_rating': [],
                'avg_cost': [],
                'N': []}
for k in all_cuisines:
    df_temp = df_restaurants[df_categories[k]==1]
    cuisine_dict['cuisine'].append(k)
    cuisine_dict['avg_rating'].append(df_temp['rating'].mean())
    cuisine_dict['avg_cost'].append(df_temp['cost'].mean())
    cuisine_dict['N'].append(len(df_temp))
df_cuisine = pd.DataFrame.from_dict(cuisine_dict)

# Determine cuisines of interest
# Only look at cuisines with at least 2000 restaurants
min_N = 1000
category_counts = df_categories.sum()
categories_keep = category_counts[category_counts > min_N]
cuisines_rmv = ['bars', 'beer_and_wine', 'beerbar', 'breweries', 'butcher', 'cafes', 'catering',
                'chickenshop', 'cocktailbars', 'convenience', 'cosmetics', 'customcakes',
                'deptstores', 'divebars', 'drugstores', 'eventplanning', 'farmersmarket', 'fooddeliveryservices',
                'foodstands', 'gastropubs', 'gourmet', 'grocery', 'healthmarkets', 'importedfood', 'intlgrocery',
                'karaoke', 'lounges', 'markets', 'meats', 'musicvenues', 'personalchefs', 'pubs',
                'restaurants', 'salvadoran', 'seafoodmarkets', 'servicestations', 'sportsbars', 'streetvendors',
                'tapasmallplates', 'venues', 'wine_bars', 'wineries']
categories_keep.drop(cuisines_rmv, inplace=True)
categories_keep = categories_keep.keys()

In [9]:
df_categories.loc[:,categories_keep]

KeyboardInterrupt: 

# IN THIS NOTEBOOK AND OTHERS, NEED TO FIX HOW USE DF_CAT IN BELOW DF NOW THAT ITS SPARSE

In [None]:
# Set up dataframe for restaurants with categories of interest
restaurant_have_category = df_categories.loc[:,categories_keep].sum(axis=1).to_dict()
df_restaurants_keep_idx = [k for k in restaurant_have_category.keys() if restaurant_have_category[k]]
df_restaurants_temp = df_restaurants.loc[df_restaurants_keep_idx].reset_index(drop=True)
df_categories_temp = df_categories.loc[df_restaurants_keep_idx,categories_keep].reset_index(drop=True)
df_restaurants_temp = df_restaurants_temp.merge(df_categories_temp, left_index=True, right_index=True)

In [None]:
# Compute fraction of each cuisine by city
df_city_cuisines = df_restaurants_temp.groupby('city').mean()
df_state_cuisines = df_restaurants_temp.groupby('state').mean()

# Explore features by city
* rating, review_count, cost, has_delivery, has_pickup
* each cuisine

In [None]:
df_city_cuisines.head(5)

# Highest average rating
* Highest average rating are the most popular cities because yelp will return the top ones in each city

In [None]:
df_city_cuisines.sort_values('rating', ascending=False, inplace=True)

N=60
plt.figure(figsize=(30,5))
plt.bar(np.arange(N), df_city_cuisines['rating'].values[:N], color='k', ecolor='.5')
plt.xticks(np.arange(N), df_city_cuisines.index[:N])
plt.ylabel('Average rating', size=20)
plt.xlabel('City', size=20)
plt.xticks(size=15, rotation='vertical')
# plt.yticks([10**3, 10**4, 10**5], size=15)
plt.ylim((3.5, 4.5))
plt.xlim((-1, N))

# boba

In [None]:
c = 'bubbletea'
df_city_cuisines.sort_values(c, ascending=False, inplace=True)

N=60
plt.figure(figsize=(30,5))
plt.bar(np.arange(N), df_city_cuisines[c].values[:N], color='k', ecolor='.5')
plt.xticks(np.arange(N), df_city_cuisines.index[:N])
plt.ylabel('Fraction of restaurants are\n'+c, size=20)
plt.xlabel('City', size=20)
plt.xticks(size=15, rotation='vertical')
plt.xlim((-1, N))

In [None]:
c = 'mexican'
df_state_cuisines.sort_values(c, ascending=False, inplace=True)

plt.figure(figsize=(30,5))
plt.bar(np.arange(len(df_state_cuisines)), df_state_cuisines[c].values, color='k', ecolor='.5')
plt.xticks(np.arange(len(df_state_cuisines)), df_state_cuisines.index)
plt.ylabel('Fraction of restaurants are\n'+c, size=20)
plt.xlabel('State', size=20)
plt.xticks(size=15, rotation='vertical')
plt.xlim((-1, len(df_state_cuisines)))

In [None]:
c = 'italian'
df_state_cuisines.sort_values(c, ascending=False, inplace=True)

plt.figure(figsize=(30,5))
plt.bar(np.arange(len(df_state_cuisines)), df_state_cuisines[c].values, color='k', ecolor='.5')
plt.xticks(np.arange(len(df_state_cuisines)), df_state_cuisines.index)
plt.ylabel('Fraction of restaurants are\n'+c, size=20)
plt.xlabel('State', size=20)
plt.xticks(size=15, rotation='vertical')
plt.xlim((-1, len(df_state_cuisines)))