In [1]:
%matplotlib inline
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import gmaps
import os

# The Google Maps API key is a credential and must not be committed with the
# notebook: read it from the environment instead. Any key that was previously
# hard-coded here should be treated as leaked and revoked.
_gmaps_api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not _gmaps_api_key:
    raise RuntimeError('Set the GOOGLE_MAPS_API_KEY environment variable before running this notebook.')
gmaps.configure(api_key=_gmaps_api_key)

In [2]:
def get_grid(df, bin_size=0.5, grid_size=5):
    """
    Sum predicted star ratings on a longitude/latitude grid and return the
    window of that grid around the highest-scoring bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'latitude', 'longitude' and 'pred_stars' columns. Rows
        with a missing value in any of those three columns are ignored.
    bin_size : float
        Width and height of each bin, in miles.
    grid_size : float
        Width and height of the returned window, in miles.

    Returns
    -------
    grid : numpy.ndarray
        Summed 'pred_stars' per bin, indexed [longitude_bin, latitude_bin],
        restricted to the window around the best bin. The window is clipped
        at the edges of the data, so it can be smaller than grid_size.
    bounds : tuple
        ((long_lo, long_hi), (lat_lo, lat_hi)): the outer edges, in degrees,
        of exactly the bins contained in `grid`.
    opt_coords : tuple
        (longitude, latitude) of the centre of the best bin.

    Raises
    ------
    ValueError
        If no row of df has all three required values present.
    """
    # Fixed degrees-per-mile conversion factors.
    # NOTE(review): the longitude factor is a single constant for every city;
    # confirm it is accurate enough for the latitudes being used.
    long_deg_per_mile = 1.0 / 54.4
    lat_deg_per_mile = 1.0 / 68.941
    long_bin_size = bin_size * long_deg_per_mile
    lat_bin_size = bin_size * lat_deg_per_mile

    # Missing coordinates cannot be binned and a missing score would turn the
    # whole bin into NaN, so such rows are dropped up front.
    points = df[['longitude', 'latitude', 'pred_stars']].dropna()
    if len(points) == 0:
        raise ValueError("get_grid needs at least one row with 'longitude', "
                         "'latitude' and 'pred_stars' all present")
    longs = np.asarray(points['longitude'], dtype=float)
    lats = np.asarray(points['latitude'], dtype=float)
    stars = np.asarray(points['pred_stars'], dtype=float)

    long_min, long_max = longs.min(), longs.max()
    lat_min, lat_max = lats.min(), lats.max()

    # Edges start half a bin below the minimum coordinate, so bin i is centred
    # on (min + i * bin_size) and spans edges[i] .. edges[i + 1].
    n_long_bins = int(np.ceil((long_max - long_min) / long_bin_size)) + 1
    long_0 = long_min - 0.5 * long_bin_size
    long_edges = np.arange(n_long_bins + 1) * long_bin_size + long_0
    n_lat_bins = int(np.ceil((lat_max - lat_min) / lat_bin_size)) + 1
    lat_0 = lat_min - 0.5 * lat_bin_size
    lat_edges = np.arange(n_lat_bins + 1) * lat_bin_size + lat_0

    # Bin index of every point. By construction these fall inside the grid;
    # the clip only guards against floating-point rounding at the extremes.
    i_x = np.clip(((longs - long_0) / long_bin_size).astype(int), 0, n_long_bins - 1)
    i_y = np.clip(((lats - lat_0) / lat_bin_size).astype(int), 0, n_lat_bins - 1)

    # np.add.at accumulates correctly when several points share a bin
    # (plain fancy-index `+=` would count each bin only once).
    grid = np.zeros((n_long_bins, n_lat_bins))
    np.add.at(grid, (i_x, i_y), stars)

    # Window of `bins_wide` bins per axis that contains the best bin.
    # The *_hi indices are exclusive slice ends, so they are clamped to the
    # number of bins (not n - 1); otherwise the last bin would be cut off.
    bins_wide = max(1, int(np.round(grid_size / bin_size)))
    half = bins_wide // 2
    x_max, y_max = np.unravel_index(grid.argmax(), grid.shape)
    x_lo = max(0, x_max - half)
    x_hi = min(n_long_bins, x_max - half + bins_wide)
    y_lo = max(0, y_max - half)
    y_hi = min(n_lat_bins, y_max - half + bins_wide)
    grid = grid[x_lo:x_hi, y_lo:y_hi]

    # Report the true outer edges of the bins that were kept.
    bounds = ((float(long_edges[x_lo]), float(long_edges[x_hi])),
              (float(lat_edges[y_lo]), float(lat_edges[y_hi])))
    opt_coords = (long_min + x_max * long_bin_size, lat_min + y_max * lat_bin_size)
    return grid, bounds, opt_coords

## Start by choosing a city you want to visit. Choices are:

+ Cleveland
+ Champaign
+ Charlotte
+ Pittsburgh
+ Las Vegas
+ Montreal
+ Toronto
+ Phoenix
+ Madison

In [3]:
# City to get recommendations for; pick one of the choices listed above.
# Example: `CITY = 'Las Vegas'`
CITY = 'Las Vegas'

## Next, choose one or more categories of businesses you'd like to visit in this city. Choices are:

+ Restaurants
+ Shopping
+ Beauty & Spas
+ Nightlife
+ Bars
+ Active Life
+ Fashion
+ Coffee & Tea
+ Arts & Entertainment
+ Hotels & Travel


In [4]:
# Business categories to consider; pick one or more of the choices listed above.
# Example: `CATEGORIES = ['Restaurants', 'Shopping', 'Hotels & Travel']`
CATEGORIES = ['Restaurants', 'Shopping', 'Hotels & Travel']

In [5]:
# Load the chosen city's businesses (spaces in the city name become '-' in the
# file name) and index them by business_id.
df_bus = pd.read_csv('../preprocessed-data/final-businesses-by-city/' + CITY.replace(' ', '-') + '-businesses.csv')
df_bus = df_bus.set_index('business_id')
# Keep businesses with a positive value in at least one of the CATEGORIES
# columns. The explicit .copy() makes the result an independent frame, so
# adding columns to it later does not raise SettingWithCopyWarning.
df_bus = df_bus[(df_bus[CATEGORIES] > 0).any(axis=1)].copy()

In [6]:
# Reviews for the chosen city, restricted to the businesses kept in df_bus.
city_slug = CITY.replace(' ', '-')
reviews_all = pd.read_csv('../preprocessed-data/final-reviews-by-city/' + city_slug + '-reviews.csv')
is_kept_business = reviews_all['business_id'].isin(df_bus.index)
df_rev = reviews_all[is_kept_business]

In [7]:
# Users for the chosen city who wrote at least one of the reviews in df_rev,
# ordered from most to fewest reviews.
city_slug = CITY.replace(' ', '-')
users_all = pd.read_csv('../preprocessed-data/final-users-by-city/' + city_slug + '-users.csv')
wrote_kept_review = users_all['user_id'].isin(df_rev['user_id'])
df_user = users_all[wrote_kept_review].sort_values('n_reviews', ascending=False)

In [8]:
# Print a Yelp profile link for each of the first ten users in df_user.
url_line = 'User {} URL:  https://www.yelp.com/user_details?userid={}'
first_user_ids = df_user['user_id'][:10]
for position, yelp_id in enumerate(first_user_ids):
    print(url_line.format(position, yelp_id))

User 0 URL:  https://www.yelp.com/user_details?userid=bLbSNkLggFnqwNNzzq-Ijw
User 1 URL:  https://www.yelp.com/user_details?userid=PKEzKWv_FktMm2mGPjwd0Q
User 2 URL:  https://www.yelp.com/user_details?userid=UYcmGbelzRa0Q6JqzLoguw
User 3 URL:  https://www.yelp.com/user_details?userid=N3oNEwh0qgPqPP3Em6wJXw
User 4 URL:  https://www.yelp.com/user_details?userid=n86B7IkbU20AkxlFX_5aew
User 5 URL:  https://www.yelp.com/user_details?userid=C2C0GPKvzWWnP57Os9eQ0w
User 6 URL:  https://www.yelp.com/user_details?userid=3nDUQBjKyVor5wV0reJChg
User 7 URL:  https://www.yelp.com/user_details?userid=tH0uKD-vNwMoEc3Xk3Cbdg
User 8 URL:  https://www.yelp.com/user_details?userid=U4INQZOPSUaj8hMjLlZ3KA
User 9 URL:  https://www.yelp.com/user_details?userid=qewG3X2O4X6JKskxyyqFwQ


# Pause!

Click some of the URLs above to view different Yelp user profiles, and decide which people seem to have similar taste to you. Add those users' indices to the list below. For example, if User 0, User 5, and User 9 seem to have similar taste to you, put `USER_LIST = [0, 5, 9]` below.

In [9]:
# Indices (the "User N" numbers printed above) of the users whose taste matches yours.
USER_LIST = [0, 5, 9]

In [10]:
# Select the chosen users into a NEW frame and leave df_user untouched.
# Overwriting df_user here would make the cell non-repeatable: after one run
# it would hold only len(USER_LIST) rows, so running the cell again (e.g.
# after editing USER_LIST) would raise IndexError or pick the wrong users.
df_user_selected = df_user.iloc[USER_LIST]
U = np.array(df_user_selected['latent_1'])
B = np.array(df_bus['latent_1'])
# Entry [u, b] is user u's 'latent_1' value times business b's 'latent_1'
# value; averaging over the user axis gives one predicted rating per business.
pred_stars = (U[:, None] * B[None, :]).mean(axis=0)
df_bus['pred_stars'] = pred_stars

In [11]:
# One frame per requested category (in CATEGORIES order), each holding the
# businesses with a positive value in that category column, best prediction first.
df_bus_list_by_cat = []
for category in CATEGORIES:
    in_category = df_bus[df_bus[category] > 0]
    df_bus_list_by_cat.append(in_category.sort_values('pred_stars', ascending=False))

In [12]:
# Bin the predicted ratings by location using get_grid's default bin_size and
# grid_size, and unpack the grid window, its bounds and the best coordinates.
grid, bounds, opt_coords = get_grid(df_bus)

In [13]:
# Businesses that lie inside the bounds returned by get_grid, with their
# predictions rounded to whole stars.
long_bounds, lat_bounds = bounds
inside_grid = df_bus['longitude'].between(*long_bounds) & df_bus['latitude'].between(*lat_bounds)
df_bus_on_grid = df_bus[inside_grid].copy()
df_bus_on_grid['rounded_preds'] = np.round(df_bus_on_grid['pred_stars']).astype(int)

# Businesses within a quarter mile (in each direction) of the best coordinates,
# using the same miles-per-degree factors as get_grid.
opt_long, opt_lat = opt_coords
half_block_long = 0.25/54.4
half_block_lat = 0.25/68.941
inside_block = (df_bus_on_grid['longitude'].between(opt_long-half_block_long, opt_long+half_block_long) &
                df_bus_on_grid['latitude'].between(opt_lat-half_block_lat, opt_lat+half_block_lat))
df_bus_on_block = df_bus_on_grid[inside_block].sort_values('pred_stars', ascending=False)

# Marker positions and numbered labels for at most the ten best businesses.
n_show = min(10, len(df_bus_on_block))
top_of_block = df_bus_on_block.iloc[:n_show]
block_locations = list(zip(top_of_block['latitude'], top_of_block['longitude']))
info_box_content = ['{}. {}'.format(rank, name)
                    for rank, name in enumerate(top_of_block['name'], start=1)]

In [14]:
# Repeat each business location once per rounded predicted star so that
# higher-rated businesses contribute more points to the heatmap.
locations = []
heat_rows = df_bus_on_grid[['latitude', 'longitude', 'rounded_preds']]
for lat, lng, weight in heat_rows.itertuples(index=False):
    locations.extend([(lat, lng)] * weight)

# Heatmap of all businesses on the grid, plus markers for the top picks.
m = gmaps.Map()
heatmap = gmaps.heatmap_layer(locations)
markers = gmaps.marker_layer(block_locations, info_box_content=info_box_content)
for layer in (heatmap, markers):
    m.add_layer(layer)
m

# Here are our top recommendations for you:

In [15]:
# Print the top businesses near the best coordinates with their predicted ratings.
top_recs = df_bus_on_block.iloc[:n_show][['name', 'pred_stars']]
print('    BUSINESS NAME                                      PREDICTED RATING')
# The loop variable is deliberately NOT called `pred_stars`: that name holds
# the notebook-level array of predictions and must not be overwritten by a
# single float here.
for rank, (name, stars) in enumerate(zip(top_recs['name'], top_recs['pred_stars']), start=1):
    print('{:2}. {:50} {:.2f} stars'.format(rank, name, stars))

    BUSINESS NAME                                      PREDICTED RATING
 1. é by José Andrés                                   4.29 stars
 2. Mandarin Oriental, Las Vegas                       4.28 stars
 3. Picasso                                            4.22 stars
 4. Le Cirque                                          4.02 stars
 5. Harvest by Roy Ellamar                             4.01 stars
 6. Herringbone                                        3.97 stars
 7. Sage                                               3.92 stars
 8. Mastro's Ocean Club                                3.91 stars
 9. BARDOT Brasserie                                   3.88 stars
10. Eiffel Tower Restaurant                            3.88 stars
