## Neighborhood Venues Analysis for Café & Bar Crawls
IBM Data Science Capstone Project

#### Anthony Canterbury
September 29th, 2019

#### Packages:

In [1]:
# standard library
import json
import os
import random

# data handling
import numpy as np
import pandas as pd
import requests

# plotting
import matplotlib.cm as cm
import matplotlib.colors as colors

# machine learning
from sklearn.cluster import DBSCAN

# mapping
from geopy.distance import great_circle
from geopy.geocoders import Nominatim
import folium

#### Parameters:

In [2]:
# boundaries
MAX_PRICE = 3 # Foursquare tier for expensive
MAX_WALK = 0.8 # ~ 0.5 miles in km

# keys for filtering categories of venues
# NOTE(review): the category list printed further down (e.g. 'Salon / Barbershop',
# 'Steakhouse', 'Public Art') suggests these keys are matched as case-insensitive
# substrings of the Foursquare category name - confirm against the search code.
VENUE_PRIME = ['bar', 'pub', 'brewery']
# 'dessert' (was misspelled 'desert', which can never match a "Dessert Shop" category)
VENUE_SECONDARY = ['caf', 'coffee', 'tea', 'dessert', 'ice cream', 'donut', 'pastry']
VENUE_KEYS = VENUE_PRIME + VENUE_SECONDARY
MIN_PRIMES = 2     # minimum number of VENUE_PRIME venues
MIN_SECONDARY = 2  # minimum number of VENUE_SECONDARY venues
MAX_VENUES = 7     # upper bound on the number of venues

# Times
WEEK_DAY = 6 # Saturday
START_TIME = 1500 # 3 PM
END_TIME = 2200 # 10 PM



## 1 Preliminary Data

### Get and parse the data for Cincinnati neighborhoods

In [3]:
def fetchGeoJson(url, timeout=30):
    """Download ``url`` and return the decoded JSON document.

    Raises ``requests.HTTPError`` on a non-2xx answer instead of letting an
    error page reach the JSON decoder, and gives up after ``timeout`` seconds
    instead of hanging the notebook.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()

# neighborhood geo data
nUrl = 'https://opendata.arcgis.com/datasets/572561553c9e4d618d2d7939c5261d46_0.geojson'
nJson = fetchGeoJson(nUrl)

# neighborhood business district geo data
bUrl = 'https://opendata.arcgis.com/datasets/0fa9e54fc1dd465886c3a5d980a97955_11.geojson'
bJson = fetchGeoJson(bUrl)

In [4]:
# get the center coordinate from matrix
def centeroidCord(arr, multi):
    """Return the center of the bounding box around a set of coordinates.

    Parameters
    ----------
    arr : sequence
        With ``multi`` false: a sequence of points, each indexable as
        ``point[0]`` (x) and ``point[1]`` (y).
        With ``multi`` true: a sequence of such point sequences (rings).
    multi : bool
        Whether ``arr`` holds several rings instead of a single one.

    Returns
    -------
    tuple
        ``(y center, x center)`` - note the swapped order; the callers unpack
        it as ``Lat, Long``.

    Raises
    ------
    ValueError
        If ``arr`` contains no points at all.
    """
    rings = arr if multi else [arr]

    # The box must span every ring; resetting it per ring (as the previous
    # implementation did) left only the last ring in the result.
    points = [point for ring in rings for point in ring]
    if not points:
        raise ValueError('centeroidCord needs at least one coordinate')

    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)
    return y1 + ((y2 - y1) / 2), x1 + ((x2 - x1) / 2)

# build the neighborhood dataframe from neighborhood json (nJson) and get the approximate center coordinates
neighborhoods = []
for feature in nJson['features']:
    Lat, Long = centeroidCord(feature['geometry']['coordinates'][0], feature['geometry']['type'] != 'Polygon')
    # BusinessDistrict 0 = no business district matched (yet)
    neighborhoods.append((feature['properties']['SNA_NAME'], 0, Lat, Long))

cincyNeighs = pd.DataFrame(neighborhoods, columns=['Neighborhood', 'BusinessDistrict', 'Latitude', 'Longitude'])

# CUF to Clifton Heights
cincyNeighs.loc[cincyNeighs.Neighborhood == 'CUF', ['Neighborhood']] = 'Clifton Heights'

# North Avondale
cincyNeighs.loc[cincyNeighs.Neighborhood == 'North Avondale - Paddock Hills', ['Neighborhood']] = 'North Avondale'

# if a business district is available use its center coordinates and create multiples
# Second districts of a neighborhood are collected here and appended in one go
# after the loop (DataFrame.append no longer exists in pandas 2.x). Appending
# afterwards also means a primary-district update below can only ever touch
# the original neighborhood rows.
extraDistricts = []
for feature in bJson['features']:
    Lat, Long = centeroidCord(feature['geometry']['coordinates'][0], feature['geometry']['type'] != 'Polygon')
    # parse for multiple neighborhood name
    districtNum = 1
    districtName = feature['properties']['NBDNAME']
    if 'OTR Vine' in districtName:
        # becomes the primary Over-the-Rhine district
        districtName = 'Over-the-Rhine'
    if 'OTR' in districtName:
        # any other OTR district becomes the secondary one
        districtName = 'Over-the-Rhine'
        districtNum = 2
    if '(A)' in districtName:
        districtName = districtName.replace(' (A)', '')
    if '(B)' in districtName:
        districtName = districtName.replace(' (B)', '')
        districtNum = 2
    if 'Hyde Park East' in districtName:
        districtName = 'Hyde Park'

    if districtNum == 1:
        # overwrite the neighborhood's bounding-box center with the district's center
        cincyNeighs.loc[cincyNeighs.Neighborhood == districtName, ['BusinessDistrict', 'Latitude', 'Longitude']] = districtNum, Lat, Long
    else:
        extraDistricts.append({'Neighborhood': districtName, 'BusinessDistrict': districtNum, 'Latitude': Lat, 'Longitude': Long})

if extraDistricts:
    cincyNeighs = pd.concat(
        [cincyNeighs, pd.DataFrame(extraDistricts, columns=cincyNeighs.columns)],
        ignore_index=True)

# display only: the sorted copy is shown, cincyNeighs itself keeps its order
cincyNeighs.sort_values(by=['Neighborhood']).reset_index(drop=True)

Unnamed: 0,Neighborhood,BusinessDistrict,Latitude,Longitude
0,Avondale,1,39.144072,-84.499246
1,Avondale,2,39.145541,-84.491047
2,Bond Hill,1,39.176159,-84.466794
3,California,0,39.068846,-84.418962
4,Camp Washington,1,39.13679,-84.537583
5,Carthage,1,39.195554,-84.478737
6,Clifton,1,39.143023,-84.520058
7,Clifton Heights,1,39.128135,-84.517137
8,College Hill,1,39.200302,-84.546862
9,Columbia Tusculum,1,39.114023,-84.436198


### Cincinnati Neighborhood Map

In [25]:
# get the center of Cincinnati
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode('Cincinnati')
if location is None:
    # geocode() returns None when nothing was found; fail with a clear message
    # instead of an AttributeError on the next line
    raise RuntimeError("Nominatim returned no result for 'Cincinnati'")

latitudeOffset = 0.03 # adjustment for better centering
latitude = location.latitude + latitudeOffset
longitude = location.longitude
print('Coordinate of Cincinnati are {}, {}.'.format(latitude, longitude))

Coordinate of Cincinnati are 39.1314537, -84.5124602.


In [27]:
# base map centered on the adjusted Cincinnati coordinates
cincyNeighMap = folium.Map(location=[latitude, longitude], zoom_start=12)

# data-less choropleth: only used to draw the neighborhood outlines
folium.Choropleth(
    geo_data=nJson,
    key_on='feature.properties.SNA_NAME',
    fill_color='YlOrRd',
    fill_opacity=0.1,
    line_opacity=0.5,
    legend_name='Cincinnati Neighborhoods'
).add_to(cincyNeighMap)

# shared look of every neighborhood marker
neighMarkerStyle = dict(radius=5, color='blue', fill=True, fill_color='#3186cc', fill_opacity=0.7)

# one marker per row of cincyNeighs, placed at the approximate business center
for markerLat, markerLng, neighName in zip(cincyNeighs['Latitude'], cincyNeighs['Longitude'], cincyNeighs['Neighborhood']):
    neighPopup = folium.Popup('{}'.format(neighName), parse_html=True)
    folium.CircleMarker([markerLat, markerLng], popup=neighPopup, **neighMarkerStyle).add_to(cincyNeighMap)

# display map
cincyNeighMap

From visual inspection, the markers are reasonably well centered on the neighborhood business districts.

The only outlier is Riverside, due to its odd shape and lack of business districts. We'll go ahead and remove it from the list and update the map.

*Note: Not all business districts are represented but this will help with collecting the data.*

In [7]:
# drop Riverside from the neighborhood list
keepRows = cincyNeighs.Neighborhood != 'Riverside'
cincyNeighs = cincyNeighs[keepRows]

# the map is rebuilt from scratch because markers are not easily removed or updated
cincyNeighMap = folium.Map(location=[latitude, longitude], zoom_start=12)

# data-less choropleth: only used to draw the neighborhood outlines
folium.Choropleth(
    geo_data=nJson,
    key_on='feature.properties.SNA_NAME',
    fill_color='YlOrRd',
    fill_opacity=0.1,
    line_opacity=0.5,
    legend_name='Cincinnati Neighborhoods'
).add_to(cincyNeighMap)

# shared look of every neighborhood marker
neighMarkerStyle = dict(radius=5, color='blue', fill=True, fill_color='#3186cc', fill_opacity=0.7)

# one marker per remaining row, placed at the approximate business center
for markerLat, markerLng, neighName in zip(cincyNeighs['Latitude'], cincyNeighs['Longitude'], cincyNeighs['Neighborhood']):
    neighPopup = folium.Popup('{}'.format(neighName), parse_html=True)
    folium.CircleMarker([markerLat, markerLng], popup=neighPopup, **neighMarkerStyle).add_to(cincyNeighMap)

# display map
cincyNeighMap

### Get and parse the venue data from Foursquare

In [8]:
# Foursquare credentials and API version.
# The key pair is read from the environment so it is never stored in the notebook:
# set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# Both are None when unset; only the cells that call the Foursquare API need them.
# NOTE: the key pair that used to be hard-coded here was published with the
# notebook and should be treated as compromised and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET')
VERSION = '20190928'

#### Grab max of 100 venues from a 1000 meter radius
*Note: It matters when this is run because Foursquare will return businesses it believes are open!*

In [9]:
# cached venue search results: reading the CSV avoids spending API calls on
# every run; the saved index column ('Unnamed: 0') is discarded
venuesDf = pd.read_csv('cincyVenues.csv').drop(columns=['Unnamed: 0'])

In [10]:
# Disabled live-API path, kept for reference: it would build venuesDf from a
# `venues` list (not defined in this notebook as shown) instead of the CSV.
# NOTE(review): the column list below names 9 columns (it includes
# 'BusinessDistrict') while the loaded CSV has 8 - reconcile before re-enabling.
# convert the venues list into a new DataFrame
#venuesDf = pd.DataFrame(venues)

# define the column names
#venuesDf.columns = ['Neighborhood', 'BusinessDistrict', 'NeighborhoodLatitude', 'NeighborhoodLongitude', 'VenueName', 
#                    'VenuId', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

# quick sanity check: (rows, columns) followed by the first 20 rows
print(venuesDf.shape)
venuesDf.head(20)

(738, 8)


Unnamed: 0,Neighborhood,NeighborhoodLatitude,NeighborhoodLongitude,VenueName,VenuId,VenueLatitude,VenueLongitude,VenueCategory
0,Linwood,39.104213,-84.415924,Streetside Brewery,57e6fea3498e425ed4b9ccd6,39.109193,-84.432947,Brewery
1,Linwood,39.104213,-84.415924,Luckman Coffee Company,4b54e326f964a5206dd127e3,39.106897,-84.398441,Coffee Shop
2,Linwood,39.104213,-84.415924,United Dairy Farmers (UDF),4b91e411f964a520e9dd33e3,39.101715,-84.43214,Ice Cream Shop
3,Linwood,39.104213,-84.415924,Starbucks,4b55ce19f964a520d8f027e3,39.106955,-84.39678,Coffee Shop
4,Linwood,39.104213,-84.415924,Great Clips,4b50b272f964a520092e27e3,39.107009,-84.397995,Salon / Barbershop
5,Linwood,39.104213,-84.415924,Bad Tom Brewing,4eff2c20490182a1a702d379,39.119767,-84.418326,Brewery
6,East Walnut Hills,39.128889,-84.476823,The Woodburn Brewery & Taproom,55461bf6498eac118325e62e,39.12903,-84.476892,Beer Bar
7,East Walnut Hills,39.128889,-84.476823,Myrtle's Punch House,5473d783498ec0bbca9021d6,39.124276,-84.47613,Cocktail Bar
8,East Walnut Hills,39.128889,-84.476823,The Growler House,545d54ab498ea427d9af9d2d,39.129763,-84.477778,Bar
9,East Walnut Hills,39.128889,-84.476823,Cafe Desales,4f48087ee4b01863529f8568,39.129781,-84.476817,Café


#### Cleanup the venues
- Remove duplicates
- Remove categories that slipped through

In [11]:
# there should be quite a few duplicates
# keep the first row per Foursquare id; the id column is spelled 'VenuId' in the data
cincyVenues = venuesDf.drop_duplicates(subset=['VenuId'])
print(cincyVenues.shape)

# unique categories, listed to spot the ones that should not be part of a crawl
cincyVenues['VenueCategory'].unique().tolist()

(268, 8)


['Brewery',
 'Coffee Shop',
 'Ice Cream Shop',
 'Salon / Barbershop',
 'Beer Bar',
 'Cocktail Bar',
 'Bar',
 'Café',
 'Gastropub',
 'Pub',
 'Donut Shop',
 'Wine Bar',
 'Bubble Tea Shop',
 'Steakhouse',
 'Gay Bar',
 'Juice Bar',
 'Whisky Bar',
 'Dive Bar',
 'Public Art',
 'Cafeteria',
 'Hotel Bar',
 'Sports Bar',
 'Karaoke Bar',
 'Hookah Bar',
 'College Cafeteria',
 'Tea Room',
 'Irish Pub']

Most of these will fit but we don't see people wanting to gather at cafeterias or barbershops (well maybe some much older people).

In [12]:
# remove cafeterias
cincyVenues = cincyVenues[cincyVenues.VenueCategory != 'Cafeteria']
cincyVenues = cincyVenues[cincyVenues.VenueCategory != 'College Cafeteria']

# remove salons
cincyVenues = cincyVenues[cincyVenues.VenueCategory != 'Salon / Barbershop']

cincyVenues.shape

(260, 8)

### Map the venues

In [28]:
# base map centered on the adjusted Cincinnati coordinates
venueMap = folium.Map(location=[latitude, longitude], zoom_start=12)

# data-less choropleth: only used to draw the neighborhood outlines
folium.Choropleth(
    geo_data=nJson,
    key_on='feature.properties.SNA_NAME',
    fill_color='YlOrRd',
    fill_opacity=0.1,
    line_opacity=0.5,
    legend_name='Cincinnati Neighborhood Venues'
).add_to(venueMap)

# shared look of every venue marker
venueMarkerStyle = dict(radius=5, color='orange', fill=True, fill_color='#3186cc', fill_opacity=0.7)

# one marker per venue, with the venue name as popup
for venueLat, venueLng, venueName in zip(cincyVenues['VenueLatitude'], cincyVenues['VenueLongitude'], cincyVenues['VenueName']):
    venuePopup = folium.Popup('{}'.format(venueName), parse_html=True)
    folium.CircleMarker([venueLat, venueLng], popup=venuePopup, **venueMarkerStyle).add_to(venueMap)

# display map
venueMap

### Clustering our venues by location

In order to group and restrict venues down to valid clusters we'll be using DBSCAN (Density-Based Spatial Clustering of Applications with Noise).

In [14]:
# pull venue coordinates into a matrix
coords = cincyVenues[['VenueLatitude', 'VenueLongitude']].to_numpy()

# the haversine metric works on radians, so the walking distance (km) is
# divided by the Earth radius (km) to express it as an angle
kmsPerRadian = 6371.0088
epsilon = MAX_WALK / kmsPerRadian # little less than 0.5 miles in km radians

cvDb = DBSCAN(eps=epsilon, min_samples=4, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
clusterLabels = cvDb.labels_

# NOTE: numClusters counts every distinct label, i.e. it includes the noise
# label -1 when outliers exist. Later cells rely on that (they use
# numClusters - 1 for the real clusters), so the definition is kept.
numClusters = len(set(clusterLabels))

# coordinates per real cluster; built from the labels that actually occur
# (noise excluded) so no empty trailing entry is produced for a label that
# does not exist
clusters = pd.Series([coords[clusterLabels == n] for n in sorted(set(clusterLabels) - {-1})])
print('Number of clusters: {}'.format(numClusters))
set(clusterLabels)

Number of clusters: 12


{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}

In [15]:
# combine the cluster data with cincyVenues
# assign() returns a new frame, which avoids the SettingWithCopyWarning that a
# direct column assignment raises on this filtered slice of venuesDf;
# clusterLabels is positional and must have one entry per row of cincyVenues
cincyVenues = cincyVenues.assign(DbCluster=clusterLabels)
cincyVenues.head(10)

Unnamed: 0,Neighborhood,NeighborhoodLatitude,NeighborhoodLongitude,VenueName,VenuId,VenueLatitude,VenueLongitude,VenueCategory,DbCluster
0,Linwood,39.104213,-84.415924,Streetside Brewery,57e6fea3498e425ed4b9ccd6,39.109193,-84.432947,Brewery,7
1,Linwood,39.104213,-84.415924,Luckman Coffee Company,4b54e326f964a5206dd127e3,39.106897,-84.398441,Coffee Shop,-1
2,Linwood,39.104213,-84.415924,United Dairy Farmers (UDF),4b91e411f964a520e9dd33e3,39.101715,-84.43214,Ice Cream Shop,-1
3,Linwood,39.104213,-84.415924,Starbucks,4b55ce19f964a520d8f027e3,39.106955,-84.39678,Coffee Shop,-1
5,Linwood,39.104213,-84.415924,Bad Tom Brewing,4eff2c20490182a1a702d379,39.119767,-84.418326,Brewery,-1
6,East Walnut Hills,39.128889,-84.476823,The Woodburn Brewery & Taproom,55461bf6498eac118325e62e,39.12903,-84.476892,Beer Bar,0
7,East Walnut Hills,39.128889,-84.476823,Myrtle's Punch House,5473d783498ec0bbca9021d6,39.124276,-84.47613,Cocktail Bar,0
8,East Walnut Hills,39.128889,-84.476823,The Growler House,545d54ab498ea427d9af9d2d,39.129763,-84.477778,Bar,0
9,East Walnut Hills,39.128889,-84.476823,Cafe Desales,4f48087ee4b01863529f8568,39.129781,-84.476817,Café,0
10,East Walnut Hills,39.128889,-84.476823,BrewRiver Gastropub,4fea02ede5e8dfeeb65b5000,39.121758,-84.475027,Gastropub,0


#### Visualize the clusters

In [29]:
clusterMap = folium.Map(location=[latitude, longitude], zoom_start=12)

# choropleth map without data to outline the neighborhoods
folium.Choropleth(
    geo_data=nJson,
    key_on='feature.properties.SNA_NAME',
    fill_color='YlOrRd',
    fill_opacity=0.1,
    line_opacity=0.5,
    legend_name='Cincinnati Neighborhoods'
).add_to(clusterMap)

# one random hex color per label counted in numClusters; stored as clusterColors
# so the matplotlib.colors module imported as `colors` is not overwritten
clusterColors = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)]) for i in range(numClusters)]

# add venues with cluster colors
for lat, lng, venue, clustN in zip(cincyVenues['VenueLatitude'], cincyVenues['VenueLongitude'], cincyVenues['VenueName'], cincyVenues['DbCluster']):
    label = '{}'.format(venue)
    label = folium.Popup(label, parse_html=True)

    # outliers (label -1): red border around a blue fill; otherwise the
    # cluster's random color for both. int() replaces np.int, which was
    # removed from NumPy in 1.24.
    isOutlier = clustN == -1
    fillColor = 'blue' if isOutlier else clusterColors[int(clustN)]
    borderColor = 'red' if isOutlier else clusterColors[int(clustN)]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=borderColor,
        fill=True,
        fill_color=fillColor,
        fill_opacity=0.8).add_to(clusterMap)

# display map
clusterMap

The blue circles with a red border mark the outliers: venues that DBSCAN labeled as noise because they are not within walking distance (about half a mile) of a dense group of at least four venues. Outliers are of course bad spots for the event, so we should remove them from our dataset.

In [17]:
# remove the outliers, i.e. the venues DBSCAN labeled -1
cincyVenues = cincyVenues[cincyVenues.DbCluster != -1]

# numClusters counted the outlier label as well, hence the "- 1"
print('We are now down to {}'.format(cincyVenues.shape[0])+' venues! And {}'.format(numClusters -1)+(' neighborhood sections to pick from.'))

We are now down to 202 venues! And 11 neighborhood sections to pick from.


#### And map again

In [30]:
clusterMap = folium.Map(location=[latitude, longitude], zoom_start=12)

# choropleth map without data to outline the neighborhoods
folium.Choropleth(
    geo_data=nJson,
    key_on='feature.properties.SNA_NAME',
    fill_color='YlOrRd',
    fill_opacity=0.1,
    line_opacity=0.5,
    legend_name='Cincinnati Neighborhood Venues'
).add_to(clusterMap)

# one random hex color per remaining cluster (numClusters still counts the
# removed outlier label, hence the "- 1"); stored as clusterColors so the
# matplotlib.colors module imported as `colors` is not overwritten
clusterColors = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)]) for i in range(numClusters - 1)]

# add venues with cluster colors
for lat, lng, venue, clustN in zip(cincyVenues['VenueLatitude'], cincyVenues['VenueLongitude'], cincyVenues['VenueName'], cincyVenues['DbCluster']):
    label = '{}'.format(venue)
    label = folium.Popup(label, parse_html=True)

    # int() replaces np.int, which was removed from NumPy in 1.24
    c = clusterColors[int(clustN)]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=c,
        fill=True,
        fill_color=c,
        fill_opacity=0.8).add_to(clusterMap)

# display map
clusterMap

## 2 Recommendation System

### Pull in venue details from Foursquare
- Venue Hours to collect days and hours. Then filter.
- Venue Details to collect price and rating

In [None]:
# cached venue hours: reading the CSV avoids spending API calls on every run;
# the saved index column ('Unnamed: 0') is discarded
venuesHours = pd.read_csv('cincyVenueHours.csv').drop(columns=['Unnamed: 0'])

In [68]:
# Disabled live-API path, kept for reference: it would build venuesHours from a
# `vHours` list (not defined in this notebook as shown) instead of the CSV.
# NOTE(review): the column list below names 5 columns (it includes 'Failed')
# while the loaded CSV has 4 - reconcile before re-enabling.
#venuesHours = pd.DataFrame(vHours)

# define the column names
#venuesHours.columns = ['VenueId', 'DayOf', 'Start', 'End', 'Failed']

# quick sanity check: (rows, columns) followed by the first 20 rows
print(venuesHours.shape)
venuesHours.head(20)

(202, 4)


Unnamed: 0,VenueId,DayOf,Start,End
0,57e6fea3498e425ed4b9ccd6,True,1000,2300
1,55461bf6498eac118325e62e,True,1200,0
2,5473d783498ec0bbca9021d6,True,1200,0
3,545d54ab498ea427d9af9d2d,True,1200,0
4,4f48087ee4b01863529f8568,True,1200,0
5,4fea02ede5e8dfeeb65b5000,True,1100,2300
6,4b6b50caf964a52056002ce3,True,1000,1800
7,4b4f3d05f964a52060fe26e3,True,500,100
8,54e8dc93498e2565b4a9f267,True,1100,0
9,51c4888e498ea0454b65c7cd,True,1200,200


#### Merge, format, and predict missing times

In [None]:
# merge times with cincyVenues


In [71]:
# normalize data

In [None]:
# predict missing times with ??? in vicinity

In [None]:
# remove closed venues

#### Pull details for open venues

In [None]:
def _nestedValue(result, outer, inner):
    """Return result[outer][inner], or None when either level is missing."""
    section = result.get(outer)
    return section.get(inner) if isinstance(section, dict) else None

# collected rows: (name, venue id, cluster, rating, likes, price tier)
vDetails = []

# cincyVenues carries the Foursquare id as 'VenuId' (the spelling used in
# cincyVenues.csv); 'VenueId' is preferred only if such a column has been added
venueIdCol = 'VenueId' if 'VenueId' in cincyVenues.columns else 'VenuId'

# credentials travel as query parameters rather than being formatted into the URL
detailParams = {'client_id': CLIENT_ID, 'client_secret': CLIENT_SECRET, 'v': VERSION}

for vId, name, cluster in zip(cincyVenues[venueIdCol], cincyVenues['VenueName'], cincyVenues['DbCluster']):
    url = "https://api.foursquare.com/v2/venues/{}".format(vId)

    response = requests.get(url, params=detailParams, timeout=30)
    # stop with a clear HTTP error (bad credentials, quota, ...) instead of a
    # KeyError on the missing 'venue' entry
    response.raise_for_status()
    result = response.json()['response']['venue']

    # price, rating and likes are optional; a missing value stays None
    price = _nestedValue(result, 'price', 'tier')
    rating = result.get('rating')
    likes = _nestedValue(result, 'likes', 'count')

    vDetails.append((
        name,
        vId,
        cluster,
        rating,
        likes,
        price
    ))

### Predict missing data

In [72]:
# normalize rest of the data

In [None]:
# predict missing ratings

In [None]:
# predict missing price

In [73]:
# remove prices not in range

### Cluster new data

### Analyze the clusters

### Build recommendation ranking

## Main Event
#### Crawling Sponsors or Traveling Salesman