## Neighborhood Venues Analysis for Café & Bar Crawls
IBM Data Science Capstone Project

#### Anthony Canterbury
September 29th, 2019

#### Packages:

In [1]:
# standard data
import numpy as np
import pandas as pd
import random
import json
import requests

# plotting
import matplotlib.cm as cm
import matplotlib.colors as colors

# machine learning
from sklearn.cluster import DBSCAN

# mapping
from geopy.distance import great_circle
from geopy.geocoders import Nominatim
import folium

#### Parameters:

In [2]:
# boundaries
MAX_PRICE = 3 # Foursquare tier for expensive
MAX_WALK = 0.8 # ~ 0.5 miles in km

# keys for filtering categories of venues (matched as lowercase substrings of the category name)
VENUE_PRIME = ['bar', 'pub', 'brewery', 'lounge']
# 'dessert' was previously misspelled 'desert', which can never match a "Dessert ..." category
VENUE_SECONDARY = ['caf', 'coffee', 'tea', 'dessert', 'ice cream', 'donut']
VENUE_KEYS = VENUE_PRIME + VENUE_SECONDARY
FS_SECTIONS = ['drinks', 'coffee'] # these were singled out after several api checks
MIN_PRIMES = 2 # minimum number of prime venues
MIN_SECONDARY = 2 # minimum number of secondary venues
MIN_VENUES = 5 # also used as the DBSCAN min_samples value
MAX_VENUES = 7

# Times
WEEK_DAY = 6 # Saturday
START_TIME = 1500 # 3 PM
END_TIME = 2200 # 10 PM



## 1 Preliminary Data

### Get and parse the data for Cincinnati neighborhoods

In [3]:
def _fetchGeoJson(url, timeout=30):
    """Download `url` and return its body decoded as JSON.

    A timeout is passed so a stalled server cannot hang the notebook, and
    HTTP error statuses raise instead of being parsed as if they were data.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()

# neighborhood geo data
nUrl = 'https://opendata.arcgis.com/datasets/572561553c9e4d618d2d7939c5261d46_0.geojson'
nJson = _fetchGeoJson(nUrl)

# neighborhood business district geo data
bUrl = 'https://opendata.arcgis.com/datasets/0fa9e54fc1dd465886c3a5d980a97955_11.geojson'
bJson = _fetchGeoJson(bUrl)

In [4]:
# get the center coordinate from matrix
def centeroidCord(arr, multi):
    """Return the center of the bounding box around a set of coordinates.

    Parameters:
        arr: when `multi` is False, a sequence of points indexed as
            [x, y]; when `multi` is True, a sequence of such sequences
            (e.g. several rings).
        multi: whether `arr` has the extra level of nesting.

    Returns:
        A (y, x) tuple holding the midpoint of the y range and the midpoint
        of the x range; the callers in this notebook unpack it as
        (Latitude, Longitude).

    Raises:
        ValueError: if there are no points to measure.
    """
    # Flatten so the bounds are taken over every point. Previously the bounds
    # were reset for each nested sequence, so only the last one counted.
    if multi:
        points = [cord for ring in arr for cord in ring]
    else:
        points = list(arr)

    if not points:
        raise ValueError('centeroidCord needs at least one coordinate')

    xs = [cord[0] for cord in points]
    ys = [cord[1] for cord in points]
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)
    return y1 + ((y2 - y1) / 2), x1 + ((x2 - x1) / 2)

# assemble one row per neighborhood from the neighborhood json (nJson): name,
# an initial BusinessDistrict of 0, the approximate center coordinates, and the acreage
neighborhoods = []
for feature in nJson['features']:
    geometry = feature['geometry']
    properties = feature['properties']
    isMulti = geometry['type'] != 'Polygon'
    Lat, Long = centeroidCord(geometry['coordinates'][0], isMulti)
    neighborhoods.append((properties['SNA_NAME'], 0, Lat, Long, properties['ACRES']))

cincyNeighsAcres = pd.DataFrame(neighborhoods)
cincyNeighsAcres.columns = ['Neighborhood', 'BusinessDistrict', 'Latitude', 'Longitude', 'Acres']

# working frame without the acreage column
cincyNeighs = cincyNeighsAcres.drop(columns=['Acres'])

# rename: CUF to Clifton Heights, and North Avondale - Paddock Hills to North Avondale
renames = (
    ('CUF', 'Clifton Heights'),
    ('North Avondale - Paddock Hills', 'North Avondale'),
)
for oldName, newName in renames:
    cincyNeighs.loc[cincyNeighs.Neighborhood == oldName, ['Neighborhood']] = newName

# if a business district is available use its center coordinates and create multiples
for feature in bJson['features']:
    Lat, Long = centeroidCord(feature['geometry']['coordinates'][0], feature['geometry']['type'] != 'Polygon')
    # parse for multiple neighborhood name
    districtNum = 1
    districtName = feature['properties']['NBDNAME']
    # 'OTR Vine' becomes district 1 of Over-the-Rhine; once renamed it no
    # longer contains 'OTR', so only the other OTR districts become district 2
    if 'OTR Vine' in districtName:
        districtName = 'Over-the-Rhine'
    if 'OTR' in districtName:
        districtName = 'Over-the-Rhine'
        districtNum = 2
    # '(A)' / '(B)' suffixes mark the first / second district of a neighborhood
    if '(A)' in districtName:
        districtName = districtName.replace(' (A)', '')
    if '(B)' in districtName:
        districtName = districtName.replace(' (B)', '')
        districtNum = 2
    if 'Hyde Park East' in districtName:
        districtName = 'Hyde Park'

    if districtNum == 1:
        # first district: overwrite the neighborhood's own row(s) in place
        cincyNeighs.loc[cincyNeighs.Neighborhood == districtName, ['BusinessDistrict', 'Latitude', 'Longitude']] = districtNum, Lat, Long
    else:
        # additional district: add a new row (DataFrame.append was removed in
        # pandas 2.0, so build a one-row frame and concatenate instead)
        newRow = pd.DataFrame([{'Neighborhood': districtName, 'BusinessDistrict': districtNum, 'Latitude': Lat, 'Longitude': Long}])
        cincyNeighs = pd.concat([cincyNeighs, newRow], ignore_index=True)

# display only: the sorted result is not assigned back to cincyNeighs
cincyNeighs.sort_values(by=['Neighborhood']).reset_index(drop=True)

Unnamed: 0,Neighborhood,BusinessDistrict,Latitude,Longitude
0,Avondale,1,39.144072,-84.499246
1,Avondale,2,39.145541,-84.491047
2,Bond Hill,1,39.176159,-84.466794
3,California,0,39.068846,-84.418962
4,Camp Washington,1,39.13679,-84.537583
5,Carthage,1,39.195554,-84.478737
6,Clifton,1,39.143023,-84.520058
7,Clifton Heights,1,39.128135,-84.517137
8,College Hill,1,39.200302,-84.546862
9,Columbia Tusculum,1,39.114023,-84.436198


### Cincinnati Neighborhood Map

In [5]:
# look up Cincinnati with Nominatim to get a center point for the maps
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode('Cincinnati')

# the latitude is shifted by 0.03 as an adjustment for better centering
latitude, longitude = location.latitude + 0.03, location.longitude
print('Coordinate of Cincinnati are {}, {}.'.format(latitude, longitude))

Coordinate of Cincinnati are 39.1314537, -84.5124602.


We're going to make a lot of maps, so let's make a function. Markers will be unique to each map, so we'll leave them out of the function.

In [6]:
def makeMap():
    """Create a new folium map with the neighborhood choropleth layer added.

    Reads the module-level `latitude`, `longitude`, `nJson` and
    `cincyNeighsAcres`. The choropleth is keyed on the GeoJSON `SNA_NAME`
    property and uses the 'Acres' column as its data. No markers are added.

    Returns:
        The folium.Map instance.
    """
    baseMap = folium.Map(location=[latitude, longitude], zoom_start=12)

    # choropleth layer that outlines the neighborhoods
    outlineLayer = folium.Choropleth(
        geo_data=nJson,
        data=cincyNeighsAcres,
        columns=['Neighborhood','Acres'],
        key_on='feature.properties.SNA_NAME',
        fill_color='YlOrRd',
        fill_opacity=0.3,
        line_opacity=0.5,
        legend_name='Cincinnati Neighborhood Acres'
    )
    outlineLayer.add_to(baseMap)

    return baseMap

In [7]:
cincyNeighMap = makeMap()

# add approximate business center markers to map, each with a popup naming the neighborhood
markerStyle = dict(radius=5, color='blue', fill=True, fill_color='#3186cc', fill_opacity=0.7)
for lat, lng, neighborhood in zip(cincyNeighs['Latitude'], cincyNeighs['Longitude'], cincyNeighs['Neighborhood']):
    label = folium.Popup('{}'.format(neighborhood), parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=label, **markerStyle)
    marker.add_to(cincyNeighMap)

# display map
cincyNeighMap

From visual inspection, the markers are centered pretty well on the neighborhood business districts.

The only outlier, due to its odd shape and lack of business districts, is Riverside. We'll go ahead and remove it from the list and update the map.

*Note: Not all business districts are represented but this will help with collecting the data.*

In [8]:
# keep every neighborhood except Riverside
keepRows = cincyNeighs['Neighborhood'] != 'Riverside'
cincyNeighs = cincyNeighs[keepRows]

# render the map again from scratch; there is not a super easy way to remove or update markers
cincyNeighMap = makeMap()

# add approximate business center markers to map
markerRows = zip(cincyNeighs['Latitude'], cincyNeighs['Longitude'], cincyNeighs['Neighborhood'])
for lat, lng, neighborhood in markerRows:
    label = folium.Popup('{}'.format(neighborhood), parse_html=True)
    folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(cincyNeighMap)

# display map
cincyNeighMap

### Get and parse the venue data from Foursquare

In [9]:
# Foursquare Credentials and Version.
# The credentials are read from the environment so that no secret is stored in
# the notebook; set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET to try it.
# NOTE(review): the key pair that used to be hard-coded here was published and
# should be revoked.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20190928'

#### Grab max of 100 venues from a 1000 meter radius
*Note: These parameters were examined in detail through many iterations before being selected as optimal.*

In [None]:
# load the venues from csv in order to not make too many api calls, discarding
# the 'Unnamed: 0' column that comes along with the file
venuesDf = pd.read_csv('cincyVenues.csv').drop(columns=['Unnamed: 0'])

In [21]:
# report the (rows, columns) shape, then preview the first 20 venue rows
print(venuesDf.shape)
venuesDf.head(20)

(685, 9)


Unnamed: 0,Neighborhood,BusinessDistrict,NeighborhoodLatitude,NeighborhoodLongitude,VenueName,VenuId,VenueLatitude,VenueLongitude,VenueCategory
0,Linwood,0,39.104213,-84.415924,Dennert H Distribtg,4f32494419836c91c7c8b7f7,39.108777,-84.421232,Wine Bar
1,East Walnut Hills,1,39.128889,-84.476823,The Woodburn Brewery & Taproom,55461bf6498eac118325e62e,39.12903,-84.476892,Beer Bar
2,East Walnut Hills,1,39.128889,-84.476823,Myrtle's Punch House,5473d783498ec0bbca9021d6,39.124276,-84.47613,Cocktail Bar
3,East Walnut Hills,1,39.128889,-84.476823,The Growler House,545d54ab498ea427d9af9d2d,39.129763,-84.477778,Bar
4,East Walnut Hills,1,39.128889,-84.476823,BrewRiver Gastropub,4fea02ede5e8dfeeb65b5000,39.121758,-84.475027,Gastropub
5,East Walnut Hills,1,39.128889,-84.476823,The Skunk Lounge,5182cdbd498e1c1b38b47f1c,39.124213,-84.476246,Lounge
6,East Walnut Hills,1,39.128889,-84.476823,Cliche,5d6459abca17630008abf539,39.12382,-84.47704,Bar
7,Queensgate,0,39.108472,-84.533758,City West Brewing Company,580ceb4a38faa26bf32db135,39.108208,-84.525736,Brewery
8,Queensgate,0,39.108472,-84.533758,The Playhouse,4e9a404977c807974bd69725,39.106017,-84.541503,Bar
9,Queensgate,0,39.108472,-84.533758,Royal Imports,4f3246f419836c91c7c7cd1e,39.102755,-84.526398,Wine Bar


#### Cleanup the venues
- Remove duplicates
- Remove categories that slipped through

In [23]:
# there should be quite a few duplicates; keep the first row seen for each venue id
cincyVenues = venuesDf.drop_duplicates(subset='VenueId', keep='first')
print(cincyVenues.shape)

# list each distinct category once
cincyVenues.VenueCategory.unique().tolist()

(395, 9)


['Wine Bar',
 'Beer Bar',
 'Cocktail Bar',
 'Bar',
 'Gastropub',
 'Lounge',
 'Brewery',
 'Whisky Bar',
 'Dive Bar',
 'Pub',
 'Sports Bar',
 'Karaoke Bar',
 'Gay Bar',
 'Ice Cream Shop',
 'Coffee Shop',
 'Hookah Bar',
 'Hotel Bar',
 'Steakhouse',
 'Café',
 'Donut Shop',
 'Tea Room',
 'Bubble Tea Shop',
 'College Cafeteria']

Most of these will fit but we don't see people wanting to gather at cafeterias or barbershops (well maybe some much older people).

In [24]:
# categories to drop: the cafeterias first, then the restaurants
unwantedCategories = ('Cafeteria', 'College Cafeteria', 'Steakhouse', 'Gastropub')
for unwanted in unwantedCategories:
    cincyVenues = cincyVenues[cincyVenues.VenueCategory != unwanted]

print(cincyVenues.shape)
cincyVenues['VenueCategory'].unique().tolist()

(381, 9)


['Wine Bar',
 'Beer Bar',
 'Cocktail Bar',
 'Bar',
 'Lounge',
 'Brewery',
 'Whisky Bar',
 'Dive Bar',
 'Pub',
 'Sports Bar',
 'Karaoke Bar',
 'Gay Bar',
 'Ice Cream Shop',
 'Coffee Shop',
 'Hookah Bar',
 'Hotel Bar',
 'Café',
 'Donut Shop',
 'Tea Room',
 'Bubble Tea Shop']

### Map the venues

In [26]:
venueMap = makeMap()

# add a marker for every venue, with a popup showing the venue name
venueRows = zip(cincyVenues['VenueLatitude'], cincyVenues['VenueLongitude'], cincyVenues['VenueName'])
for lat, lng, venue in venueRows:
    label = folium.Popup('{}'.format(venue), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(venueMap)

# display map
venueMap

### Clustering our venues by location

In order to group and restrict venues down to valid clusters we'll be using DBSCAN (Density-Based Spatial Clustering of Applications with Noise).

In [27]:
# pull venue coordinates into a matrix of (latitude, longitude) rows
coords = cincyVenues[['VenueLatitude', 'VenueLongitude']].to_numpy()

# the haversine metric is fed radians, so the walking distance in km is
# divided by the kms-per-radian constant to get eps as an angle
kmsPerRadian = 6371.0088
epsilon = MAX_WALK / kmsPerRadian

cvDb = DBSCAN(eps=epsilon, min_samples=MIN_VENUES, algorithm='ball_tree', metric='haversine')
cvDb.fit(np.radians(coords))
clusterLabels = cvDb.labels_

# NOTE: this counts every distinct label, so the noise label -1 is included when present
uniqueLabels = set(clusterLabels)
numClusters = len(uniqueLabels)

# coordinates grouped by label 0 .. numClusters - 1
clusters = pd.Series([coords[clusterLabels == n] for n in range(numClusters)])
print('Number of clusters: {}'.format(numClusters))
uniqueLabels

Number of clusters: 16


{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}

In [28]:
# combine the cluster data with cincyVenues
# cincyVenues was produced by filtering another frame, so adding a column in
# place can raise SettingWithCopyWarning; assign() builds a new frame instead
cincyVenues = cincyVenues.assign(DbCluster=clusterLabels)
cincyVenues.head(10)

Unnamed: 0,Neighborhood,BusinessDistrict,NeighborhoodLatitude,NeighborhoodLongitude,VenueName,VenuId,VenueLatitude,VenueLongitude,VenueCategory,DbCluster
0,Linwood,0,39.104213,-84.415924,Dennert H Distribtg,4f32494419836c91c7c8b7f7,39.108777,-84.421232,Wine Bar,-1
1,East Walnut Hills,1,39.128889,-84.476823,The Woodburn Brewery & Taproom,55461bf6498eac118325e62e,39.12903,-84.476892,Beer Bar,0
2,East Walnut Hills,1,39.128889,-84.476823,Myrtle's Punch House,5473d783498ec0bbca9021d6,39.124276,-84.47613,Cocktail Bar,0
3,East Walnut Hills,1,39.128889,-84.476823,The Growler House,545d54ab498ea427d9af9d2d,39.129763,-84.477778,Bar,0
5,East Walnut Hills,1,39.128889,-84.476823,The Skunk Lounge,5182cdbd498e1c1b38b47f1c,39.124213,-84.476246,Lounge,0
6,East Walnut Hills,1,39.128889,-84.476823,Cliche,5d6459abca17630008abf539,39.12382,-84.47704,Bar,0
7,Queensgate,0,39.108472,-84.533758,City West Brewing Company,580ceb4a38faa26bf32db135,39.108208,-84.525736,Brewery,1
8,Queensgate,0,39.108472,-84.533758,The Playhouse,4e9a404977c807974bd69725,39.106017,-84.541503,Bar,-1
9,Queensgate,0,39.108472,-84.533758,Royal Imports,4f3246f419836c91c7c7cd1e,39.102755,-84.526398,Wine Bar,1
10,Mt. Washington,1,39.093074,-84.387447,London Bridge,4c3942f22c8020a1fe7e8c00,39.092869,-84.387158,Bar,-1


#### Visualize the clusters

In [29]:
clusterMap = makeMap()

# one random hex color per cluster count
# NOTE(review): this rebinds `colors`, shadowing the matplotlib.colors import
colors = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)]) for i in range(numClusters)]

# add venues with cluster colors
for lat, lng, venue, clustN in zip(cincyVenues['VenueLatitude'], cincyVenues['VenueLongitude'], cincyVenues['VenueName'], cincyVenues['DbCluster']):
    label = '{}'.format(venue)
    label = folium.Popup(label, parse_html=True)

    # red around blue for outliers otherwise random colors
    # (np.int was removed in NumPy 1.24; the builtin int does the same job)
    isOutlier = clustN == -1
    c = 'blue' if isOutlier else colors[int(clustN)]
    fc = 'red' if isOutlier else colors[int(clustN)]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=fc,
        fill=True,
        fill_color=c,
        fill_opacity=0.8).add_to(clusterMap)

# display map
clusterMap

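The blue circles outlined in red mark the outliers: venues that DBSCAN labeled as noise because they are not within walking distance (`MAX_WALK`, about half a mile) of a dense enough group of venues (at least `MIN_VENUES`). Outliers are of course bad spots for the event, so we should remove them from our dataset.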

In [30]:
# remove the outliers (rows labeled -1)
isClustered = cincyVenues.DbCluster != -1
cincyVenues = cincyVenues[isClustered]

# numClusters - 1 discounts the outlier label from the count
print(''.join([
    'We are now down to {}'.format(cincyVenues.shape[0]),
    ' venues! And {}'.format(numClusters -1),
    ' neighborhood sections to pick from.',
]))

We are now down to 325 venues! And 15 neighborhood sections to pick from.


#### And map again

In [31]:
clusterMap = makeMap()

# one random hex color per remaining cluster (numClusters - 1 discounts the outlier label)
# NOTE(review): this rebinds `colors`, shadowing the matplotlib.colors import
colors = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)]) for i in range(numClusters - 1)]

# add venues with cluster colors
for lat, lng, venue, clustN in zip(cincyVenues['VenueLatitude'], cincyVenues['VenueLongitude'], cincyVenues['VenueName'], cincyVenues['DbCluster']):
    label = '{}'.format(venue)
    label = folium.Popup(label, parse_html=True)

    # np.int was removed in NumPy 1.24; the builtin int does the same job
    c = colors[int(clustN)]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='grey',
        fill=True,
        fill_color=c,
        fill_opacity=0.8).add_to(clusterMap)

# display map
clusterMap

## 2 Recommendation System

### Pull in venue details from Foursquare
- Venue Hours to collect days and hours. Then filter.
- Venue Details to collect price and rating

In [93]:
# load the venue hours from csv in order to not make too many api calls,
# discarding the 'Unnamed: 0' column that comes along with the file
venuesHours = pd.read_csv('venueHours.csv').drop(columns=['Unnamed: 0'])

In [94]:
# report the (rows, columns) shape, then preview the first rows of the hours data
print(venuesHours.shape)
venuesHours.head()

(51, 4)


Unnamed: 0,VenueId,DayOf,Start,End
0,4fea02ede5e8dfeeb65b5000,True,1100.0,2300.0
1,4b4f3d05f964a52060fe26e3,True,500.0,100.0
2,4b7c72d2f964a5200c942fe3,True,1100.0,1400.0
3,548a3ae3498e96c3ac6dbf16,True,1100.0,0.0
4,537f99ee498e2e6e70de2c35,True,1100.0,2000.0


Not the greatest sampling of hours :( But we can use machine learning to fill in the blanks. To improve this we could in the future pull more hours from various datasets (e.g. Yelp Api).

#### Merge, format, and predict missing times
This works because it's reasonable to assume that similar venue categories in similar areas will have similar hours. Let's test this assumption first, though.

We will switch hours to our required range in order to normalize.

In [None]:
# normalize data
# plan: loop through the venues to build a new df, grabbing the fields that match the ID
#   - venue categories are converted to an index into the array of venue categories
#   - db cluster is kept as is
#   - DayOf with Start and End will be converted to open, closed, or partial for the y value
# NOTE(review): only the column selection below is implemented so far
Feature = cincyVenues[['VenueId','VenueCategory','DbCluster']]

In [None]:
# predict missing times with ??? in vicinity

In [98]:
# remove closed venues


#### Pull details for open venues

### Predict missing data

In [None]:
# normalize rest of the data

In [None]:
# predict missing ratings

In [None]:
# predict missing price

In [None]:
# remove prices not in range

### Cluster new data

### Analyze the clusters

### Build recommendation ranking

## Main Event
#### Crawling Sponsors or Traveling Salesman