## Neighborhood Venues Analysis for Café & Bar Crawls
IBM Data Science Capstone Project

#### Anthony Canterbury
September 29th, 2019

#### Packages:

In [1]:
# standard data
import numpy as np
import pandas as pd
import random
import json
import requests

# ploting
import matplotlib.cm as cm
import matplotlib.colors as colors

# machine learning
from sklearn.cluster import DBSCAN
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score

# mapping
from geopy.geocoders import Nominatim
import folium

#### Parameters:

In [2]:
# boundaries
MAX_PRICE = 3 # Foursquare tier for expensive
MAX_WALK = 0.8 # ~ 0.5 miles in km

# keys for filtering categories of venues
VENUE_PRIME = ['bar', 'pub', 'brewery', 'lounge']
VENUE_SECONDARY = ['caf', 'coffee', 'tea', 'desert', 'ice cream', 'donut']
VENUE_KEYS = VENUE_PRIME + VENUE_SECONDARY
FS_SECTIONS = ['drinks', 'coffee'] # these were singled out after several api checks
MIN_PRIMES = 2
MIN_SECONDARY = 2
MIN_VENUES = 5
MAX_VENUES = 7

# Times
WEEK_DAY = 6 # Saturday
START_TIME = 1500 # 3 PM
END_TIME = 2200 # 10 PM



## 1 Preliminary Data

### Get and parse the data for Cincinnati neighborhoods

In [3]:
# neighborhood geo data
nUrl = 'https://opendata.arcgis.com/datasets/572561553c9e4d618d2d7939c5261d46_0.geojson'
nJson = requests.get(nUrl).json()

# neighborhood buissness district geo data
bUrl = 'https://opendata.arcgis.com/datasets/0fa9e54fc1dd465886c3a5d980a97955_11.geojson'
bJson = requests.get(bUrl).json()

In [4]:
# get the center coordinate from matrix
def centeroidCord(arr, multi):
    x1 = arr[0][0]
    x2 = arr[0][0]
    y1 = arr[0][1]
    y2 = arr[0][1]
    if multi:
        for cord1 in arr:
            x1 = cord1[0][0]
            x2 = cord1[0][0]
            y1 = cord1[0][1]
            y2 = cord1[0][1]
            for cord in cord1:
                x1 = min(x1, cord[0])
                x2 = max(x2, cord[0])
                y1 = min(y1, cord[1])
                y2 = max(y2, cord[1])
    else:
        for cord in arr:
            x1 = min(x1, cord[0])
            x2 = max(x2, cord[0])
            y1 = min(y1, cord[1])
            y2 = max(y2, cord[1])
    return y1 + ((y2 - y1) / 2), x1 + ((x2 - x1) / 2)

# build the neighborhood dataframe from neighborhood json (nJson) and get the approximate center coordinates
neighborhoods = []
for feature in nJson['features']:
    Lat, Long = centeroidCord(feature['geometry']['coordinates'][0], feature['geometry']['type'] != 'Polygon')
    neighborhoods.append((feature['properties']['SNA_NAME'], 0, Lat, Long, feature['properties']['ACRES']))

cincyNeighsAcres = pd.DataFrame(neighborhoods)
cincyNeighsAcres.columns = ['Neighborhood', 'BusinessDistrict', 'Latitude', 'Longitude', 'Acres']

# Only will use acres for pretty pictures
cincyNeighs = cincyNeighsAcres.drop(columns=['Acres'])

# CUF to Clifton Heights
cincyNeighs.loc[cincyNeighs.Neighborhood == 'CUF', ['Neighborhood']] = 'Clifton Heights'

# North Avondale
cincyNeighs.loc[cincyNeighs.Neighborhood == 'North Avondale - Paddock Hills', ['Neighborhood']] = 'North Avondale'

# if business district is available use it's center coordinates and create multiples
for feature in bJson['features']:
    Lat, Long = centeroidCord(feature['geometry']['coordinates'][0], feature['geometry']['type'] != 'Polygon')
    # parse for multiple neighborhood name
    districtNum = 1
    districtName = feature['properties']['NBDNAME']
    if 'OTR Vine' in districtName:
        districtName = 'Over-the-Rhine'
    if 'OTR' in districtName:
        districtName = 'Over-the-Rhine'
        districtNum = 2
    if '(A)' in districtName:
        districtName = districtName.replace(' (A)', '')
    if '(B)' in districtName:
        districtName = districtName.replace(' (B)', '')
        districtNum = 2
    if 'Hyde Park East' in districtName:
        districtName = 'Hyde Park'
    
    if districtNum == 1:
        cincyNeighs.loc[cincyNeighs.Neighborhood == districtName, ['BusinessDistrict', 'Latitude', 'Longitude']] = districtNum, Lat, Long
    else:
        cincyNeighs = cincyNeighs.append({'Neighborhood': districtName, 'BusinessDistrict': districtNum, 'Latitude': Lat, 'Longitude': Long }, ignore_index=True)

cincyNeighs.sort_values(by=['Neighborhood']).reset_index(drop=True)

Unnamed: 0,Neighborhood,BusinessDistrict,Latitude,Longitude
0,Avondale,1,39.144072,-84.499246
1,Avondale,2,39.145541,-84.491047
2,Bond Hill,1,39.176159,-84.466794
3,California,0,39.068846,-84.418962
4,Camp Washington,1,39.13679,-84.537583
5,Carthage,1,39.195554,-84.478737
6,Clifton,1,39.143023,-84.520058
7,Clifton Heights,1,39.128135,-84.517137
8,College Hill,1,39.200302,-84.546862
9,Columbia Tusculum,1,39.114023,-84.436198


### Cincinnati Neighborhood Map

In [5]:
# get the center of Cincinnati
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode('Cincinnati')
latitude = location.latitude + 0.03 # adjustment for better centering
longitude = location.longitude
print('Coordinate of Cincinnati are {}, {}.'.format(latitude, longitude))

Coordinate of Cincinnati are 39.1314537, -84.5124602.


We're going to make a lot of maps so lets make a function. Markers will be unique so we'll leave them off

In [6]:
def makeMap():
    neighMap = folium.Map(location=[latitude, longitude], zoom_start=12)

    # choropleth map without data to outline the neighborhoods    
    folium.Choropleth(
        geo_data=nJson,
        data=cincyNeighsAcres,
        columns=['Neighborhood','Acres'],
        key_on='feature.properties.SNA_NAME',
        fill_color='YlOrRd', 
        fill_opacity=0.3, 
        line_opacity=0.5,
        legend_name='Cincinnati Neighborhood Acres'
    ).add_to(neighMap)
    
    return neighMap

In [7]:
cincyNeighMap = makeMap()

# add approximate buisness center markers to map
for lat, lng, neighborhood in zip(cincyNeighs['Latitude'], cincyNeighs['Longitude'], cincyNeighs['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7
    ).add_to(cincyNeighMap)
    
# display map
cincyNeighMap

From visible inspection it's pretty good centering of neighborhood business districts.

The only outlier due to it's odd shape and lack of business districts is Riverside. We'll go ahead and remove it from the list and update the map.

*Note: Not all business districts are represented but this will help with collecting the data.*

In [8]:
cincyNeighs = cincyNeighs[cincyNeighs.Neighborhood != 'Riverside']

# again render the map, not a super easy way to remove or update markers
cincyNeighMap = makeMap()

# add approximate buisness center markers to map
for lat, lng, neighborhood in zip(cincyNeighs['Latitude'], cincyNeighs['Longitude'], cincyNeighs['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(cincyNeighMap)
    
# display map
cincyNeighMap

### Get and parse the venue data from Foursquare

In [9]:
# Foursquare Credentials and Version, hidden before upload (add your own to try)
CLIENT_ID = 'HLR23T0ZYQ21PTCNRFL2XGSBY2MFOPATO3TKR5DTEEO3JDOT'
CLIENT_SECRET = '3IBFLGAO0MOSLEKT4ZTQ2J24PWBXJSQLJTKAGLI1JADWCXGH'
VERSION = '20190928'

#### Grab max of 100 venues from a 1000 meter radius
*Note: These have been examined in detail through many iteration before these parameters were selected as optimal.*

In [10]:
# load the data from csv in order to not make too many calls
venuesDf = pd.read_csv('cincyVenues.csv')
venuesDf.drop(columns=['Unnamed: 0'], inplace = True)

In [11]:
print(venuesDf.shape)
venuesDf.head(20)

(685, 9)


Unnamed: 0,Neighborhood,BusinessDistrict,NeighborhoodLatitude,NeighborhoodLongitude,VenueName,VenueId,VenueLatitude,VenueLongitude,VenueCategory
0,Linwood,0,39.104213,-84.415924,Dennert H Distribtg,4f32494419836c91c7c8b7f7,39.108777,-84.421232,Wine Bar
1,East Walnut Hills,1,39.128889,-84.476823,The Woodburn Brewery & Taproom,55461bf6498eac118325e62e,39.12903,-84.476892,Beer Bar
2,East Walnut Hills,1,39.128889,-84.476823,Myrtle's Punch House,5473d783498ec0bbca9021d6,39.124276,-84.47613,Cocktail Bar
3,East Walnut Hills,1,39.128889,-84.476823,The Growler House,545d54ab498ea427d9af9d2d,39.129763,-84.477778,Bar
4,East Walnut Hills,1,39.128889,-84.476823,BrewRiver Gastropub,4fea02ede5e8dfeeb65b5000,39.121758,-84.475027,Gastropub
5,East Walnut Hills,1,39.128889,-84.476823,The Skunk Lounge,5182cdbd498e1c1b38b47f1c,39.124213,-84.476246,Lounge
6,East Walnut Hills,1,39.128889,-84.476823,Cliche,5d6459abca17630008abf539,39.12382,-84.47704,Bar
7,Queensgate,0,39.108472,-84.533758,City West Brewing Company,580ceb4a38faa26bf32db135,39.108208,-84.525736,Brewery
8,Queensgate,0,39.108472,-84.533758,The Playhouse,4e9a404977c807974bd69725,39.106017,-84.541503,Bar
9,Queensgate,0,39.108472,-84.533758,Royal Imports,4f3246f419836c91c7c7cd1e,39.102755,-84.526398,Wine Bar


#### Cleanup the venues
- Remove duplicates
- Remove categories that slipped through

In [12]:
# there should be quite a few duplicates
cincyVenues = venuesDf.drop_duplicates(subset=['VenueId'])
print(cincyVenues.shape)

# unique categories
cincyVenues['VenueCategory'].unique().tolist()

(395, 9)


['Wine Bar',
 'Beer Bar',
 'Cocktail Bar',
 'Bar',
 'Gastropub',
 'Lounge',
 'Brewery',
 'Whisky Bar',
 'Dive Bar',
 'Pub',
 'Sports Bar',
 'Karaoke Bar',
 'Gay Bar',
 'Ice Cream Shop',
 'Coffee Shop',
 'Hookah Bar',
 'Hotel Bar',
 'Steakhouse',
 'Café',
 'Donut Shop',
 'Tea Room',
 'Bubble Tea Shop',
 'College Cafeteria']

Most of these will fit but we don't see people wanting to gather at cafeterias or barbershops (well maybe some much older people).

In [13]:
# remove cafeterias
cincyVenues = cincyVenues[cincyVenues.VenueCategory != 'Cafeteria']
cincyVenues = cincyVenues[cincyVenues.VenueCategory != 'College Cafeteria']

# remove restaurants
cincyVenues = cincyVenues[cincyVenues.VenueCategory != 'Steakhouse']
cincyVenues = cincyVenues[cincyVenues.VenueCategory != 'Gastropub']

print(cincyVenues.shape)
cincyVenues['VenueCategory'].unique().tolist()

(381, 9)


['Wine Bar',
 'Beer Bar',
 'Cocktail Bar',
 'Bar',
 'Lounge',
 'Brewery',
 'Whisky Bar',
 'Dive Bar',
 'Pub',
 'Sports Bar',
 'Karaoke Bar',
 'Gay Bar',
 'Ice Cream Shop',
 'Coffee Shop',
 'Hookah Bar',
 'Hotel Bar',
 'Café',
 'Donut Shop',
 'Tea Room',
 'Bubble Tea Shop']

### Map the venues

In [14]:
venueMap = makeMap()

# add approximate buisness center markers to map
for lat, lng, venue in zip(cincyVenues['VenueLatitude'], cincyVenues['VenueLongitude'], cincyVenues['VenueName']):
    label = '{}'.format(venue)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(venueMap)
    
# display map
venueMap

### Clustering our venues by location

In order to group and restrict venues down to valid clusters we'll be using DBSCAN (Density-Based Spatial Clustering of Applications with Noise).

In [15]:
# pull venue coordinates into a matrix
coords = cincyVenues[['VenueLatitude', 'VenueLongitude']].to_numpy()

kmsPerRadian = 6371.0088
epsilon = MAX_WALK / kmsPerRadian # in km radians

cvDb = DBSCAN(eps=epsilon, min_samples=MIN_VENUES, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
clusterLabels = cvDb.labels_
numClusters = len(set(clusterLabels))
clusters = pd.Series([coords[clusterLabels == n] for n in range(numClusters)])
print('Number of clusters: {}'.format(numClusters))
set(clusterLabels)

Number of clusters: 16


{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}

In [16]:
# combine the cluster data with cincyVenues
cincyVenues['DbCluster'] = clusterLabels
cincyVenues.head(10)

Unnamed: 0,Neighborhood,BusinessDistrict,NeighborhoodLatitude,NeighborhoodLongitude,VenueName,VenueId,VenueLatitude,VenueLongitude,VenueCategory,DbCluster
0,Linwood,0,39.104213,-84.415924,Dennert H Distribtg,4f32494419836c91c7c8b7f7,39.108777,-84.421232,Wine Bar,-1
1,East Walnut Hills,1,39.128889,-84.476823,The Woodburn Brewery & Taproom,55461bf6498eac118325e62e,39.12903,-84.476892,Beer Bar,0
2,East Walnut Hills,1,39.128889,-84.476823,Myrtle's Punch House,5473d783498ec0bbca9021d6,39.124276,-84.47613,Cocktail Bar,0
3,East Walnut Hills,1,39.128889,-84.476823,The Growler House,545d54ab498ea427d9af9d2d,39.129763,-84.477778,Bar,0
5,East Walnut Hills,1,39.128889,-84.476823,The Skunk Lounge,5182cdbd498e1c1b38b47f1c,39.124213,-84.476246,Lounge,0
6,East Walnut Hills,1,39.128889,-84.476823,Cliche,5d6459abca17630008abf539,39.12382,-84.47704,Bar,0
7,Queensgate,0,39.108472,-84.533758,City West Brewing Company,580ceb4a38faa26bf32db135,39.108208,-84.525736,Brewery,1
8,Queensgate,0,39.108472,-84.533758,The Playhouse,4e9a404977c807974bd69725,39.106017,-84.541503,Bar,-1
9,Queensgate,0,39.108472,-84.533758,Royal Imports,4f3246f419836c91c7c7cd1e,39.102755,-84.526398,Wine Bar,1
10,Mt. Washington,1,39.093074,-84.387447,London Bridge,4c3942f22c8020a1fe7e8c00,39.092869,-84.387158,Bar,-1


#### Visualize the clusters

In [17]:
clusterMap = makeMap()

colors = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)]) for i in range(numClusters)]

# add venues with cluster colors
for lat, lng, venue, clustN in zip(cincyVenues['VenueLatitude'], cincyVenues['VenueLongitude'], cincyVenues['VenueName'], cincyVenues['DbCluster']):
    label = '{}'.format(venue)
    label = folium.Popup(label, parse_html=True)
    
    # red around blue for outliers otherwise random colors
    c=('blue' if clustN == -1 else colors[np.int(clustN)])
    fc=('red' if clustN == -1 else colors[np.int(clustN)])
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=fc,
        fill=True,
        fill_color=c,
        fill_opacity=0.8).add_to(clusterMap)
    
# display map
clusterMap

The red around the blue circles mark the outliers, venues more than a half mile away from any other venue. Outliers are of course bad spots for the event so we should remove them from our dataset.

In [18]:
# remove outliers
cincyVenues = cincyVenues[cincyVenues.DbCluster != -1]

print('We are now down to {}'.format(cincyVenues.shape[0])+' venues! And {}'.format(numClusters -1)+(' neighborhood sections to pick from.'))

We are now down to 325 venues! And 15 neighborhood sections to pick from.


#### And map again

In [19]:
clusterMap = makeMap()

colors = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)]) for i in range(numClusters - 1)]

# add venues with cluster colors
for lat, lng, venue, clustN in zip(cincyVenues['VenueLatitude'], cincyVenues['VenueLongitude'], cincyVenues['VenueName'], cincyVenues['DbCluster']):
    label = '{}'.format(venue)
    label = folium.Popup(label, parse_html=True)
    
    c=colors[np.int(clustN)]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='grey',
        fill=True,
        fill_color=c,
        fill_opacity=0.8).add_to(clusterMap)
    
# display map
clusterMap

## 2 Recommendation System

### Pull in venue details from Foursquare
- Venue Hours to collect days and hours. Then filter.
- Venue Details to collect price and rating

In [20]:
# load the data from csv in order to not make too many calls
venueHours = pd.read_csv('venueHours.csv', dtype = {'Start': np.object, 'End': np.object})
venueHours.drop(columns=['Unnamed: 0'], inplace = True)

In [21]:
print(venueHours.shape)
venueHours.head()

(118, 4)


Unnamed: 0,VenueId,DayOf,Start,End
0,55461bf6498eac118325e62e,True,1200,0
1,4b0220f4f964a520eb4722e3,True,1700,230
2,53b74740498e8cb5722a80a2,True,1100,0
3,51fd186b498ecee282bedc11,True,1200,130
4,54e8dc93498e2565b4a9f267,True,1100,0


Not the greatest sampling of hours :( But we can use machine learning to fill in the blanks. To improve this we could in the future pull more hours from various datasets (e.g. Yelp Api).

#### Merge, format, and predict missing times
This works because it's reasonable to assume that similar venue categories in similar areas will have similar hours. Let's test this assumtion first though.

We will switch hours to our required range in order to normalize.

In [22]:
# loop through venues to build new df for modeling
vHourOpen = [] # df for later
vHourFeats = [] # df of categories and cluster
vHourLabels = [] # closed, partial (parially open), and open

for i, row in venueHours.iterrows():
    vId = row['VenueId']
    cVrow = cincyVenues.loc[cincyVenues['VenueId'] == vId]
    vCat = cVrow['VenueCategory'].values[0]
    catName = vCat.lower()
    vCat = [i for i, s in enumerate(VENUE_KEYS) if s in catName][0]
    vCluster = cVrow['DbCluster'].values[0]
    vHourFeats.append((vCat, vCluster))
    
    vOpen = 'closed'
    if row['DayOf']:
        # parse Hours
        vStart = int(row['Start'])
        vEnd = row['End']
        if '+' in vEnd:
            vEnd = int(vEnd.replace('+0', ''))+2400
        else:
            vEnd = int(vEnd)
        if START_TIME >= vStart and END_TIME <= vEnd:
            vOpen = 'open'
        # next is a test for partial time open but we are going to just call them open here
        elif START_TIME >= vStart or (START_TIME < vStart and END_TIME > vStart):
            vOpen = 'open'
    
    vHourOpen.append((vId, vOpen))
    vHourLabels.append(vOpen)

vHourOpen = pd.DataFrame(vHourOpen)
vHourOpen.columns = ['VenueId', 'VenueOpen']

vHourFeats = pd.DataFrame(vHourFeats)
vHourFeats.columns = ['VenueCategory', 'VenueCluster']

vHourFeats.head()

Unnamed: 0,VenueCategory,VenueCluster
0,0,0
1,0,2
2,0,2
3,2,2
4,2,1


In [23]:
X = preprocessing.StandardScaler().fit(vHourFeats).transform(vHourFeats)
Y = vHourLabels
print(X[0:5])
print(len(X))
print(len(Y))
Y[0:5]

[[-0.97464713 -1.18750116]
 [-0.97464713 -0.62133899]
 [-0.97464713 -0.62133899]
 [-0.28390622 -0.62133899]
 [-0.28390622 -0.90442008]]
118
118


['open', 'open', 'open', 'open', 'open']

#### Time to model and test

Prep the test and train data

In [24]:
# this was sampled multiple times to find the best size
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.23, random_state=4)
print ('Train set:', len(X_train),  len(Y_train))
print ('Test set:', len(X_test),  len(Y_test))
# we want to make sure there is a good sampling
u, count = np.unique(Y, return_counts=True)
print(u, count/len(Y))
uTest, countTest = np.unique(Y_test, return_counts=True)
print(uTest, countTest/len(Y_test))

Train set: 90 90
Test set: 28 28
['closed' 'open'] [0.07627119 0.92372881]
['closed' 'open'] [0.10714286 0.89285714]


#### k-nearest neighbors (KNN)
This model should be suffiecient for our needs.

In [25]:
Ks = 10
mean_acc = np.zeros((Ks-1))
for n in range(1,Ks):
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,Y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(Y_test, yhat)

openKNN = KNeighborsClassifier(n_neighbors = mean_acc.argmax()+1).fit(X_train,Y_train)
yhat = openKNN.predict(X_test)
print( "The best KNN's accuracy was with accuracy of", mean_acc.max(), "with k=", mean_acc.argmax()+1) 
print("Jaccard Score: ", jaccard_score(Y_test, yhat, average='weighted'))
print("F1 Score: ", f1_score(Y_test, yhat, average='weighted'))

The best KNN's accuracy was with accuracy of 0.9285714285714286 with k= 2
Jaccard Score:  0.8624338624338624
F1 Score:  0.9120879120879121


#### Use the model to make prediction on the model and remove the closed venues

In [26]:
# create a predicted value df
venuesO = pd.merge(cincyVenues, vHourOpen, on=['VenueId'], how='outer')
oId = []
oNorm = []
for i, row in venuesO.iterrows():
    if pd.isna(row['VenueOpen']):
        oId.append(row['VenueId'])
        vCat = row['VenueCategory']
        catName = vCat.lower()
        vCat = [i for i, s in enumerate(VENUE_KEYS) if s in catName][0]
        vCluster = cVrow['DbCluster']
        oNorm.append((vCat, vCluster))

oX = preprocessing.StandardScaler().fit(oNorm).transform(oNorm)        
oYhat = openKNN.predict(oX)
newOpen = pd.DataFrame(np.stack((oId, oYhat), axis=-1))
newOpen.columns = ['VenueId', 'VenueOpen']
for i, row in newOpen.iterrows():
    venuesO.loc[venuesO['VenueId'] == row['VenueId'], 'VenueOpen'] = row['VenueOpen']
venuesO['VenueOpen'].unique().tolist()

['open', 'closed']

In [27]:
print(venuesO.shape)
venuesO = venuesO[venuesO.VenueOpen != 'closed']
print(venuesO.shape)

# then drop the column
cincyVinues2 = venuesO.drop(columns=['VenueOpen'])

(325, 11)
(316, 11)


#### Recluster the venues

In [32]:
# pull venue coordinates into a matrix
coords = cincyVinues2[['VenueLatitude', 'VenueLongitude']].to_numpy()

cvDb = DBSCAN(eps=epsilon, min_samples=MIN_VENUES, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
clusterLabels = cvDb.labels_
numClusters = len(set(clusterLabels))
clusters = pd.Series([coords[clusterLabels == n] for n in range(numClusters)])
print('Number of clusters: {}'.format(numClusters))
set(clusterLabels)

# combine the cluster data with cincyVenues
cincyVinues2.drop(columns=['DbCluster'])
cincyVinues2['DbCluster'] = clusterLabels
cincyVinues2.head(10)

Number of clusters: 15


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [33]:
# remove outliers
cincyVinues2 = cincyVinues2[cincyVinues2.DbCluster != -1]

print('We are now down to {}'.format(cincyVinues2.shape[0])+' venues! And {}'.format(numClusters -1)+(' neighborhood sections to pick from.'))

We are now down to 316 venues! And 14 neighborhood sections to pick from.


In [36]:
clusterMap = makeMap()

colors = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)]) for i in range(numClusters)]

# add venues with cluster colors
for lat, lng, venue, clustN in zip(cincyVinues2['VenueLatitude'], cincyVinues2['VenueLongitude'], cincyVinues2['VenueName'], cincyVinues2['DbCluster']):
    label = '{}'.format(venue)
    label = folium.Popup(label, parse_html=True)
    
    c=colors[np.int(clustN)]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='grey',
        fill=True,
        fill_color=c,
        fill_opacity=0.8).add_to(clusterMap)
    
# display map
clusterMap

#### Pull details for open venues

In [52]:
vDetails


[(0, 8.4, 33, None),
 (0, 8.3, 38, 3),
 (0, 7.5, 25, 2),
 (0, None, 1, 3),
 (0, None, 0, 2),
 (1, None, 1, 2),
 (1, None, 0, 2),
 (2, 8.3, 20, 2),
 (2, 8.2, 82, 2),
 (2, 8.7, 38, 3),
 (2, 8.2, 66, 2),
 (2, 7.9, 8, None),
 (2, 7.7, 9, 3),
 (2, 8.2, 11, 1),
 (2, 7.8, 30, 2),
 (2, 7.3, 19, 1),
 (2, 6.6, 23, 2),
 (2, 5.9, 16, 1),
 (2, None, 0, 2),
 (2, None, 0, 2),
 (2, None, 1, 2),
 (2, None, 5, 1),
 (3, None, 1, 1),
 (3, None, 1, 2),
 (3, None, 0, 2),
 (3, None, 0, 2),
 (3, None, 0, 2),
 (3, None, 0, 2),
 (3, None, 1, 2),
 (4, None, 2, 2),
 (4, None, 0, 2),
 (4, None, 0, 2),
 (1, 9.3, 304, 2),
 (1, 7.6, 8, 2),
 (1, 5.6, 5, 2),
 (1, None, 4, 3),
 (1, None, 6, None),
 (1, None, 3, 3),
 (1, 8.2, 39, 2),
 (1, 8.4, 140, 1),
 (1, 7.6, 22, 2),
 (1, 8.3, 11, 2),
 (1, 7.9, 66, 1),
 (1, 7.8, 14, 2),
 (1, 7.9, 20, 3),
 (5, 7.2, 8, 2),
 (1, 6.9, 10, 1),
 (6, 9.1, 132, 1),
 (6, 7.0, 29, 2),
 (6, 6.6, 6, 1),
 (6, 5.7, 8, 2),
 (6, None, 2, 2),
 (6, None, 1, 2),
 (5, None, 0, 3),
 (5, None, 0, 2),
 (5, 

In [54]:
venueDetails = pd.DataFrame(vDetails)

# define the column names
venueDetails.columns = ['DbCluster', 'Rating', 'Likes', 'Price']

In [55]:
# save for later
venueDetails.to_csv('venueDetails-160-4b8dcb4bf964a520070e33e3.csv')

### Predict missing data

In [None]:
# predicting ratings is pretty pointless (average of the cluster) but we can predict price

In [None]:
# predict missing price

In [None]:
# remove prices not in range

### Cluster new data
#### Cleanup clusters based on our criteria (minimum values for primary and secondary venues)
This could have been done earlier before our other cleanup.

In [None]:
#

In [None]:
# 

### Analyze the clusters

In [None]:
# charts, charts and more charts... maybe more modeling

In [None]:
# probably graph the amount of options, ratings, likes

### Build recommendation ranking

In [None]:
# recommend districts based on ratings and options

## Main Event
#### Crawling Sponsors or Traveling Salesman

In [None]:
# taking the winning recommendation and figure out the amount of unique paths, find the best path and map it as a traveling salesman