In [1]:
import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

from geopy.geocoders import Nominatim
import requests
import urllib.request
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium

In [2]:
# Download the New York neighborhoods dataset and parse it as JSON.
with urllib.request.urlopen("https://cocl.us/new_york_dataset") as response:
    raw_payload = response.read()

newyork_data = json.loads(raw_payload.decode())

# Each entry under 'features' describes one neighborhood.
neighborhoods_data = newyork_data['features']

In [3]:
# Flatten the neighborhood features into one row per neighborhood.
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

# Collect plain records first and build the frame once: growing a DataFrame
# row by row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
neighborhood_records = [
    {
        'Borough': feature['properties']['borough'],
        'Neighborhood': feature['properties']['name'],
        # 'coordinates' is read as [longitude, latitude]
        'Latitude': feature['geometry']['coordinates'][1],
        'Longitude': feature['geometry']['coordinates'][0],
    }
    for feature in neighborhoods_data
]
neighborhoods = pd.DataFrame(neighborhood_records, columns=column_names)

# Keep only the Manhattan rows for the rest of the analysis.
manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
manhattan_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


In [4]:
# Map centre for Manhattan. The coordinates are pinned here rather than
# geocoded at run time; to look them up again, geocode `address` with
# Nominatim(user_agent="ny_explorer") and read .latitude / .longitude.
address = 'Manhattan, NY'
latitude, longitude = 40.7896239, -73.9598939

print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 40.7896239, -73.9598939.


In [5]:
# Base map centred on the Manhattan coordinates defined earlier.
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# One blue circle per neighborhood; the popup carries the neighborhood name.
marker_inputs = zip(
    manhattan_data['Latitude'],
    manhattan_data['Longitude'],
    manhattan_data['Neighborhood'],
)
for lat, lng, nbhd_name in marker_inputs:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(nbhd_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_manhattan)

map_manhattan

In [6]:
import os

# Foursquare API credentials are read from the environment so that secrets
# never live in the notebook (or in its version history). Export
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded here should be treated
# as leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605'  # sent as the `v` query parameter on every request

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

In [7]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    """Collect the venues Foursquare's `explore` endpoint returns around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences walked with ``zip``; one API request is made per entry.
    radius : int, default 500
        Forwarded as the ``radius`` query parameter.
    LIMIT : int, default 100
        Forwarded as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood', 'NBHD Lat',
        'NBHD Long', 'ID', 'Venue', 'Venue Lat', 'Venue Long', 'Venue Category'.
        The frame is empty (but still has these columns) when nothing is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood', 'NBHD Lat', 'NBHD Long', 'ID', 'Venue', 'Venue Lat', 'Venue Long',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # make the GET request; `params` lets requests build and escape the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not let one stalled request hang the whole notebook
        )
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name, lat, lng,
                venue['id'], venue['name'],
                venue['location']['lat'], venue['location']['lng'],
                # a venue without any category must not abort the whole crawl
                categories[0]['name'] if categories else None,
            ))

    # Passing `columns` to the constructor keeps the schema even when `rows` is
    # empty; assigning `.columns` afterwards fails on an empty frame.
    return pd.DataFrame(rows, columns=columns)

In [8]:
# Fetch the venues around every Manhattan neighborhood (one API call per
# neighborhood, using the function's default radius and limit).
manhattan_venues = getNearbyVenues(
    names=manhattan_data['Neighborhood'],
    latitudes=manhattan_data['Latitude'],
    longitudes=manhattan_data['Longitude'],
)

manhattan_venues.head()

Unnamed: 0,Neighborhood,NBHD Lat,NBHD Long,ID,Venue,Venue Lat,Venue Long,Venue Category
0,Marble Hill,40.876551,-73.91066,4b4429abf964a52037f225e3,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,40.876551,-73.91066,4baf59e8f964a520a6f93be3,Bikram Yoga,40.876844,-73.906204,Yoga Studio
2,Marble Hill,40.876551,-73.91066,4b79cc46f964a520c5122fe3,Tibbett Diner,40.880404,-73.908937,Diner
3,Marble Hill,40.876551,-73.91066,55f81cd2498ee903149fcc64,Starbucks,40.877531,-73.905582,Coffee Shop
4,Marble Hill,40.876551,-73.91066,4b5357adf964a520319827e3,Dunkin',40.877136,-73.906666,Donut Shop


In [9]:
# one hot encoding: one indicator column per venue category
manhattan_onehot = pd.get_dummies(manhattan_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
manhattan_onehot['Neighborhood'] = manhattan_venues['Neighborhood']

# move neighborhood column to the first column; select it by name because it is
# not guaranteed to be the last column (a venue category can itself be called
# "Neighborhood", in which case the assignment above reuses that column's slot)
fixed_columns = ['Neighborhood'] + [col for col in manhattan_onehot.columns if col != 'Neighborhood']
manhattan_onehot = manhattan_onehot[fixed_columns]

# share of each venue category per neighborhood
manhattan_grouped = manhattan_onehot.groupby('Neighborhood').mean().reset_index()

# set number of clusters
kclusters = 5
# `columns=` instead of a positional axis: pandas 2.x rejects drop('Neighborhood', 1)
manhattan_grouped_clustering = manhattan_grouped.drop(columns='Neighborhood')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(manhattan_grouped_clustering)

# .copy() so the insert below works on an independent frame, not on a slice
neighborhoods_name = manhattan_grouped[['Neighborhood']].copy()
neighborhoods_name.insert(1, 'Cluster Labels', kmeans.labels_)

# left merge: neighborhoods for which no venue was returned get NaN as label
manhattan_merged = pd.merge(manhattan_data, neighborhoods_name, how = "left", on = ["Neighborhood"])
manhattan_merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
0,Manhattan,Marble Hill,40.876551,-73.91066,3
1,Manhattan,Chinatown,40.715618,-73.994279,0
2,Manhattan,Washington Heights,40.851903,-73.9369,4
3,Manhattan,Inwood,40.867684,-73.92121,4
4,Manhattan,Hamilton Heights,40.823604,-73.949688,4


In [10]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Neighborhoods without a cluster label (NaN from the left merge) cannot be
# colored; drop them and restore integer labels, since a single NaN turns the
# whole column into floats and floats cannot index `rainbow`.
labeled_neighborhoods = manhattan_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(labeled_neighborhoods['Latitude'],
                                  labeled_neighborhoods['Longitude'],
                                  labeled_neighborhoods['Neighborhood'],
                                  labeled_neighborhoods['Cluster Labels'].astype(int)):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster 0 wraps around to the last color; every cluster still gets its own
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [11]:
# Keep the venues whose category mentions sushi or Japan; na=False treats a
# missing category as "no match".
is_related = manhattan_venues['Venue Category'].str.contains('Sushi|Japan', na=False)
related_venues = manhattan_venues[is_related].reset_index(drop=True)

# attach the cluster label of the neighborhood each venue was found in
labeled_related_venues = pd.merge(related_venues, neighborhoods_name, how = "left", on = ["Neighborhood"])

# look up the rating of every selected venue (one API request per venue)
rating = []
for venue_id in labeled_related_venues['ID']:
    url = 'https://api.foursquare.com/v2/venues/{}'.format(venue_id)

    result = requests.get(
        url,
        params={'client_id': CLIENT_ID, 'client_secret': CLIENT_SECRET, 'v': VERSION},
        timeout=30,  # do not let one stalled request hang the whole loop
    ).json()
    try:
        rating.append(float(result['response']['venue']['rating']))
    except (KeyError, TypeError, ValueError):
        # Only a missing or malformed rating is tolerated; anything else propagates.
        # NOTE(review): the 0.0 placeholder is kept for compatibility with the
        # scoring below, but it drags down every mean it takes part in.
        rating.append(0.0)

labeled_related_venues['Rating'] = rating
labeled_related_venues.head()

Unnamed: 0,Neighborhood,NBHD Lat,NBHD Long,ID,Venue,Venue Lat,Venue Long,Venue Category,Cluster Labels,Rating
0,Washington Heights,40.851903,-73.9369,4a4a642cf964a520bdab1fe3,Sushi Yu II,40.851255,-73.939652,Sushi Restaurant,4,6.9
1,Hamilton Heights,40.823604,-73.949688,531a0207498e06b4656eb8c5,Geisha Japanese Cuisine,40.824355,-73.951993,Japanese Restaurant,4,7.7
2,Hamilton Heights,40.823604,-73.949688,5674592d498e85f07427e61b,Chopped Parsley,40.825911,-73.947454,Sushi Restaurant,4,7.7
3,Hamilton Heights,40.823604,-73.949688,577875a2498e0a21a3ef1a97,MamaSushi,40.827392,-73.949839,Sushi Restaurant,4,7.3
4,Manhattanville,40.816934,-73.957385,57a4cbac498ee795c5d0b864,Go! Go! Curry!,40.815473,-73.958234,Japanese Curry Restaurant,4,7.8


In [12]:
# Number of sushi/Japanese venues per cluster; the largest cluster is shown.
venue_count = (
    labeled_related_venues[['Neighborhood', 'Cluster Labels']]
    .groupby('Cluster Labels')
    .count()
    .reset_index(drop=False)
    .rename(columns={"Neighborhood": "Venue Count"})
)

venue_count.sort_values(by=['Venue Count'], ascending=False).head(1)

Unnamed: 0,Cluster Labels,Venue Count
1,1,44


In [13]:
# Work on the cluster that holds the most sushi/Japanese venues. It is derived
# from the data instead of hardcoding the label seen in an earlier output, so
# the cell stays correct when the venue data or the clustering changes.
target_cluster = labeled_related_venues['Cluster Labels'].value_counts().idxmax()
selected_nbhd = labeled_related_venues[labeled_related_venues['Cluster Labels'] == target_cluster]

# venues per neighborhood
venue_count = (selected_nbhd.groupby('Neighborhood')['Venue'].count()
               .rename('Venue Count').reset_index())

# mean venue rating per neighborhood
venue_rating = (selected_nbhd.groupby('Neighborhood')['Rating'].mean()
                .rename('Mean Rating').reset_index())

nbhd_stats = pd.merge(venue_count, venue_rating, how = "left", on = ["Neighborhood"])

# Dense descending rank: the largest value gets 1, equal values share a rank and
# no rank is skipped. This is the same numbering as ranking the sorted unique
# values, without merging lookup tables on float keys.
nbhd_stats['Count Score'] = nbhd_stats['Venue Count'].rank(method='dense', ascending=False).astype(int)
nbhd_stats['Rating Score'] = nbhd_stats['Mean Rating'].rank(method='dense', ascending=False).astype(int)

# The combined score is the sum of both ranks, so the highest score goes to the
# neighborhood with the fewest venues and the lowest mean rating.
nbhd_stats['Score'] = nbhd_stats['Count Score'] + nbhd_stats['Rating Score']
nbhd_stats.sort_values(by=['Score'], ascending=False).head(1)

Unnamed: 0,Neighborhood,Venue Count,Mean Rating,Count Score,Rating Score,Score
6,Manhattan Valley,2,7.8,5,12,17


In [17]:
# Full neighborhood ranking, highest combined score first.
nbhd_stats.sort_values(by='Score', ascending=False)

Unnamed: 0,Neighborhood,Venue Count,Mean Rating,Count Score,Rating Score,Score
6,Manhattan Valley,2,7.8,5,12,17
9,Sutton Place,2,7.95,5,10,15
0,Carnegie Hill,5,7.88,3,11,14
11,Upper West Side,2,8.05,5,9,14
4,Gramercy,1,8.1,6,7,13
12,Yorkville,5,8.06,3,8,11
3,Financial District,2,8.6,5,4,9
8,Noho,2,8.65,5,3,8
1,Chelsea,2,8.75,5,2,7
7,Murray Hill,6,8.45,2,5,7
