## Capstone Project - Week 4

### 1. Introduction / Business Problem

Classify the eating places in Manhattan based on eating preferences

### 2. Data

Foursquare data; 
use the restaurant type, number/density of each type, ratings 

In [1]:
# imports:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation
import math

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

import html5lib
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

!conda install -c conda-forge geocoder --yes 
import geocoder # import geocoder

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
import json # library to handle JSON files
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library    








    

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.11

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          90 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.20.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.20.0         | 57 KB     | ##################################### | 100% 
geographiclib-1.49   | 32 KB     | ##

### 2.1 Downloading the New York data set

Download the data:

In [2]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')

with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)
    
neighborhoods_data = newyork_data['features']

Data downloaded!


Transform data into a pandas dataframe and fill the frame

In [3]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

# fill the data
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
   

Select Manhattan data              

In [4]:
manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
manhattan_data.head()
#print(manhattan_data.describe(include='all'))

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


Visualize Manhattan neighborhoods

In [5]:
address = 'Manhattan, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

# create map of Manhattan using latitude and longitude values
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_manhattan)  
    
map_manhattan

The geograpical coordinate of Manhattan are 40.7900869, -73.9598295.


### 3 Define Foursquare credentials and explore the data

In [10]:
CLIENT_ID = 'JZF5D2H42PAKWESTY5NMYK3QZHFXANJSRPOISZJBZHHV3AGJ' # your Foursquare ID
CLIENT_SECRET = 'YVUK4SK5HQRTOWTYZ5D1U2W3OGIJGNCIVOZTMGTR1X4WGMT2' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: JZF5D2H42PAKWESTY5NMYK3QZHFXANJSRPOISZJBZHHV3AGJ
CLIENT_SECRET:YVUK4SK5HQRTOWTYZ5D1U2W3OGIJGNCIVOZTMGTR1X4WGMT2


Function to retrieve neighborhood venues

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['id'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue ID', 
                  'Venue Category']
            
    return(nearby_venues)

Crreate dataframe with neighborhoods and venues

In [12]:
manhattan_venues = getNearbyVenues(names=manhattan_data['Neighborhood'],
                                   latitudes=manhattan_data['Latitude'],
                                   longitudes=manhattan_data['Longitude']
                                  )

print(manhattan_venues.shape)
manhattan_venues.head(10)

   


(1182, 8)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue ID,Venue Category
0,Marble Hill,40.876551,-73.91066,Arturo's,40.874412,-73.910271,4bf58dd8d48988d1ca941735,Pizza Place
1,Marble Hill,40.876551,-73.91066,Bikram Yoga,40.876844,-73.906204,4bf58dd8d48988d102941735,Yoga Studio
2,Marble Hill,40.876551,-73.91066,Tibbett Diner,40.880404,-73.908937,4bf58dd8d48988d147941735,Diner
3,Marble Hill,40.876551,-73.91066,Starbucks,40.877531,-73.905582,4bf58dd8d48988d1e0931735,Coffee Shop
4,Marble Hill,40.876551,-73.91066,Dunkin',40.877136,-73.906666,4bf58dd8d48988d148941735,Donut Shop
5,Marble Hill,40.876551,-73.91066,Blink Fitness Riverdale,40.877147,-73.905837,4bf58dd8d48988d176941735,Gym
6,Marble Hill,40.876551,-73.91066,TCR The Club of Riverdale,40.878628,-73.914568,4e39a891bd410d7aed40cbc2,Tennis Stadium
7,Marble Hill,40.876551,-73.91066,Land & Sea Restaurant,40.877885,-73.905873,4bf58dd8d48988d1ce941735,Seafood Restaurant
8,Marble Hill,40.876551,-73.91066,T.J. Maxx,40.877232,-73.905042,4bf58dd8d48988d1f6941735,Department Store
9,Marble Hill,40.876551,-73.91066,Starbucks,40.873755,-73.908613,4bf58dd8d48988d1e0931735,Coffee Shop


Keep only the categories of interest:
- Restaurant: venue category contains 'restaurant', 'bodega', or 'diner'
- Cafe: venue category contains 'cafe' or 'coffee'
- FastFood: venue category contains 'joint', 'bagel', 'pizza', 'breakfast', 'burger', 'burrito', 'creperie', 'fast food', 'pastry', 'sandwich', 'snack', or 'taco'
- Others: to be discarded

In [13]:
manhattan_venues2 = manhattan_venues

manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('restaurant',  case=False), 'Venue Category'] = 'Restaurant'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('bodega', case=False), 'Venue Category'] = 'Restaurant'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('diner', case=False), 'Venue Category'] = 'Restaurant'

manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('cafe',  case=False), 'Venue Category'] = 'Cafe'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('coffee', case=False), 'Venue Category'] = 'Cafe'

manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('joint',  case=False), 'Venue Category'] = 'FastFood'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('bagel', case=False), 'Venue Category'] = 'FastFood'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('pizza',  case=False), 'Venue Category'] = 'FastFood'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('breakfast', case=False), 'Venue Category'] = 'FastFood'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('burger',  case=False), 'Venue Category'] = 'FastFood'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('burrito', case=False), 'Venue Category'] = 'FastFood'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('creperie',  case=False), 'Venue Category'] = 'FastFood'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('fast food', case=False), 'Venue Category'] = 'FastFood'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('pastry',  case=False), 'Venue Category'] = 'FastFood'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('sandwich', case=False), 'Venue Category'] = 'FastFood'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('snack',  case=False), 'Venue Category'] = 'FastFood'
manhattan_venues2.loc[manhattan_venues2['Venue Category'].str.contains('taco', case=False), 'Venue Category'] = 'FastFood'

# remove the others
manhattan_venues2 = manhattan_venues2[(manhattan_venues2['Venue Category'] == 'Restaurant') | (manhattan_venues2['Venue Category'] == 'Cafe') | (manhattan_venues2['Venue Category'] == 'FastFood')]
manhattan_venues2 = manhattan_venues2.reset_index(drop = True)

manhattan_venues2.head(10)
#print(manhattan_venues2[manhattan_venues2.Venue == 'Starbucks'])

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue ID,Venue Category
0,Marble Hill,40.876551,-73.91066,Arturo's,40.874412,-73.910271,4bf58dd8d48988d1ca941735,FastFood
1,Marble Hill,40.876551,-73.91066,Tibbett Diner,40.880404,-73.908937,4bf58dd8d48988d147941735,Restaurant
2,Marble Hill,40.876551,-73.91066,Starbucks,40.877531,-73.905582,4bf58dd8d48988d1e0931735,Cafe
3,Marble Hill,40.876551,-73.91066,Land & Sea Restaurant,40.877885,-73.905873,4bf58dd8d48988d1ce941735,Restaurant
4,Marble Hill,40.876551,-73.91066,Starbucks,40.873755,-73.908613,4bf58dd8d48988d1e0931735,Cafe
5,Marble Hill,40.876551,-73.91066,Boston Market,40.87743,-73.905412,4bf58dd8d48988d14e941735,Restaurant
6,Marble Hill,40.876551,-73.91066,SUBWAY,40.878493,-73.905385,4bf58dd8d48988d1c5941735,FastFood
7,Marble Hill,40.876551,-73.91066,Subway,40.87772,-73.90538,4bf58dd8d48988d1c5941735,FastFood
8,Marble Hill,40.876551,-73.91066,Terrace View Delicatessen,40.876476,-73.912746,4bf58dd8d48988d146941735,Restaurant
9,Chinatown,40.715618,-73.994279,Kiki's,40.714476,-73.992036,4bf58dd8d48988d10e941735,Restaurant


Check size:

In [14]:
manhattan_venues2.shape

(451, 8)

In [23]:
# for each neighborhood, show the number of Cafes, Restaurants and FastFoods
mcounts = manhattan_venues2

mcounts = mcounts.drop(['Venue', 'Venue Latitude', 'Venue Longitude', 'Venue ID'], axis=1)
#mcounts.head()
mcounts = mcounts.pivot_table(index=['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude'],columns='Venue Category',aggfunc=len)
#mcounts.head()

mcounts.fillna(0,inplace=True)
#mcounts.columns = mcounts.columns.droplevel()
mcounts.columns.name = None
mcounts.reset_index(inplace=True)
#mcounts.head()

mcounts2 = mcounts.drop(['Neighborhood Latitude', 'Neighborhood Longitude'], axis=1)
mcounts2['Venues'] = mcounts2['Cafe'] + mcounts2['FastFood'] + mcounts2['Restaurant'];
mcounts2.head(40)

Unnamed: 0,Neighborhood,Cafe,FastFood,Restaurant,Venues
0,Battery Park City,2.0,4.0,1.0,7.0
1,Carnegie Hill,2.0,4.0,7.0,13.0
2,Central Harlem,0.0,3.0,11.0,14.0
3,Chelsea,1.0,1.0,11.0,13.0
4,Chinatown,0.0,3.0,10.0,13.0
5,Civic Center,1.0,1.0,11.0,13.0
6,Clinton,0.0,2.0,3.0,5.0
7,East Harlem,1.0,2.0,11.0,14.0
8,East Village,2.0,4.0,9.0,15.0
9,Financial District,2.0,2.0,5.0,9.0


Show the most common venue type

In [24]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:4]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [53]:
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = mcounts2['Neighborhood']

for ind in np.arange(mcounts2.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(mcounts2.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted['Venues'] = mcounts2['Venues']

neighborhoods_venues_sorted.head(20)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,Venues
0,Battery Park City,FastFood,Cafe,Restaurant,7.0
1,Carnegie Hill,Restaurant,FastFood,Cafe,13.0
2,Central Harlem,Restaurant,FastFood,Cafe,14.0
3,Chelsea,Restaurant,FastFood,Cafe,13.0
4,Chinatown,Restaurant,FastFood,Cafe,13.0
5,Civic Center,Restaurant,FastFood,Cafe,13.0
6,Clinton,Restaurant,FastFood,Cafe,5.0
7,East Harlem,Restaurant,FastFood,Cafe,14.0
8,East Village,Restaurant,FastFood,Cafe,15.0
9,Financial District,Restaurant,FastFood,Cafe,9.0


### 4. Clustering
From the above analysis we found out that the most common place is restaurant


In [54]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


# set number of clusters
kclusters = 3

manhattan_grouped_clustering = mcounts2.drop('Neighborhood', 1)
X = manhattan_grouped_clustering.values[:,1:]
X = np.nan_to_num(X)
manhattan_grouped_clustering = StandardScaler().fit_transform(X)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 2, 2, 1, 2, 1, 0, 2, 2, 0], dtype=int32)

In [55]:

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

manhattan_merged = manhattan_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
manhattan_merged = manhattan_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

manhattan_merged = manhattan_merged.sort_values('Neighborhood')
manhattan_merged.dropna(inplace=True)
manhattan_merged.reset_index(inplace=True)

manhattan_merged.head(40) # check the last columns!

Unnamed: 0,index,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,Venues
0,28,Manhattan,Battery Park City,40.711932,-74.016869,0.0,FastFood,Cafe,Restaurant,7.0
1,30,Manhattan,Carnegie Hill,40.782683,-73.953256,2.0,Restaurant,FastFood,Cafe,13.0
2,6,Manhattan,Central Harlem,40.815976,-73.943211,2.0,Restaurant,FastFood,Cafe,14.0
3,17,Manhattan,Chelsea,40.744035,-74.003116,1.0,Restaurant,FastFood,Cafe,13.0
4,1,Manhattan,Chinatown,40.715618,-73.994279,2.0,Restaurant,FastFood,Cafe,13.0
5,32,Manhattan,Civic Center,40.715229,-74.005415,1.0,Restaurant,FastFood,Cafe,13.0
6,14,Manhattan,Clinton,40.759101,-73.996119,0.0,Restaurant,FastFood,Cafe,5.0
7,7,Manhattan,East Harlem,40.792249,-73.944182,2.0,Restaurant,FastFood,Cafe,14.0
8,19,Manhattan,East Village,40.727847,-73.982226,2.0,Restaurant,FastFood,Cafe,15.0
9,29,Manhattan,Financial District,40.707107,-74.010665,0.0,Restaurant,FastFood,Cafe,9.0


Show the clusters

In [56]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters