# Coursera Final

In this final project we will use data from geocoder library and the FourSquare API to determine where to open a high level italian restourant based on the success of the tier 3 and 4 italian restourants in a specific neighborhood in New York city

In [288]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

import copy

print('Libraries imported.')

Libraries imported.


Get all the neighborhoods in New York city and plotting them in a folium map

In [4]:
#get coordinates of new york city neighboorhoods
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
print('Data downloaded!')

Data downloaded!


In [70]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [71]:
neighborhoods_data = newyork_data['features']

In [72]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [73]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [75]:
neighborhoods['Borough'].unique().size

5

In [76]:
neighborhoods['Neighborhood'].unique().size

302

In [77]:
nb_to_examin = neighborhoods['Borough'].unique()
nb_to_examin

array(['Bronx', 'Manhattan', 'Brooklyn', 'Queens', 'Staten Island'],
      dtype=object)

In [78]:
#create a list with the 5 dataframes
nb_dataframes=[]
i=0
for n in nb_to_examin:
    nb_dataframes.append(neighborhoods[neighborhoods['Borough'] == nb_to_examin[i]].reset_index(drop=True))
    i+=1

In [79]:
# create map of Manhattan using latitude and longitude values
ny_map = folium.Map(location=[40.730610, -73.935242], zoom_start=11)
i=0
colors = ['blue','red','green','orange','black']
# add markers to map
for n in nb_dataframes:
    for lat, lng, label in zip(nb_dataframes[i]['Latitude'], nb_dataframes[i]['Longitude'], nb_dataframes[i]['Neighborhood'] + ', '+nb_dataframes[i]['Borough']):
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color=colors[i],
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(ny_map)
    i+=1
    
    
ny_map

In [480]:
CLIENT_ID = 'TJHUBCYUEQZJ**************HPWRDNINFWBIX1ZBH' # your Foursquare ID
CLIENT_SECRET = 'EB2CX2KIWDL*************YRUPVJVXYZY41QGCSXJ1W4Z' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TJHUBCYUEQZJ**************HPWRDNINFWBIX1ZBH
CLIENT_SECRET:EB2CX2KIWDL*************YRUPVJVXYZY41QGCSXJ1W4Z


In [103]:
#function to collect the price tiers of all the venues in the dataframe
def getPriceTier(venues):
    price_list = []
    i=0
    e=0
    #iterate all the rows inside the dataframe
    for n in venues:
        e=0
        for index, row in venues[i].iterrows():
            venue_id = venues[i]['Id'][e]
            url = 'https://api.foursquare.com/v2/venues/'+venue_id+'?&client_id={}&client_secret={}&v={}'.format(CLIENT_ID, CLIENT_SECRET,VERSION)
            e+=1
            
            results = requests.get(url).json()["response"]['venue']
            
            if 'price' in results:
                name = results['name']
                price = results['price']['tier']
                price_list.append([name, price])
            else:
                name = results['name']
                price = 'NaN'
                price_list.append([name, price])
            
            print(price)
            
        
        final_price_list.append(price_list)
        i+=1
        price_list = []
        
    return(final_price_list)
                                                                                                                                

In [120]:
#get venues but excluding the places that are not italian restourants

def getNearbyVenuesFood(names, latitudes, longitudes, radius=1000):
    LIMIT=150
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        for v in results:
            if v['venue']['categories'][0]['icon']['prefix'].find('food') != -1 and v['venue']['categories'][0]['name'] == 'Italian Restaurant':
                venues_list.append([(
                    name,
                    lat, 
                    lng, 
                    v['venue']['name'],
                    v['venue']['id'],
                    v['venue']['location']['lat'], 
                    v['venue']['location']['lng'],  
                    v['venue']['categories'][0]['name'])])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue',
                  'Id',
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#get all the price and rating for the venues
def getPriceTierOnce(venues):
    price_list = []
    i=0
    e=venues.index[0]
    #iterate all the rows inside the dataframe
    for index, row in venues.iterrows():
        venue_id = venues['Id'][e]
        url = 'https://api.foursquare.com/v2/venues/'+venue_id+'?&client_id={}&client_secret={}&v={}'.format(CLIENT_ID, CLIENT_SECRET,VERSION)
        e+=1

        results = requests.get(url).json()["response"]['venue']

        if 'price' in results and 'rating' in results:
            v_id = results['id']
            price = results['price']['tier']
            rating = results['rating']
            price_list.append([v_id, price, rating])
        else:
            v_id = results['id']
            price = 'NaN'
            rating = 'NaN'
            price_list.append([v_id, price, rating])

        print(price)
        print(e)
        
    return price_list
                                

Get Italian Restaurants in all the neighborhoods

In [169]:
#cycle all the neighborhoods with the forsquare api to get the food venues arount those neighborhoods
food_data=[]
i=0
for n in nb_dataframes:
    food_data.append(getNearbyVenuesFood(names=nb_dataframes[i]['Neighborhood'],
                                   latitudes=nb_dataframes[i]['Latitude'],
                                   longitudes=nb_dataframes[i]['Longitude']
                                  ))
    i+=1

Getting the price data of the restaurant, this has to be done once at a time due to limits in api calls per day

In [168]:
bronx_price = []

bronx_price = getPriceTierOnce(food_data[0])

In [167]:
manhattan_price = []

manhattan_price = getPriceTierOnce(food_data[1])

In [166]:
brooklyn_price = []

brooklyn_price = getPriceTierOnce(food_data[2])

In [165]:
queens_price = []

queens_price = getPriceTierOnce(food_data[3])

In [164]:
sisland_price = []

sisland_price = getPriceTierOnce(food_data[4])

In [137]:
price_list = [bronx_price, manhattan_price, brooklyn_price, queens_price, sisland_price]
price_df = []
for n in price_list:
    merged = pd.DataFrame(n)
    merged.columns = ['Id', 'Tier', 'Review']
    price_df.afppend(merged)

In [144]:
final_df = []
i = 0
for n in price_df: 
    df_merged = food_data[i].merge(price_df[i], how='outer', left_index=True, right_index=True)
    i+=1
    final_df.append(df_merged)
    

In [160]:
i = 0
for n in final_df:
    final_df[i] = final_df[i].loc[final_df[i]['Tier'] != 'NaN']
    i+=1

In [163]:
i = 0
for n in final_df:
    n.to_csv(str(nb_to_examin[i])+".csv", index=False)
    i+=1

In [193]:
#final_df now contains all the info of neighborhoods italian restaurants and relative tier and price point
final_df[4]

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Id_x,Venue Latitude,Venue Longitude,Venue Category,Id_y,Tier,Review
0,St. George,40.644982,-74.079353,Enoteca Maria,4a271f0cf964a5205c911fe3,40.641941,-74.07732,Italian Restaurant,4a271f0cf964a5205c911fe3,2,8.3
1,St. George,40.644982,-74.079353,Pier 76 Italian Restaurant,4b76d8dbf964a520be632ee3,40.64014,-74.075654,Italian Restaurant,4b76d8dbf964a520be632ee3,1,8.5
2,New Brighton,40.640615,-74.087017,Enoteca Maria,4a271f0cf964a5205c911fe3,40.641941,-74.07732,Italian Restaurant,4a271f0cf964a5205c911fe3,2,8.3
3,New Brighton,40.640615,-74.087017,Pier 76 Italian Restaurant,4b76d8dbf964a520be632ee3,40.64014,-74.075654,Italian Restaurant,4b76d8dbf964a520be632ee3,1,8.5
4,Rosebank,40.615305,-74.069805,Tony's Brick Oven Pizzeria,4bcc8e3bb6c49c7422ed9391,40.61518,-74.067279,Italian Restaurant,4bcc8e3bb6c49c7422ed9391,2,8.1
5,Rosebank,40.615305,-74.069805,Bin 5,4e179bdc52b123a586ceea11,40.613278,-74.065459,Italian Restaurant,4e179bdc52b123a586ceea11,2,8.0
6,Rosebank,40.615305,-74.069805,Da Noi Fingerboard,4bbb8986e452952190d154a4,40.60699,-74.06678,Italian Restaurant,4bbb8986e452952190d154a4,2,7.7
8,West Brighton,40.631879,-74.107182,Panini Grill,4b5b58dbf964a52088f628e3,40.630137,-74.108156,Italian Restaurant,4b5b58dbf964a52088f628e3,2,8.4
9,West Brighton,40.631879,-74.107182,Cafe Milano,4bf872974a67c928b57725cf,40.631014,-74.102905,Italian Restaurant,4bf872974a67c928b57725cf,3,7.7
10,West Brighton,40.631879,-74.107182,Pastosa's,4b6b0b96f964a52001ef2be3,40.629078,-74.114608,Italian Restaurant,4b6b0b96f964a52001ef2be3,2,7.7


Define functions and prepare dataframe for K-means algorithm

In [450]:
#return only venues that are above a specific tier
def getTier(dataf):
    tier_df = dataf.loc[dataf['Tier'] > 2 ]
    return tier_df
#collect the means of all the Restourants in a specific neighborhood
def collect(lst):
    ls = []
    for n in lst:
        n["Tier"] = pd.to_numeric(n["Tier"])
        n["Review"] = pd.to_numeric(n["Review"])
        df = n.groupby('Neighborhood').mean().reset_index()
        ls.append(df)
    return ls
#make the dataframe ready to be used in the k-means algo
def kPreparation(lst):
    ls = []
    for n in lst:
        n = n.drop(['Neighborhood Latitude', 'Neighborhood Longitude', 'Venue Latitude', 'Venue Longitude'], axis=1)
        ls.append(n)
    return ls

def kMeans(lst, n_clst):
    ls = []
    for n in lst:
        on_grouped_clustering = n.drop('Neighborhood', 1)

        # run k-means clustering
        kmeans = KMeans(n_clusters=n_clst, random_state=0).fit(on_grouped_clustering)
        
        ls.append(kmeans.labels_)
    return ls
#get a new dataframe with the results of a single cluster
def getFinal(dataf, numbr):
    tier_df = dataf.loc[dataf['Cluster Labels'] == numbr]
    return tier_df

In [358]:
neighborhoods_tiers = []
for i in final_df:
    neighborhoods_tiers.append(getTier(i))

In [359]:
neighborhoods_tiers[0]

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Id_x,Venue Latitude,Venue Longitude,Venue Category,Id_y,Tier,Review
4,Baychester,40.866858,-73.835798,Fratelli's,4c9518076b35a143d5dc21dc,40.863019,-73.843607,Italian Restaurant,4c9518076b35a143d5dc21dc,3,8.7
6,Pelham Parkway,40.857413,-73.854756,Enzo's,4bf96ae65317a593d837017f,40.854232,-73.854362,Italian Restaurant,4bf96ae65317a593d837017f,3,8.2
9,City Island,40.847247,-73.786488,Artie's Steak and Seafood,4514ed4df964a520e5391fe3,40.849542,-73.787317,Italian Restaurant,4514ed4df964a520e5391fe3,4,8.5
11,City Island,40.847247,-73.786488,Portofino Restaurant,4b80a1c3f964a520488230e3,40.853321,-73.790693,Italian Restaurant,4b80a1c3f964a520488230e3,3,5.7
14,Fordham,40.860997,-73.896427,Michaelangelo's Coal Fired Brick Oven Pizza Re...,4af71916f964a520cf0522e3,40.857412,-73.886468,Italian Restaurant,4af71916f964a520cf0522e3,3,8.4
15,Fordham,40.860997,-73.896427,Dominick's Restaurant,4aa425bef964a5208e4520e3,40.854215,-73.888765,Italian Restaurant,4aa425bef964a5208e4520e3,3,8.5
16,Fordham,40.860997,-73.896427,Emilia's Restaurant,4b5a4c8ff964a52049bb28e3,40.85412,-73.88891,Italian Restaurant,4b5a4c8ff964a52049bb28e3,3,8.2
19,Fordham,40.860997,-73.896427,Ann & Tony's - An Original Arthur Avenue Resta...,4bc67f1104e8b713a833362d,40.855704,-73.887449,Italian Restaurant,4bc67f1104e8b713a833362d,3,7.0
21,Melrose,40.819754,-73.909422,Venice Restaurant & Pizzeria,4bb692672ea19521a6ecab2f,40.812799,-73.907335,Italian Restaurant,4bb692672ea19521a6ecab2f,3,7.3
27,Throgs Neck,40.815109,-73.81635,Spoto's Italian Restaurant,4c1d717b8b3aa59363a99a5f,40.820399,-73.817702,Italian Restaurant,4c1d717b8b3aa59363a99a5f,3,6.5


In [435]:

final_ls = collect(neighborhoods_tiers)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [441]:
final_ls[0]
final_to_k = final_ls[0].append(final_ls[1].append(final_ls[2].append(final_ls[3].append(final_ls[4])))).reset_index()

In [442]:
#prepare the dataframe to go under the k-means algo
kdf = final_to_k.drop(['index', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue Latitude', 'Venue Longitude'], axis=1)

In [444]:
kdf

Unnamed: 0,Neighborhood,Tier,Review
0,Baychester,3.0,8.7
1,Belmont,3.0,8.3
2,Bronxdale,3.0,8.5
3,City Island,3.5,7.1
4,Edgewater Park,3.0,6.5
5,Fordham,3.0,8.025
6,Melrose,3.0,7.3
7,Morris Park,3.0,8.2
8,North Riverdale,3.0,6.9
9,Pelham Bay,3.0,6.8


Cluster Neighborhoods based on means of tier and reviews

In [446]:
#number of cluster to create
kclusters = 4

#drop neighborhood column
on_grouped_clustering = kdf.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(on_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 0, 1, 2, 2, 0, 0, 0, 2, 2, 1, 0, 2, 2, 1, 3, 1, 3, 1, 1, 3, 1,
       0, 1, 1, 1, 1, 0, 3, 3, 1, 1, 1, 3, 3, 3, 1, 3, 1, 1, 3, 1, 0, 3,
       3, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 2, 1, 0, 3, 2, 0, 2,
       2, 1, 0, 0, 0, 0, 3, 1, 0, 0, 2, 3, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 3, 0, 2], dtype=int32)

In [447]:
#merge neighborhood data and means of tier and review with their clusterized data
final_merged = copy.copy(final_to_k)

final_merged['Cluster Labels'] = kmeans.labels_
    
final_merged


Unnamed: 0,index,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Latitude,Venue Longitude,Tier,Review,Cluster Labels
0,0,Baychester,40.866858,-73.835798,40.863019,-73.843607,3.0,8.7,1
1,1,Belmont,40.857277,-73.888452,40.854571,-73.888366,3.0,8.3,0
2,2,Bronxdale,40.852723,-73.861726,40.851499,-73.858302,3.0,8.5,1
3,3,City Island,40.847247,-73.786488,40.851432,-73.789005,3.5,7.1,2
4,4,Edgewater Park,40.821986,-73.813885,40.820399,-73.817702,3.0,6.5,2
5,5,Fordham,40.860997,-73.896427,40.855363,-73.887898,3.0,8.025,0
6,6,Melrose,40.819754,-73.909422,40.812799,-73.907335,3.0,7.3,0
7,7,Morris Park,40.847549,-73.850402,40.854232,-73.854362,3.0,8.2,0
8,8,North Riverdale,40.908543,-73.904531,40.906483,-73.903965,3.0,6.9,2
9,9,Pelham Bay,40.850641,-73.832074,40.853225,-73.827305,3.0,6.8,2


In [479]:
#visualize the clusterized neighborhoods
map_clusters = folium.Map(location=[40.730610, -73.935242], zoom_start=11)

# add markers to the map
markers_colors = ['blue','red','green','orange','black']
i=0
k=0

for lat, lon, poi, cluster in zip(final_merged['Neighborhood Latitude'], final_merged['Neighborhood Longitude'], final_merged['Neighborhood'], final_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=markers_colors[final_merged['Cluster Labels'][i]],
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_clusters)
    i+=1
map_clusters

In [471]:
map_clusters.save(outfile='allRestScores.html')

In [458]:
#create dataframe with only members of cluster 2, the ones that have a very high Tier but very low review
interested_df = getFinal(final_merged, 2).reset_index()

In [462]:
interested_df

Unnamed: 0,level_0,index,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Latitude,Venue Longitude,Tier,Review,Cluster Labels
0,3,3,City Island,40.847247,-73.786488,40.851432,-73.789005,3.5,7.1,2
1,4,4,Edgewater Park,40.821986,-73.813885,40.820399,-73.817702,3.0,6.5,2
2,8,8,North Riverdale,40.908543,-73.904531,40.906483,-73.903965,3.0,6.9,2
3,9,9,Pelham Bay,40.850641,-73.832074,40.853225,-73.827305,3.0,6.8,2
4,12,12,Schuylerville,40.82658,-73.826203,40.820399,-73.817702,3.0,6.5,2
5,13,13,Throgs Neck,40.815109,-73.81635,40.820399,-73.817702,3.0,6.5,2
6,59,12,Midwood,40.625596,-73.957595,40.623627,-73.964843,3.0,7.0,2
7,63,1,Auburndale,40.76173,-73.791762,40.765464,-73.788901,3.0,5.7,2
8,65,3,Beechhurst,40.792781,-73.804365,40.78855,-73.813497,3.0,6.45,2
9,66,4,Bellerose,40.728573,-73.720128,40.725422,-73.720646,3.0,6.3,2


In [478]:
#visualize interested_df
map_clusters = folium.Map(location=[40.730610, -73.935242], zoom_start=10)

# add markers to the map
markers_colors = ['blue','red','green','orange','black']
i=0

for lat, lon, poi, cluster in zip(interested_df['Neighborhood Latitude'], interested_df['Neighborhood Longitude'], interested_df['Neighborhood'], interested_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=markers_colors[int(interested_df['Tier'][i])],
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_clusters)

    i+=1
map_clusters

In [472]:
map_clusters.save(outfile='possibleCandidates.html')