# The Battle of Neighborhoods

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

!conda install -c conda-forge geocoder --yes
import geocoder

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


# **Data**

### Geographical coordinates of NY neighborhoods

The link https://geo.nyu.edu/catalog/nyu_2451_34572 contains a dataset with the latitude and longitude coordinates of each of the 306 neighborhoods in the 5 boroughs of New York City.

In [3]:
# Download the dataset as GeoJSON and keep a local copy in 'ny_data.json'
NY_DATA_URL = 'https://geo.nyu.edu/download/file/nyu-2451-34572-geojson.json'
NY_DATA_PATH = 'ny_data.json'

# `requests` replaces `!wget -q`: a failed download now raises immediately
# instead of silently leaving an empty or partial file for json.load to trip on.
response = requests.get(NY_DATA_URL, timeout=60)
response.raise_for_status()
with open(NY_DATA_PATH, 'wb') as raw_file:
    raw_file.write(response.content)

with open(NY_DATA_PATH, encoding='utf-8') as json_data:
    newyork_data = json.load(json_data)

Check the structure of the data

In [4]:
# Inspect the top-level keys of the parsed GeoJSON dictionary
newyork_data.keys()

dict_keys(['type', 'totalFeatures', 'features', 'crs', 'bbox'])

The important information is contained in the field *features*

In [5]:
# The per-neighborhood records are stored under the 'features' key
neighborhoods_data = newyork_data['features']
print("Number of neighborhoods: ", len(neighborhoods_data), sep="")

Number of neighborhoods: 306


In [6]:
# Show the first feature as an example of the record stored for each neighborhood
neighborhoods_data[0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

Convert the information into a dataframe

In [7]:
# Column names (and order) of the neighborhoods table
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# Create an empty dataframe that only carries those columns
neighborhoods = pd.DataFrame(columns=column_names)

In [8]:
# Fill the dataframe with one row per GeoJSON feature.
# Rows are collected in a list and converted in a single call: DataFrame.append
# copied the whole frame on every iteration and was removed in pandas 2.0.
neighborhood_records = []
for data in neighborhoods_data:
    properties = data['properties']
    # the coordinates list holds the longitude first, then the latitude
    neighborhood_lon, neighborhood_lat = data['geometry']['coordinates'][:2]

    neighborhood_records.append({'Borough': properties['borough'],
                                 'Neighborhood': properties['name'],
                                 'Latitude': neighborhood_lat,
                                 'Longitude': neighborhood_lon})

neighborhoods = pd.DataFrame(neighborhood_records, columns=column_names)

In [9]:
# Preview the first rows of the neighborhoods table
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


## Venues in New York City

The Foursquare API is used to explore the venues of the New York neighborhoods.

In [33]:
# define Foursquare Credentials and Version
# The values are read from environment variables so that they are never stored
# in the notebook; each one falls back to an empty string when the variable is
# not set.
import os

CLIENT_ID = os.environ.get("FOURSQUARE_CLIENT_ID", "") # Foursquare ID
CLIENT_SECRET = os.environ.get("FOURSQUARE_CLIENT_SECRET", "") # Foursquare Secret
VERSION = os.environ.get("FOURSQUARE_VERSION", "") # Foursquare version

print('Foursquare credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Cell outputs are saved with the notebook, so only show whether a secret is
# configured, never the secret itself.
print('CLIENT_SECRET:' + ('*' * 8 if CLIENT_SECRET else ''))

Foursquare credentails:
CLIENT_ID: 
CLIENT_SECRET:


In [11]:
# Maximum number of venues requested per neighborhood
LIMIT = 100

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Explore the top venues around each neighborhood with the Foursquare API.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Name and coordinates of every neighborhood to query. They are consumed
        in parallel with ``zip``.
    radius : int, default 500
        Value sent as the ``radius`` query parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. When no
        venue is found the frame is empty but still has these columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints every neighborhood name as it is queried.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # query parameters; requests builds and escapes the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; the timeout keeps one stalled connection from
        # hanging the whole run, and HTTP errors are raised explicitly instead
        # of surfacing later as a KeyError on the JSON payload
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params,
                                timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venues_list.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets None instead of raising IndexError
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor also works for an empty list
    nearby_venues = pd.DataFrame(venues_list, columns=venue_columns)

    return(nearby_venues)

In [13]:
# Query the venues around every neighborhood (arguments: names, latitudes, longitudes)
ny_venues = getNearbyVenues(
    neighborhoods['Neighborhood'],
    neighborhoods['Latitude'],
    neighborhoods['Longitude'],
)

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

In [14]:
# Report the total number of venues, then preview the table
print(str(len(ny_venues)) + " venues found by Foursquare API.")
ny_venues.head()

9826 venues found by Foursquare API.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
2,Wakefield,40.894705,-73.847201,Walgreens,40.896528,-73.8447,Pharmacy
3,Wakefield,40.894705,-73.847201,Rite Aid,40.896649,-73.844846,Pharmacy
4,Wakefield,40.894705,-73.847201,Dunkin',40.890459,-73.849089,Donut Shop


# **Methodology**

In [15]:
# Count the distinct venue categories
print('There are {} uniques categories.'.format(len(pd.unique(ny_venues['Venue Category']))))

There are 427 uniques categories.


In [16]:
# Show the first 40 distinct venue categories as an example
ny_venues['Venue Category'].unique()[:40]

array(['Dessert Shop', 'Ice Cream Shop', 'Pharmacy', 'Donut Shop',
       'Gas Station', 'Sandwich Place', 'Food', 'Laundromat',
       'Pizza Place', 'Discount Store', 'Mattress Store', 'Bagel Shop',
       'Grocery Store', 'Fast Food Restaurant', 'Restaurant',
       'Baseball Field', 'Liquor Store', 'Bus Station', 'Park',
       'Caribbean Restaurant', 'Diner', 'Seafood Restaurant',
       'Deli / Bodega', 'Bowling Alley', 'Bus Stop', 'Automotive Shop',
       'Metro Station', 'Juice Bar', 'Chinese Restaurant',
       'Cosmetics Shop', 'Plaza', 'River', 'Business Service', 'Bank',
       'Food Truck', 'Home Service', 'Gym', 'Playground', 'Gourmet Shop',
       'Latin American Restaurant'], dtype=object)

## Select only venues of interest: park, playground or ice cream shop.

It will be important to exclude neighborhoods where there are already ice cream shops, so that there is less competition. It will also be important to select neighborhoods where there are playgrounds or parks, so that there are more potential clients.

In [17]:
# keep only the venues whose category is a playground, a park or an ice cream shop
is_of_interest = ny_venues["Venue Category"].isin(["Playground", "Park", "Ice Cream Shop"])
ny_venues = ny_venues[is_of_interest].reset_index(drop=True)

In [18]:
# Report how many venues remain after the filter and preview the first ten
print("There are {} venues which are playgrounds, parks or ice cream shops.".format(len(ny_venues)))
ny_venues.iloc[:10]

There are 387 venues which are playgrounds, parks or ice cream shops.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
1,Co-op City,40.874294,-73.829939,Baskin-Robbins,40.870045,-73.829578,Ice Cream Shop
2,Co-op City,40.874294,-73.829939,The Park,40.877645,-73.830836,Park
3,Riverdale,40.890834,-73.912585,Bell Tower Park,40.889178,-73.908331,Park
4,Riverdale,40.890834,-73.912585,Seton Park,40.887914,-73.916113,Park
5,Riverdale,40.890834,-73.912585,Spuyten Duyvil Playground,40.887227,-73.916058,Playground
6,Kingsbridge,40.881687,-73.902818,Carvel Ice Cream,40.883657,-73.901655,Ice Cream Shop
7,Marble Hill,40.876551,-73.91066,Baskin-Robbins,40.877149,-73.906658,Ice Cream Shop
8,Woodlawn,40.898273,-73.867315,Muskrat Cove,40.896615,-73.862446,Park
9,Woodlawn,40.898273,-73.867315,Woodlawn Playground,40.899877,-73.872243,Playground


In [19]:
# one hot encoding of the venues categories
ny_onehot = pd.get_dummies(ny_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood and coordinates columns back to dataframe
ny_onehot['Neighborhood'] = ny_venues['Neighborhood']
ny_onehot['Neighborhood Latitude'] = ny_venues['Neighborhood Latitude']
ny_onehot['Neighborhood Longitude'] = ny_venues['Neighborhood Longitude']

ny_onehot.head()

Unnamed: 0,Ice Cream Shop,Park,Playground,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
0,1,0,0,Wakefield,40.894705,-73.847201
1,1,0,0,Co-op City,40.874294,-73.829939
2,0,1,0,Co-op City,40.874294,-73.829939
3,0,1,0,Riverdale,40.890834,-73.912585
4,0,1,0,Riverdale,40.890834,-73.912585


In [20]:
# put the neighborhood name and coordinates first, followed by the indicator columns
leading_columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude']
without_neigh = ny_onehot.drop(columns=leading_columns)

ny_onehot = pd.concat([ny_onehot[leading_columns], without_neigh], axis=1).reset_index(drop=True)
ny_onehot.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Ice Cream Shop,Park,Playground
0,Wakefield,40.894705,-73.847201,1,0,0
1,Co-op City,40.874294,-73.829939,1,0,0
2,Co-op City,40.874294,-73.829939,0,1,0
3,Riverdale,40.890834,-73.912585,0,1,0
4,Riverdale,40.890834,-73.912585,0,1,0


Group the venues by neighborhood

In [21]:
# Average the indicator columns per neighborhood.
# The coordinates are part of the grouping key so that two neighborhoods that
# share a name stay on separate rows; grouping by name alone would merge them
# and average their coordinates into a point that belongs to neither.
ny_grouped = ny_onehot.groupby(
    ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude']
).mean().reset_index()
ny_grouped.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Ice Cream Shop,Park,Playground
0,Annadale,40.538114,-74.178549,0.0,1.0,0.0
1,Arverne,40.589144,-73.791992,0.0,0.0,1.0
2,Astoria,40.768509,-73.915654,1.0,0.0,0.0
3,Astoria Heights,40.770317,-73.89468,0.0,0.0,1.0
4,Bath Beach,40.599519,-73.998752,0.5,0.5,0.0


## Select neighborhoods of interest

We are interested in neighborhoods that do not yet have an ice cream shop, so only the neighborhoods with no ice cream shop are selected.

In [22]:
# keep the rows whose ice cream shop share is exactly zero
no_ice_cream = ny_grouped["Ice Cream Shop"].eq(0)
ny_grouped = ny_grouped.loc[no_ice_cream].reset_index(drop=True)

print("There are {} neighborhoods without an ice cream shop but with a park or a playground in NY.".format(len(ny_grouped)))
ny_grouped.head()

There are 91 neighborhoods without an ice cream shop but with a park or a playground in NY.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Ice Cream Shop,Park,Playground
0,Annadale,40.538114,-74.178549,0.0,1.0,0.0
1,Arverne,40.589144,-73.791992,0.0,0.0,1.0
2,Astoria Heights,40.770317,-73.89468,0.0,0.0,1.0
3,Battery Park City,40.711932,-74.016869,0.0,0.75,0.25
4,Baychester,40.866858,-73.835798,0.0,0.0,1.0


In [23]:
# Confirm that no remaining neighborhood has an ice cream shop: the maximum of the column must be 0
ny_grouped["Ice Cream Shop"].max()

0.0

We can now exclude the Ice Cream Shop column from the dataframe

In [24]:
# the column is all zeros at this point, so it carries no information
ny_grouped = ny_grouped.drop(columns=["Ice Cream Shop"]).reset_index(drop=True)
ny_grouped.iloc[:10]

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Park,Playground
0,Annadale,40.538114,-74.178549,1.0,0.0
1,Arverne,40.589144,-73.791992,0.0,1.0
2,Astoria Heights,40.770317,-73.89468,0.0,1.0
3,Battery Park City,40.711932,-74.016869,0.75,0.25
4,Baychester,40.866858,-73.835798,0.0,1.0
5,Bayswater,40.611322,-73.765968,0.5,0.5
6,Bedford Park,40.870185,-73.885512,1.0,0.0
7,Bedford Stuyvesant,40.687232,-73.941785,1.0,0.0
8,Bergen Beach,40.61515,-73.898556,0.0,1.0
9,Bloomfield,40.605779,-74.187256,1.0,0.0


## Cluster the neighborhoods

For the current problem, the neighborhoods can be clustered into 3 groups: neighborhoods with parks, neighborhoods with playgrounds, and neighborhoods with both.
    
We will use the k-means algorithm to do this clustering.

In [25]:
# set number of clusters
kclusters = 3

# Cluster on the venue shares only: the name and coordinates are dropped.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas deprecated and no longer accepts from version 2.0 on.
ny_grouped_clustering = ny_grouped.drop(
    columns=['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude'])

# run k-means clustering; n_init is set explicitly because scikit-learn changed
# its default value, so the result stays the same across versions
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(ny_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

array([0, 1, 1, 2, 1, 2, 0, 0, 1, 0], dtype=int32)

In [26]:
# Combine the identifying columns, the cluster labels and the venue shares.
# The labels are added through a new Series instead of an in-place `insert`
# on ny_grouped_clustering, which raised "already exists" when the cell was
# run a second time.
location_columns = ["Neighborhood", "Neighborhood Latitude", "Neighborhood Longitude"]
cluster_labels = pd.Series(kmeans.labels_, index=ny_grouped.index, name="Cluster Labels")
ny_merged = pd.concat(
    [ny_grouped[location_columns], cluster_labels, ny_grouped.drop(columns=location_columns)],
    axis=1)

In [27]:
# Preview the neighborhoods together with their cluster label
ny_merged.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Cluster Labels,Park,Playground
0,Annadale,40.538114,-74.178549,0,1.0,0.0
1,Arverne,40.589144,-73.791992,1,0.0,1.0
2,Astoria Heights,40.770317,-73.89468,1,0.0,1.0
3,Battery Park City,40.711932,-74.016869,2,0.75,0.25
4,Baychester,40.866858,-73.835798,1,0.0,1.0


## **Results**

In [28]:
# Look up the coordinates used to centre the map
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address, timeout=10)
# geocode returns None when the address is not found; fail with a clear
# message instead of an AttributeError on the next line
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of New York City are 40.7127281, -74.0060152.


In [29]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# one color per cluster label; the list is named `cluster_colors` so that it
# does not overwrite the `matplotlib.colors` module imported as `colors`
cluster_colors = ["turquoise", "deepskyblue", "red"]

# add one circle marker per neighborhood, colored by its cluster
for lat, lon, poi, cluster in zip(ny_merged['Neighborhood Latitude'], ny_merged['Neighborhood Longitude'], ny_merged['Neighborhood'], ny_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    marker_color = cluster_colors[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

None of the neighborhoods in the results has an ice cream shop.

Cluster 0 (turquoise): neighborhoods with parks

In [30]:
# rows assigned to cluster 0, restricted to the name, label and venue shares
in_cluster_0 = ny_merged["Cluster Labels"].eq(0)
c0 = ny_merged[in_cluster_0][["Neighborhood", "Cluster Labels", "Park", "Playground"]].reset_index(drop=True)

print("There are {} neighborhoods with parks but without playgrounds".format(len(c0)))

c0.head()

There are 48 neighborhoods with parks but without playgrounds


Unnamed: 0,Neighborhood,Cluster Labels,Park,Playground
0,Annadale,0,1.0,0.0
1,Bedford Park,0,1.0,0.0
2,Bedford Stuyvesant,0,1.0,0.0
3,Bloomfield,0,1.0,0.0
4,Central Harlem,0,1.0,0.0


Cluster 1 (blue): neighborhoods with playground

In [31]:
# rows assigned to cluster 1, restricted to the name, label and venue shares
in_cluster_1 = ny_merged["Cluster Labels"].eq(1)
c1 = ny_merged[in_cluster_1][["Neighborhood", "Cluster Labels", "Park", "Playground"]].reset_index(drop=True)

print("There are {} neighborhoods with playgrounds but without parks".format(len(c1)))

c1.head()

There are 24 neighborhoods with playgrounds but without parks


Unnamed: 0,Neighborhood,Cluster Labels,Park,Playground
0,Arverne,1,0.0,1.0
1,Astoria Heights,1,0.0,1.0
2,Baychester,1,0.0,1.0
3,Bergen Beach,1,0.0,1.0
4,Briarwood,1,0.0,1.0


**Cluster 2 (red)**: neighborhoods with park and playground

In [32]:
# rows assigned to cluster 2, restricted to the name, label and venue shares
in_cluster_2 = ny_merged["Cluster Labels"].eq(2)
c2 = ny_merged[in_cluster_2][["Neighborhood", "Cluster Labels", "Park", "Playground"]].reset_index(drop=True)

print("There are {} neighborhoods with parks and playgrounds".format(len(c2)))

c2.head()

There are 19 neighborhoods with parks and playgrounds


Unnamed: 0,Neighborhood,Cluster Labels,Park,Playground
0,Battery Park City,2,0.75,0.25
1,Bayswater,2,0.5,0.5
2,Brownsville,2,0.666667,0.333333
3,Fort Greene,2,0.333333,0.666667
4,Inwood,2,0.666667,0.333333


# **Conclusion**

According to the data provided by the Foursquare API and https://geo.nyu.edu/catalog/nyu_2451_34572, there are **19 neighborhoods** in New York City where an ice cream shop could be opened. These neighborhoods are shown in red on the map. The selection criterion was neighborhoods without an ice cream shop that have recreation places such as parks or playgrounds.
Further studies could narrow this number down even more, for example by excluding places that already have dessert shops or by selecting a borough with a higher number of recreation places.