 # Capstone Project - The Battle of Neighborhoods

## Import Libraries

In this section we import the libraries that will be required to process the data.

In [143]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

!pip install geopy
!pip install wget
from geopy.geocoders import Nominatim
import urllib.request
import json
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.colors as colors
%matplotlib inline
from sklearn.cluster import KMeans

import folium




## Download and Explore Dataset


Download and Explore Dataset
New York City has a total of 5 boroughs and 306 neighborhoods. In order to segment the neighborhoods and explore them, we need a dataset that contains the 5 boroughs and the neighborhoods that exist in each borough, as well as the latitude and longitude coordinates of each neighborhood.

The link to the dataset: https://cocl.us/new_york_dataset

In [144]:
# Download the New York neighborhoods dataset with `requests` instead of a
# quiet shell `wget`, so an HTTP failure raises here rather than surfacing
# later as a confusing JSON decode error (and so the cell does not depend on
# a `wget` binary being installed).
download_response = requests.get('https://cocl.us/new_york_dataset', timeout=60)
download_response.raise_for_status()
with open('newyork_data.json', 'wb') as json_file:
    json_file.write(download_response.content)
print('Data downloaded!')

# Parse the saved file into a Python dict.
with open('newyork_data.json', encoding='utf-8') as json_data:
    newyork_data = json.load(json_data)

Data downloaded!


#### Transform the data into a *pandas* dataframe

In [145]:
# Each entry of the 'features' list describes one neighborhood.
neighborhoods_data = newyork_data['features']

column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

# Collect one record per neighborhood and build the DataFrame once.
# Growing a frame with DataFrame.append inside a loop is quadratic and the
# method was removed in pandas 2.0.
neighborhood_records = []
for data in neighborhoods_data:
    # Coordinates are read as [longitude, latitude]: index 0 is the longitude.
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_records.append({
        'Borough': data['properties']['borough'],
        'Neighborhood': data['properties']['name'],
        'Latitude': neighborhood_latlon[1],
        'Longitude': neighborhood_latlon[0],
    })

# Passing `columns` keeps the column order fixed (and the columns present)
# even when no records were collected.
neighborhoods = pd.DataFrame(neighborhood_records, columns=column_names)

In [146]:
# Display the first rows of the neighborhoods frame as a quick sanity check.
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


#### Use of geopy library to get the latitude and longitude values of New York City.

In [147]:
address = 'New York City, NY'

# Resolve the address to coordinates through the Nominatim geocoder.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on the attribute access below.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


#### Map of New York with the Manhattan neighborhoods superimposed on top.

In [148]:
# Keep only the Manhattan rows; the index is reset so it runs 0..n-1.
manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)

# Base map centred on the geocoded New York City coordinates.
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=11)

# One blue circle marker per Manhattan neighborhood, with a
# "<neighborhood>, <borough>" popup.
for lat, lng, borough, neighborhood in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Borough'], manhattan_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html belongs to the Popup, so it is not repeated on the marker.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5).add_to(map_newyork)

# Last expression of the cell: renders the map.
map_newyork

## Foursquare venues


In [149]:
import urllib
def getNearbyVenues(names, latitudes, longitudes, radius=5000, categoryIds=''):
    """Search Foursquare for venues around each neighborhood.

    Issues one request to the v2 ``venues/search`` endpoint per
    (name, latitude, longitude) triple and collects the returned venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences describing the neighborhoods to search around.
    radius : int, default 5000
        Value sent as the ``radius`` query parameter.
    categoryIds : str, default ''
        Value sent as the ``categoryId`` query parameter; the parameter is
        omitted when this is the empty string.

    Returns
    -------
    pandas.DataFrame
        One row per venue that has at least one category, with the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        The frame is empty (but keeps these columns) when nothing is found.

    Raises
    ------
    RuntimeError
        If a response does not contain the expected ``response -> venues``
        structure (for example an API error payload).

    Notes
    -----
    Reads the notebook-level names CLIENT_ID, CLIENT_SECRET, VERSION and
    LIMIT at call time, so they must be defined before calling.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    search_url = 'https://api.foursquare.com/v2/venues/search'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests encode the query string rather than formatting the
        # URL by hand; this also keeps the credentials out of any message
        # this function produces.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        if categoryIds != '':
            params['categoryId'] = categoryIds

        payload = requests.get(search_url, params=params, timeout=30).json()
        try:
            results = payload['response']['venues']
        except (KeyError, TypeError) as err:
            raise RuntimeError(
                'Unexpected Foursquare response for {!r}: {!r}'.format(name, payload)
            ) from err

        for v in results:
            # Venues without a usable first category are skipped.
            try:
                category = v['categories'][0]['name']
            except (KeyError, IndexError, TypeError):
                continue

            venue_rows.append((
                name,
                lat,
                lng,
                v['name'],
                v['location']['lat'],
                v['location']['lng'],
                category,
            ))

    # Passing the column names to the constructor keeps the schema stable
    # even when no venue was collected.
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [150]:
import os
import warnings

# Query settings for the Foursquare requests.
LIMIT = 500  # NOTE(review): the venue counts shown further down never exceed 50 per neighborhood; the API may cap this - confirm
radius = 5000
VERSION = '20181020'

# Credentials are read from the environment so they are not stored in (or
# shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests will be rejected.'
    )

In [151]:
# Narrow the neighborhoods frame to Manhattan (this rebinds `neighborhoods`).
neighborhoods = neighborhoods.loc[neighborhoods['Borough'].eq('Manhattan')].reset_index(drop=True)

# Search within 1000 of each neighborhood centre, filtered by one category id.
# NOTE(review): the rendered output suggests this id selects Thai restaurants
# (plus a few related categories) - confirm against the Foursquare category list.
newyork_venues_thai = getNearbyVenues(
    names=neighborhoods['Neighborhood'],
    latitudes=neighborhoods['Latitude'],
    longitudes=neighborhoods['Longitude'],
    radius=1000,
    categoryIds='4bf58dd8d48988d149941735',
)
newyork_venues_thai.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Siam Square,40.878796,-73.916701,Thai Restaurant
1,Marble Hill,40.876551,-73.91066,Nam Thai,40.886388,-73.910025,Thai Restaurant
2,Chinatown,40.715618,-73.994279,Noree Thai Bazaar,40.7179,-73.992966,Thai Restaurant
3,Chinatown,40.715618,-73.994279,Wayla,40.718291,-73.992584,Thai Restaurant
4,Chinatown,40.715618,-73.994279,Eat Gai,40.717773,-73.98829,Thai Restaurant


In [152]:
# (rows, columns) of the collected venue table.
newyork_venues_thai.shape

(1011, 7)

In [153]:
def addToMap(df, color, existingMap):
    """Add one circle marker per row of ``df`` to ``existingMap``.

    ``df`` must provide the columns 'Venue Latitude', 'Venue Longitude',
    'Neighborhood', 'Venue' and 'Venue Category'. Each marker is drawn in
    ``color`` (outline and fill) and carries a popup reading
    "<venue> (<category>) - <neighborhood>". The map is modified in place
    and nothing is returned.
    """
    marker_fields = ['Venue Latitude', 'Venue Longitude', 'Neighborhood', 'Venue', 'Venue Category']
    for venue_lat, venue_lng, hood, venue_name, category in df[marker_fields].itertuples(index=False, name=None):
        popup_text = '{} ({}) - {}'.format(venue_name, category, hood)
        popup = folium.Popup(popup_text, parse_html=True)
        marker = folium.CircleMarker(
            [venue_lat, venue_lng],
            radius=5,
            popup=popup,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
        )
        marker.add_to(existingMap)

In [154]:
# Fresh map centred on New York City, one zoom level further out than the
# neighborhood map, with every collected venue drawn as a red marker.
map_newyork_thai = folium.Map(zoom_start=10, location=[latitude, longitude])
addToMap(df=newyork_venues_thai, color='red', existingMap=map_newyork_thai)

# Last expression of the cell: renders the map.
map_newyork_thai

In [155]:
def addColumn(startDf, columnTitle, dataDf):
    """Add a per-neighborhood venue-count column to ``startDf`` in place.

    Parameters
    ----------
    startDf : pandas.DataFrame
        Frame with a 'Neighborhood' column; it receives the new column.
    columnTitle : str
        Name of the column to create (or overwrite) on ``startDf``.
    dataDf : pandas.DataFrame
        Venue table with 'Neighborhood' and 'Venue' columns. Non-null
        'Venue' entries are counted per neighborhood.

    Returns
    -------
    None
        ``startDf`` is modified in place. Neighborhoods that do not occur in
        ``dataDf`` get a count of 0; counts are stored as integers.

    Raises
    ------
    KeyError
        If a required column is missing. This is no longer silently turned
        into a column of zeros.
    """
    venue_counts = dataDf.groupby('Neighborhood')['Venue'].count()
    # Vectorised lookup: unmatched neighborhoods map to NaN, which becomes 0.
    startDf[columnTitle] = (
        startDf['Neighborhood'].map(venue_counts).fillna(0).astype(int)
    )

In [156]:
# Per-neighborhood count of non-null entries in every venue column.
manhattan_grouped = newyork_venues_thai.groupby(by='Neighborhood').count()
print(manhattan_grouped)

# Number of distinct venue categories across all collected venues.
print('There are {} uniques categories.'.format(
    newyork_venues_thai['Venue Category'].nunique(dropna=False)
))

                     Neighborhood Latitude  Neighborhood Longitude  Venue  \
Neighborhood                                                                
Battery Park City                       10                      10     10   
Carnegie Hill                           25                      25     25   
Central Harlem                           6                       6      6   
Chelsea                                 31                      31     31   
Chinatown                               36                      36     36   
Civic Center                            23                      23     23   
Clinton                                 48                      48     48   
East Harlem                             10                      10     10   
East Village                            49                      49     49   
Financial District                      11                      11     11   
Flatiron                                46                      46     46   

## 3. Analyze Each Neighborhood

In [157]:
# One indicator column per venue category; the empty prefix and separator
# make the column names equal to the category names.
manhattan_onehot = pd.get_dummies(
    newyork_venues_thai[['Venue Category']],
    prefix="",
    prefix_sep="",
)

# Re-attach the neighborhood each venue row belongs to.
manhattan_onehot['Neighborhood'] = newyork_venues_thai['Neighborhood']

# Rotate the last column to the front. That column is 'Neighborhood' as long
# as no venue category is itself named 'Neighborhood'.
fixed_columns = list(manhattan_onehot.columns)
fixed_columns.insert(0, fixed_columns.pop())
manhattan_onehot = manhattan_onehot[fixed_columns]

manhattan_onehot.head()

Unnamed: 0,Neighborhood,Asian Restaurant,Chinese Restaurant,Filipino Restaurant,Food Truck,Indian Restaurant,Japanese Restaurant,Malay Restaurant,Ramen Restaurant,Sushi Restaurant,Thai Restaurant,Vietnamese Restaurant,Wine Bar
0,Marble Hill,0,0,0,0,0,0,0,0,0,1,0,0
1,Marble Hill,0,0,0,0,0,0,0,0,0,1,0,0
2,Chinatown,0,0,0,0,0,0,0,0,0,1,0,0
3,Chinatown,0,0,0,0,0,0,0,0,0,1,0,0
4,Chinatown,0,0,0,0,0,0,0,0,0,1,0,0


In [158]:
# Mean of each indicator column per neighborhood, i.e. the share of that
# neighborhood's venues falling in each category. 'Neighborhood' stays a
# regular column. Note this rebinds `manhattan_grouped`.
manhattan_grouped = manhattan_onehot.groupby('Neighborhood', as_index=False).mean()
manhattan_grouped

Unnamed: 0,Neighborhood,Asian Restaurant,Chinese Restaurant,Filipino Restaurant,Food Truck,Indian Restaurant,Japanese Restaurant,Malay Restaurant,Ramen Restaurant,Sushi Restaurant,Thai Restaurant,Vietnamese Restaurant,Wine Bar
0,Battery Park City,0.0,0.0,0.0,0.1,0.0,0.1,0.0,0.0,0.0,0.8,0.0,0.0
1,Carnegie Hill,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.96,0.0,0.0
2,Central Harlem,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,Chelsea,0.0,0.032258,0.0,0.032258,0.0,0.032258,0.032258,0.0,0.0,0.870968,0.0,0.0
4,Chinatown,0.083333,0.0,0.027778,0.0,0.0,0.0,0.027778,0.0,0.027778,0.805556,0.027778,0.0
5,Civic Center,0.043478,0.0,0.0,0.043478,0.0,0.043478,0.043478,0.0,0.0,0.826087,0.0,0.0
6,Clinton,0.020833,0.020833,0.0,0.041667,0.0,0.0,0.0,0.041667,0.0,0.875,0.0,0.0
7,East Harlem,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,East Village,0.040816,0.040816,0.020408,0.020408,0.0,0.0,0.0,0.0,0.040816,0.795918,0.020408,0.020408
9,Financial District,0.0,0.0,0.0,0.090909,0.0,0.090909,0.0,0.0,0.0,0.818182,0.0,0.0


In [159]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first element of ``row`` is skipped (callers pass rows whose first
    entry is the neighborhood name). The remaining entries are sorted in
    descending order and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked_labels = row.iloc[1:].sort_values(ascending=False).index
    return ranked_labels.values[:num_top_venues]

In [160]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: '1st', '2nd', '3rd', then 'Nth' for the remaining ranks.
# The suffix is chosen with an explicit bounds check rather than by catching
# the IndexError with a bare except.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# One row per neighborhood: its name followed by its top categories. The
# frame is built in a single step instead of being filled row by row.
top_venue_rows = [
    [row['Neighborhood'], *return_most_common_venues(row, num_top_venues)]
    for _, row in manhattan_grouped.iterrows()
]
neighborhoods_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns)

neighborhoods_venues_sorted.head(7)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Battery Park City,Thai Restaurant,Japanese Restaurant,Food Truck,Wine Bar,Vietnamese Restaurant,Sushi Restaurant,Ramen Restaurant,Malay Restaurant,Indian Restaurant,Filipino Restaurant
1,Carnegie Hill,Thai Restaurant,Asian Restaurant,Wine Bar,Vietnamese Restaurant,Sushi Restaurant,Ramen Restaurant,Malay Restaurant,Japanese Restaurant,Indian Restaurant,Food Truck
2,Central Harlem,Thai Restaurant,Wine Bar,Vietnamese Restaurant,Sushi Restaurant,Ramen Restaurant,Malay Restaurant,Japanese Restaurant,Indian Restaurant,Food Truck,Filipino Restaurant
3,Chelsea,Thai Restaurant,Malay Restaurant,Japanese Restaurant,Food Truck,Chinese Restaurant,Wine Bar,Vietnamese Restaurant,Sushi Restaurant,Ramen Restaurant,Indian Restaurant
4,Chinatown,Thai Restaurant,Asian Restaurant,Vietnamese Restaurant,Sushi Restaurant,Malay Restaurant,Filipino Restaurant,Wine Bar,Ramen Restaurant,Japanese Restaurant,Indian Restaurant
5,Civic Center,Thai Restaurant,Malay Restaurant,Japanese Restaurant,Food Truck,Asian Restaurant,Wine Bar,Vietnamese Restaurant,Sushi Restaurant,Ramen Restaurant,Indian Restaurant
6,Clinton,Thai Restaurant,Ramen Restaurant,Food Truck,Chinese Restaurant,Asian Restaurant,Wine Bar,Vietnamese Restaurant,Sushi Restaurant,Malay Restaurant,Japanese Restaurant


Cluster Neighborhoods


In [161]:
kclusters = 7

# Cluster on the category frequencies only. `columns=` replaces the
# positional axis argument, which pandas 2.0 no longer accepts.
manhattan_grouped_clustering = manhattan_grouped.drop(columns='Neighborhood')

# n_init is pinned to the historical default of 10 so that newer scikit-learn
# releases (whose default changed) keep producing the same labels for this seed.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(manhattan_grouped_clustering)

# Peek at the labels assigned to the first ten neighborhoods.
kmeans.labels_[0:10]

array([5, 1, 1, 6, 0, 6, 3, 1, 0, 5])

In [None]:
# Attach the cluster labels as the first column. Any column left over from a
# previous run of this cell is dropped first, so the cell can be re-executed
# (DataFrame.insert refuses to add a column that already exists).
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Add cluster label and top venues to every Manhattan neighborhood.
# Neighborhoods without any collected venue have no label after the join;
# those rows are dropped so the labels can be kept as integers (the join
# turns the column into floats as soon as one value is missing).
manhattan_merged = (
    manhattan_data
    .join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
    .dropna(subset=['Cluster Labels'])
    .astype({'Cluster Labels': int})
)

manhattan_merged.head()

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One colour per cluster, sampled evenly from the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    # Neighborhoods without a cluster label cannot be coloured; skip them.
    if pd.isna(cluster):
        continue
    # Labels may arrive as floats after a join; list indexing needs an int.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index cluster-1 keeps the original colour assignment: cluster 0 wraps
    # around to the last colour, so every cluster still gets its own colour.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

# Last expression of the cell: renders the map.
map_clusters

In [None]:
# Rows in cluster 0, showing the column at position 1 and every column from position 5 onward.
manhattan_merged[manhattan_merged['Cluster Labels'] == 0].iloc[:, [1] + list(range(5, manhattan_merged.shape[1]))]

In [None]:
# Rows in cluster 1, showing the column at position 1 and every column from position 5 onward.
manhattan_merged[manhattan_merged['Cluster Labels'] == 1].iloc[:, [1] + list(range(5, manhattan_merged.shape[1]))]


In [None]:
# Rows in cluster 2, showing the column at position 1 and every column from position 5 onward.
manhattan_merged[manhattan_merged['Cluster Labels'] == 2].iloc[:, [1] + list(range(5, manhattan_merged.shape[1]))]


In [None]:
# Rows in cluster 3, showing the column at position 1 and every column from position 5 onward.
manhattan_merged[manhattan_merged['Cluster Labels'] == 3].iloc[:, [1] + list(range(5, manhattan_merged.shape[1]))]


In [None]:
# Rows in cluster 4, showing the column at position 1 and every column from position 5 onward.
manhattan_merged[manhattan_merged['Cluster Labels'] == 4].iloc[:, [1] + list(range(5, manhattan_merged.shape[1]))]


In [None]:
# Rows in cluster 5, showing the column at position 1 and every column from position 5 onward.
manhattan_merged[manhattan_merged['Cluster Labels'] == 5].iloc[:, [1] + list(range(5, manhattan_merged.shape[1]))]


In [None]:
# Rows in cluster 6, showing the column at position 1 and every column from position 5 onward.
manhattan_merged[manhattan_merged['Cluster Labels'] == 6].iloc[:, [1] + list(range(5, manhattan_merged.shape[1]))]
