## Capstone Project – Battle between two North American cities (Toronto V/S New York)

#### 1. Importing the required libraries.

In [1]:
import pandas as pd
import requests
import numpy as np
import geocoder
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import json
import xml
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")
from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim 
from bs4 import BeautifulSoup

#### 2. Data Extraction and Cleaning

In [2]:
import requests

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wiki_url = requests.get(url)
wiki_url

<Response [200]>

In [3]:
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
table_contents=[]
source = requests.get(url).text
soup = BeautifulSoup(source)

table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if(row.span.text)=="Not assigned":
        pass
    else:
        #print(row.p.text[:3])
        #print((row.span.text).split('(')[0])
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).
                                 replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

#print(table_contents)
df = pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
            'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

df.head()    

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


**To check if there is any NULL values in the DataFrame**

In [4]:
df.isna().sum()

PostalCode      0
Borough         0
Neighborhood    0
dtype: int64

In [5]:
geo_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'
df_geo = pd.read_csv(geo_url)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
df_combined = df.join(df_geo.set_index('Postal Code'), on='PostalCode', how='inner')
df_combined

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [7]:
df_combined['Borough'].value_counts()

North York                24
Downtown Toronto          17
Scarborough               17
Etobicoke                 11
Central Toronto            9
West Toronto               6
York                       5
East York                  4
East Toronto               4
Mississauga                1
East York/East Toronto     1
Etobicoke Northwest        1
East Toronto Business      1
Queen's Park               1
Downtown Toronto Stn A     1
Name: Borough, dtype: int64

#### 3. Getting the latitude and longitude of "Toronto" and populating it's map

In [8]:
from geopy.geocoders import Nominatim 
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [9]:
import folium

# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_combined['Latitude'], df_combined['Longitude'], df_combined['Borough'], df_combined['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        ).add_to(map_Toronto)  
    
map_Toronto

In [10]:
CLIENT_ID = 'HCAIF02KOVJJPHNPQOXUZXVPPWPOH3X123XOAPMNFDPY50ON' # your Foursquare ID
CLIENT_SECRET = 'WTEGCK5G54FMWAVBLHQBSWX5FZJEJCMBHK4GBRHEC54P3UCZ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 50

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: HCAIF02KOVJJPHNPQOXUZXVPPWPOH3X123XOAPMNFDPY50ON
CLIENT_SECRET:WTEGCK5G54FMWAVBLHQBSWX5FZJEJCMBHK4GBRHEC54P3UCZ


**Fetching the details about the nearby venues in the city**

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=700):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # making GET request
        venue_results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in venue_results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [12]:
Toronto_venues = getNearbyVenues(names=df_combined['Neighborhood'],
                                   latitudes=df_combined['Latitude'],
                                   longitudes=df_combined['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills North
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth  East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
Downsview East
The Danforth

In [13]:
Toronto_venues.head(7)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,PetSmart,43.748639,-79.333488,Pet Store
2,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
3,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
4,Parkwoods,43.753259,-79.329656,The Bing Suites,43.747816,-79.33219,Bed & Breakfast
5,Parkwoods,43.753259,-79.329656,Joey,43.753441,-79.32164,Burger Joint
6,Parkwoods,43.753259,-79.329656,Three Valleys Park,43.751195,-79.337356,Park


In [14]:
print('There are {} Uniques Categories of Venues in Toronto.'.format(len(Toronto_venues['Venue Category'].unique())))
Toronto_venues.groupby(["Neighborhood"])['Venue Category'].value_counts()

There are 296 Uniques Categories of Venues in Toronto.


Neighborhood     Venue Category           
Agincourt        Badminton Court              1
                 Breakfast Spot               1
                 Clothing Store               1
                 Coffee Shop                  1
                 Latin American Restaurant    1
                                             ..
York Mills West  Bowling Alley                1
                 Convenience Store            1
                 Intersection                 1
                 Pet Store                    1
                 Tennis Court                 1
Name: Venue Category, Length: 1881, dtype: int64

**grouping on Neighborhood just to get one distinct neighborhood of each**

In [15]:
Toronto_venues.shape

(2480, 7)

In [16]:
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep=" ")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
Toronto_grp = Toronto_onehot.groupby(['Neighborhood']).mean().reset_index()
Toronto_grp.head()


Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [19]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
Toronto_neighborhoods_top10 = pd.DataFrame(columns=columns)
Toronto_neighborhoods_top10['Neighborhood'] = Toronto_grp['Neighborhood']

for ind in np.arange(Toronto_grp.shape[0]):
    Toronto_neighborhoods_top10.iloc[ind, 1:] = return_most_common_venues(Toronto_grp.iloc[ind, :], num_top_venues)

Toronto_neighborhoods_top10.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Skating Rink,Pool Hall,Newsagent,Latin American Restaurant,Badminton Court,Clothing Store,Coffee Shop,Sandwich Place,Lounge,Breakfast Spot
1,"Alderwood, Long Branch",Pizza Place,Convenience Store,Gym,Coffee Shop,Pub,Sandwich Place,Skating Rink,Gas Station,Donut Shop,Discount Store
2,"Bathurst Manor, Wilson Heights, Downsview North",Park,Coffee Shop,Bank,Pet Store,Fried Chicken Joint,Bridal Shop,Shopping Mall,Middle Eastern Restaurant,Mobile Phone Shop,Supermarket
3,Bayview Village,Bank,Playground,Café,Japanese Restaurant,Grocery Store,Intersection,Chinese Restaurant,Yoga Studio,Drugstore,Dog Run
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Coffee Shop,Sandwich Place,Grocery Store,Bagel Shop,Bakery,Bank,Sushi Restaurant,Juice Bar,Liquor Store


In [20]:
Toronto_neighborhoods_top10[
    Toronto_neighborhoods_top10['1st Most Common Venue'].str.contains("coffee", case=False)
    & Toronto_neighborhoods_top10['2nd Most Common Venue'].str.contains("restaurant", case=False)
]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Cedarbrae,Coffee Shop,Indian Restaurant,Thai Restaurant,Bakery,Caribbean Restaurant,Fried Chicken Joint,Chinese Restaurant,Athletics & Sports,Gas Station,Lounge
78,"St. James Town, Cabbagetown",Coffee Shop,Restaurant,Café,Park,Pub,Italian Restaurant,Japanese Restaurant,Diner,Bakery,Gastropub
81,"Summerhill West, Rathnelly, South Hill, Forest...",Coffee Shop,Italian Restaurant,Sushi Restaurant,Pizza Place,Thai Restaurant,Grocery Store,Restaurant,Café,Bank,Bagel Shop
95,Willowdale South,Coffee Shop,Ramen Restaurant,Pizza Place,Korean Restaurant,Sandwich Place,Café,Japanese Restaurant,Middle Eastern Restaurant,Bank,Dessert Shop


#### 4. Clustering the Neighbourhood

In [21]:
kclusters = 5

Toronto_X = Toronto_grp.drop('Neighborhood', axis=1)

#fitting the K-Means algo.
kmeans = KMeans(n_clusters=kclusters, random_state=3).fit(Toronto_X)
kmeans.labels_[0:10]

array([3, 1, 3, 1, 3, 3, 3, 3, 3, 0])

In [22]:
Toronto_neighborhoods_top10.insert(1,'Cluster Labels', kmeans.labels_)
#Toronto_neighborhoods_top10.drop('Neighborhood', axis=1, inplace=True)

Toronto_comp = df_combined
Toronto_comp = Toronto_comp.join(Toronto_neighborhoods_top10.set_index('Neighborhood'), 
                                 on='Neighborhood',
                                how='inner')
Toronto_comp.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,Park,Fast Food Restaurant,Food & Drink Shop,Pet Store,Bed & Breakfast,Burger Joint,Donut Shop,Drugstore,Dry Cleaner,Dumpling Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,3,Playground,Hockey Arena,Pizza Place,Intersection,Park,Coffee Shop,Sporting Goods Shop,Portuguese Restaurant,Discount Store,Comic Shop
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,3,Coffee Shop,Bakery,Café,Theater,Pub,Park,Breakfast Spot,Performing Arts Venue,Yoga Studio,Shoe Store
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,3,Clothing Store,Furniture / Home Store,Fast Food Restaurant,Vietnamese Restaurant,Coffee Shop,Accessories Store,Bowling Alley,Boutique,Men's Store,Seafood Restaurant
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494,3,Coffee Shop,Burrito Place,Sandwich Place,Japanese Restaurant,Park,Sushi Restaurant,Yoga Studio,Department Store,Spa,Bookstore


In [23]:
Toronto_comp[
    Toronto_comp['1st Most Common Venue'].str.contains("coffee", case=False)
    & Toronto_comp['2nd Most Common Venue'].str.contains("restaurant", case=False)
]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
26,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,3,Coffee Shop,Indian Restaurant,Thai Restaurant,Bakery,Caribbean Restaurant,Fried Chicken Joint,Chinese Restaurant,Athletics & Sports,Gas Station,Lounge
59,M2N,North York,Willowdale South,43.77012,-79.408493,3,Coffee Shop,Ramen Restaurant,Pizza Place,Korean Restaurant,Sandwich Place,Café,Japanese Restaurant,Middle Eastern Restaurant,Bank,Dessert Shop
86,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,3,Coffee Shop,Italian Restaurant,Sushi Restaurant,Pizza Place,Thai Restaurant,Grocery Store,Restaurant,Café,Bank,Bagel Shop
96,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,3,Coffee Shop,Restaurant,Café,Park,Pub,Italian Restaurant,Japanese Restaurant,Diner,Bakery,Gastropub


In [24]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_comp['Latitude'], Toronto_comp['Longitude'], Toronto_comp['Neighborhood'], Toronto_comp['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Exploring the NY data.
**For New York we will be creating the Data Frame from a Json file.**

In [25]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

neighborhoods_data = newyork_data['features']

column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
# instantiate the dataframe
NY_neighborhoods = pd.DataFrame(columns=column_names)

for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    NY_neighborhoods = NY_neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

NY_neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [26]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


In [27]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10.5)

# add markers to map
for lat, lng, borough, neighborhood in zip(NY_neighborhoods['Latitude'], NY_neighborhoods['Longitude'], NY_neighborhoods['Borough'], NY_neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='magenta',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [28]:
NY_venues = getNearbyVenues(names=NY_neighborhoods['Neighborhood'],
                                   latitudes=NY_neighborhoods['Latitude'],
                                   longitudes=NY_neighborhoods['Longitude']
                                  )

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

In [29]:
NY_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
2,Wakefield,40.894705,-73.847201,Rite Aid,40.896649,-73.844846,Pharmacy
3,Wakefield,40.894705,-73.847201,Walgreens,40.896528,-73.8447,Pharmacy
4,Wakefield,40.894705,-73.847201,Jackie's West Indian Bakery,40.889283,-73.84331,Caribbean Restaurant


In [30]:
print('There are {} Uniques Categories of Venues in New York.'.format(len(NY_venues['Venue Category'].unique())))
print(NY_venues.shape)

There are 440 Uniques Categories of Venues in New York.
(10655, 7)


New York is a much bigger city comapred to Toronto and lot of unique venues also as compared to that of Toronto. But we will categorise them with top 10 venues avaialble in a neighborhood.

In [31]:
NY_onehot = pd.get_dummies(NY_venues[['Venue Category']], prefix="", prefix_sep=" ")

# add neighborhood column back to dataframe
NY_onehot['Neighborhood'] = NY_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [NY_onehot.columns[-1]] + list(NY_onehot.columns[:-1])
NY_onehot = NY_onehot[fixed_columns]

NY_onehot.head()

Unnamed: 0,Neighborhood,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yemeni Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
NY_grp = NY_onehot.groupby(['Neighborhood']).mean().reset_index()
NY_grp.head()

Unnamed: 0,Neighborhood,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yemeni Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Allerton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Annadale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Arden Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arlington,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Arrochar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
NY_neighborhoods_top10 = pd.DataFrame(columns=columns)
NY_neighborhoods_top10['Neighborhood'] = NY_grp['Neighborhood']

for ind in np.arange(NY_grp.shape[0]):
    NY_neighborhoods_top10.iloc[ind, 1:] = return_most_common_venues(NY_grp.iloc[ind, :], num_top_venues)

NY_neighborhoods_top10.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Allerton,Donut Shop,Pizza Place,Fast Food Restaurant,Sandwich Place,Pharmacy,Martial Arts School,Supermarket,Bar,Bus Station,Discount Store
1,Annadale,Pizza Place,Restaurant,American Restaurant,Food,Train Station,Diner,Liquor Store,Pharmacy,Playground,Bar
2,Arden Heights,Elementary School,Pharmacy,Deli / Bodega,Coffee Shop,Home Service,Pizza Place,Zoo Exhibit,Fast Food Restaurant,Event Service,Event Space
3,Arlington,Boat or Ferry,Deli / Bodega,Bus Stop,American Restaurant,Playground,General Entertainment,Coffee Shop,Home Service,Fast Food Restaurant,Field
4,Arrochar,Deli / Bodega,Baseball Field,Bus Stop,Italian Restaurant,Pizza Place,Beach,Sculpture Garden,Martial Arts School,Sandwich Place,Liquor Store


In [34]:
NY_neighborhoods_top10[
    NY_neighborhoods_top10['1st Most Common Venue'].str.contains("coffee", case=False)
    & NY_neighborhoods_top10['2nd Most Common Venue'].str.contains("restaurant", case=False)
]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
44,Carroll Gardens,Coffee Shop,Italian Restaurant,Bakery,Pizza Place,Wine Bar,Bar,Thai Restaurant,Cocktail Bar,French Restaurant,Filipino Restaurant
73,Downtown,Coffee Shop,Chinese Restaurant,Grocery Store,Sandwich Place,Bookstore,Big Box Store,Steakhouse,Brewery,Bubble Tea Shop,Cocktail Bar
124,Hamilton Heights,Coffee Shop,Mexican Restaurant,Caribbean Restaurant,Café,Bakery,School,Cocktail Bar,Bar,Park,Yoga Studio
165,Manhattanville,Coffee Shop,Italian Restaurant,Mexican Restaurant,Seafood Restaurant,Chinese Restaurant,Park,Café,Bar,Latin American Restaurant,Boutique


In [35]:
kclusters = 5

NY_X = NY_grp.drop('Neighborhood', axis=1)

#fitting the K-Means algo.
kmeans = KMeans(n_clusters=kclusters, random_state=3).fit(NY_X)
kmeans.labels_[0:10]

array([1, 1, 1, 4, 1, 1, 4, 4, 1, 4])

In [36]:
NY_neighborhoods_top10.insert(1,'Cluster Labels', kmeans.labels_)

NY_comp = NY_neighborhoods
NY_comp = NY_comp.join(NY_neighborhoods_top10.set_index('Neighborhood'), 
                                 on='Neighborhood',
                                how='inner')
NY_comp.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bronx,Wakefield,40.894705,-73.847201,1,Caribbean Restaurant,Pharmacy,Fried Chicken Joint,Ice Cream Shop,Donut Shop,Fast Food Restaurant,Laundromat,Gas Station,Sandwich Place,Dessert Shop
1,Bronx,Co-op City,40.874294,-73.829939,1,Fast Food Restaurant,Restaurant,Post Office,Chinese Restaurant,Electronics Store,Trail,Bagel Shop,Grocery Store,Seafood Restaurant,Basketball Court
2,Bronx,Eastchester,40.887556,-73.827806,1,Caribbean Restaurant,Fast Food Restaurant,Deli / Bodega,Diner,Grocery Store,Auto Garage,Cocktail Bar,Automotive Shop,Nightclub,Chinese Restaurant
3,Bronx,Fieldston,40.895437,-73.905643,4,Plaza,Art Gallery,Café,River,Bus Station,Home Service,Event Space,Exhibit,Eye Doctor,Factory
4,Bronx,Riverdale,40.890834,-73.912585,4,Bank,Japanese Restaurant,Mexican Restaurant,Bagel Shop,Pizza Place,Park,Diner,Pharmacy,Thai Restaurant,Tapas Restaurant


In [37]:
NY_comp[
    NY_comp['1st Most Common Venue'].str.contains("coffee", case=False)
    & NY_comp['2nd Most Common Venue'].str.contains("restaurant", case=False)
]

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
66,Brooklyn,Carroll Gardens,40.68054,-73.994654,4,Coffee Shop,Italian Restaurant,Bakery,Pizza Place,Wine Bar,Bar,Thai Restaurant,Cocktail Bar,French Restaurant,Filipino Restaurant
86,Brooklyn,Downtown,40.690844,-73.983463,4,Coffee Shop,Chinese Restaurant,Grocery Store,Sandwich Place,Bookstore,Big Box Store,Steakhouse,Brewery,Bubble Tea Shop,Cocktail Bar
103,Manhattan,Hamilton Heights,40.823604,-73.949688,4,Coffee Shop,Mexican Restaurant,Caribbean Restaurant,Café,Bakery,School,Cocktail Bar,Bar,Park,Yoga Studio
104,Manhattan,Manhattanville,40.816934,-73.957385,4,Coffee Shop,Italian Restaurant,Mexican Restaurant,Seafood Restaurant,Chinese Restaurant,Park,Café,Bar,Latin American Restaurant,Boutique


In [38]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10.5)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(NY_comp['Latitude'], NY_comp['Longitude'], NY_comp['Neighborhood'], NY_comp['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-2],
        fill=True,
        fill_color=rainbow[cluster-2],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters