Install all Packages and Libraries!

In [54]:
!pip install lxml
!pip install pandasql
!pip install geopy
import random # library for random number generation
from pandasql import sqldf
import numpy as np 
import pandas as pd 
import folium
import json # library to handle json files
from pandas.io.json import json_normalize
from geopy.geocoders import Nominatim 
import requests
import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 

from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs

print('Libraries imported.')



Libraries imported.


# Now let's load and read the data.

In [55]:
url ='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dataset_list = pd.read_html(url, header=0)

In [56]:
# Get the first table in the list of dataframes. Assign a name to it. 
postal_codes_df = dataset_list[0]

In [57]:
# View the first 10 records of the data frame before cleaning
postal_codes_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [58]:
# Let's check the data frame's dimensions before we clean the data
postal_codes_df.shape

(288, 3)

# Cleaning the Data

Now let's clean up the data set!

I only want rows with an assigned Borough, so I want to filter out all the rows with 'Not assigned' with pandas.

In [59]:
postal_codes_df = postal_codes_df.query("Borough != 'Not assigned'")

In [60]:
postal_codes_df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


There are still some 'Not assigned in the Neighbourhood' column. Let's organise the data by Postcode, Borough and Neightbourhood and fix any leftover 'Not assigned':

In [61]:
postal_codes_df = postal_codes_df.groupby(['Postcode','Borough'], as_index = False, sort = False)
postal_codes_df = postal_codes_df.agg(', '.join)
postal_codes_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


In [62]:
postal_codes_df.query("Neighbourhood == 'Not assigned'")

Unnamed: 0,Postcode,Borough,Neighbourhood
4,M7A,Queen's Park,Not assigned


There's only one row where Neighbourhood is 'Not assigned'. I want to replace that with the Borough name instead:

In [63]:
postal_codes_df.loc[postal_codes_df.Neighbourhood == 'Not assigned','Neighbourhood'] = "Queen's Park"

In [64]:
postal_codes_df.shape

(103, 3)

Now I want to use geocode to get the coordinates for the postal codes:

In [65]:
# To use a dataset from a raw github file
url = 'https://raw.githubusercontent.com/jsphyg/test_data/master/Geospatial_Coordinates.csv'

geocode_df=pd.read_csv(url)
geocode_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Using SQL, I'm going to join my new dataframe of coordinates with my original clean Toronto data set:

In [66]:
query = """
    SELECT 
        p.*, Latitude, Longitude
    FROM postal_codes_df as p
    INNER JOIN geocode_df as g ON p.PostCode = g.'Postal Code'
"""
postal_codes_df = sqldf(query)
postal_codes_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [67]:
postal_codes_df.shape

(103, 5)

Let's visualise the data using folium maps:

In [68]:
postal_map = folium.Map(location=[43.753259, -79.329656], zoom_start = 11)

for i in range(0,len(postal_codes_df)):
    folium.Marker([postal_codes_df.iloc[i]['Latitude'], postal_codes_df.iloc[i]['Longitude']]).add_to(postal_map)
    
postal_map

I want to see all the Boroughs where 'Toronto' is in the name:

In [69]:
query = """
    SELECT 
        *
    FROM postal_codes_df 
    WHERE Borough like '%Toronto%'
"""
Toronto_df = sqldf(query)
Toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


Let's compare that to all the Boroughs that don't include the name 'Toronto':

In [70]:
query = """
    SELECT 
        *
    FROM postal_codes_df 
    WHERE Borough not like '%Toronto%'
"""
not_Toronto_df = sqldf(query)
not_Toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
3,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
4,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242


Using my Foursquare credentials, I'm going to creat a new dataframe with just Toronto-named Boroughs and compare 
the area information:

In [71]:
CLIENT_ID = 'M3IDXSOOZWUIIRWRCND3AXKSZYRJWJJEMRKEBTKMOIVVBXA2' # your Foursquare ID
CLIENT_SECRET = 'G0FJEHYLIDPREQ2QOIM2MANQC2KGR25PTNPCM3TGVZVNG4NK' # your Foursquare Secret
VERSION = '20180604'

In [72]:
Toronto_subset = Toronto_df.drop_duplicates(['Borough'], keep = 'last')
Toronto_subset

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
27,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.48445
30,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049
36,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
37,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558


In [73]:
neighborhood_latitude = Toronto_subset['Latitude'].iloc[0] 
neighborhood_longitude = Toronto_subset['Longitude'].iloc[0] 
LIMIT = 100  # limit seems to be 100 rows anyway
radius = 1000 # this is in meters. 5 miles equals 8046.72 meters

neighborhood_latitude

43.6515706

In [74]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

results = requests.get(url).json()

url

'https://api.foursquare.com/v2/venues/explore?&client_id=M3IDXSOOZWUIIRWRCND3AXKSZYRJWJJEMRKEBTKMOIVVBXA2&client_secret=G0FJEHYLIDPREQ2QOIM2MANQC2KGR25PTNPCM3TGVZVNG4NK&v=20180604&ll=43.6515706,-79.4844499&radius=1000&limit=100'

In [75]:
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)
nearby_venues.head()

Unnamed: 0,referralId,reasons.count,reasons.items,venue.id,venue.name,venue.location.address,venue.location.crossStreet,venue.location.lat,venue.location.lng,venue.location.labeledLatLngs,...,venue.location.cc,venue.location.city,venue.location.state,venue.location.country,venue.location.formattedAddress,venue.categories,venue.photos.count,venue.photos.groups,venue.venuePage.id,venue.location.neighborhood
0,e-0-4e5ffdf962e13e3bcd932a0a-0,0,"[{'summary': 'This spot is popular', 'type': '...",4e5ffdf962e13e3bcd932a0a,The Good Fork,2432 Bloor St. W,Jane,43.649565,-79.484023,"[{'label': 'display', 'lat': 43.64956534036813...",...,CA,Toronto,ON,Canada,"[2432 Bloor St. W (Jane), Toronto ON M6S 1P9, ...","[{'id': '4d4b7105d754a06374d81259', 'name': 'F...",0,[],32831185.0,
1,e-0-4b15aef7f964a52074b223e3-1,0,"[{'summary': 'This spot is popular', 'type': '...",4b15aef7f964a52074b223e3,Asa Sushi,18 Jane St.,Bloor,43.649902,-79.484611,"[{'label': 'display', 'lat': 43.64990164450559...",...,CA,Toronto,ON,Canada,"[18 Jane St. (Bloor), Toronto ON, Canada]","[{'id': '4bf58dd8d48988d1d2941735', 'name': 'S...",0,[],,
2,e-0-51f2a052498e3c69bf38ee85-2,0,"[{'summary': 'This spot is popular', 'type': '...",51f2a052498e3c69bf38ee85,The One That Got Away,2392 Bloor St W,,43.649842,-79.482615,"[{'label': 'display', 'lat': 43.64984188412754...",...,CA,Toronto,ON,Canada,"[2392 Bloor St W, Toronto ON, Canada]","[{'id': '4edd64a0c7ddd24ca188df1a', 'name': 'F...",0,[],,
3,e-0-4b52112bf964a520416527e3-3,0,"[{'summary': 'This spot is popular', 'type': '...",4b52112bf964a520416527e3,DAVIDsTEA,2285 Bloor St W,,43.650584,-79.478853,"[{'label': 'display', 'lat': 43.65058422430476...",...,CA,Toronto,ON,Canada,"[2285 Bloor St W, Toronto ON M6S 1P1, Canada]","[{'id': '4bf58dd8d48988d1dc931735', 'name': 'T...",0,[],,
4,e-0-5434a336498e25b50077e07d-4,0,"[{'summary': 'This spot is popular', 'type': '...",5434a336498e25b50077e07d,Campo,244 Jane St.,Jane And Bloor,43.655191,-79.487067,"[{'label': 'display', 'lat': 43.655191, 'lng':...",...,CA,Toronto,ON,Canada,"[244 Jane St. (Jane And Bloor), Toronto ON M6S...","[{'id': '4bf58dd8d48988d110941735', 'name': 'I...",0,[],,Humberside


Now we can see the venues in the area. Let's investigate categories of venues:

In [76]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,The Good Fork,Food,43.649565,-79.484023
1,Asa Sushi,Sushi Restaurant,43.649902,-79.484611
2,The One That Got Away,Fish & Chips Shop,43.649842,-79.482615
3,DAVIDsTEA,Tea Room,43.650584,-79.478853
4,Campo,Italian Restaurant,43.655191,-79.487067


In [77]:
postal_map = folium.Map(location=[43.753259, -79.329656], zoom_start = 11)

for lat, lng, label in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.3,
        parse_html=False).add_to(postal_map)     
    
for lat, lng, label in zip(not_Toronto_df['Latitude'], not_Toronto_df['Longitude'], not_Toronto_df['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.3,
        parse_html=False).add_to(postal_map)     

postal_map

The property management team has been successful with building properties in Manhattan, New York. Let's take a look at Manhattan before we explore more of the Toronto neighbourhoods:

In [78]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')



Data downloaded!


In [79]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)


Let's look at the data:

In [85]:
newyork_data
neighbourhood_data=newyork_data['features']

In [86]:
neighbourhood_data[0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

Let's transform the data into a dataframe with the columns 'Borough','Neighbourhood','Latitude' and 'Longitude'

In [87]:
#define column names
column_names=['Borough', 'Neighbourhood', 'Latitude', 'Longitude']

In [88]:
#create empty dataframe
neighbourhoods=pd.DataFrame(columns=column_names)

In [89]:
neighbourhoods

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude


In [90]:
#Loop the data and fill the dataframe row by row
for data in neighbourhood_data:
    borough = neighbourhood_name = data['properties']['borough'] 
    neighbourhood_name = data['properties']['name']
        
    neighbourhood_latlon = data['geometry']['coordinates']
    neighbourhood_lat = neighbourhood_latlon[1]
    neighbourhood_lon = neighbourhood_latlon[0]
    
    neighbourhoods = neighbourhoods.append({'Borough': borough,
                                          'Neighbourhood': neighbourhood_name,
                                          'Latitude': neighbourhood_lat,
                                          'Longitude': neighbourhood_lon}, ignore_index=True)


Let's view the dataframe!

In [91]:
neighbourhoods.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [92]:
print('The dataframe has {} boroughs and {} neighbourhoods.'.format(
        len(neighbourhoods['Borough'].unique()),
        neighbourhoods.shape[0]
    )
)


The dataframe has 5 boroughs and 306 neighbourhoods.


In order to define an instance of geocoder, we need to define a user_agent. Let's name our's ny_explorer:

In [93]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of New York City are 40.7127281, -74.0060152.


Create a map of New York City with the neighbourhoods superimposed on top:

In [94]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(neighbourhoods['Latitude'], neighbourhoods['Longitude'], neighbourhoods['Borough'], neighbourhoods['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork


Pretty cool! You can zoom in to the above map and click on each circle mark to reveal the name of the neighborhood and its respective borough.

Let's simplify the above map and segment and cluster only the neighborhoods in Manhattan. First, let's slice the original dataframe and create a new dataframe of the Manhattan data.


In [95]:
manhattan_data = neighbourhoods[neighbourhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
manhattan_data.head()


Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


In [96]:
address = 'Manhattan, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Manhattan are 40.7900869, -73.9598295.


In [98]:
# create map of Manhattan using latitude and longitude values
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_manhattan)  
    
map_manhattan


Now that we've got our maps and clusters started, let initialise Foursquare API to explore the neighbourhoods!

In [99]:
# define Foursquar credentials
CLIENT_ID = 'M3IDXSOOZWUIIRWRCND3AXKSZYRJWJJEMRKEBTKMOIVVBXA2'
CLIENT_SECRET = 'G0FJEHYLIDPREQ2QOIM2MANQC2KGR25PTNPCM3TGVZVNG4N'
VERSION = '20180604'

In [100]:
# view the neighbourhood name
manhattan_data.loc[0, 'Neighbourhood']
# get the neighbourhood's latitude and longitude
neighbourhood_latitude = manhattan_data.loc[0, 'Latitude'] # neighbourhood latitude value
neighbourhood_longitude = manhattan_data.loc[0, 'Longitude'] # neighbourhood longitude value

neighbourhood_name = manhattan_data.loc[0, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))


Latitude and longitude values of Marble Hill are 40.87655077879964, -73.91065965862981.


Now I want to look specifically at Marble Hill, within a radius of 500 meters. I first need to create a GET request and URL, which I will name 'url':

In [103]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name'],
            v['venue']['id']) for v in results])

    nearby_venues = pd.DataFrame([item for venues_list in venues_list for item in venues_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category',
                  'Venue ID'          
                            ]
    
    return(nearby_venues)

In [104]:
mask = neighbourhoods['Borough'] == 'Marble Hill' #Make a boolean mask for Manhattan Borough
df = neighbourhoods.copy() #Make a copy to leave our original DF intact
df = df[mask] #Now use the mask to eliminate entries not in Manhattan
df.reset_index(inplace = True) #Now reset the index so we can use it
df.drop('index', axis = 1, inplace = True)
df.head() #And take a look at our work so far

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude


In [106]:
radius = 500
LIMIT = 20

marblehill_venues = getNearbyVenues(names=df['Neighbourhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

ValueError: Length mismatch: Expected axis has 0 elements, new values have 8 elements