Capstone Project – The Battle of Neighborhoods | Finding a Better Place in Etobicoke, Toronto

In [1]:
#import io,request pandas and folium libraries
import io
import json
import os
import xml

import folium
import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim 
from bs4 import BeautifulSoup

# Remove pandas' column/row display limits so frames are rendered without truncation.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

# Confirmation banner shown once the import/config cell has finished.
print("All Required Libraries Imported!")


All Required Libraries Imported!


In [2]:
# Scrape the Wikipedia page listing Canadian postal codes that start with "M".
# pd.read_html returns a list with one DataFrame per HTML table found on the page.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df_list = pd.read_html(url)

In [3]:
# Number of tables that read_html extracted from the page.
len(df_list)

3

In [4]:
# Reuse the tables already downloaded into df_list instead of fetching and parsing
# the Wikipedia page a second time; the first table is the postal-code table.
df = df_list[0]

In [5]:
# Preview only the first rows: display.max_rows is unlimited in this notebook,
# so a bare `df` would render every row of the scraped table.
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [6]:
# Work on an explicit, independent copy so the cleaning steps applied to df1
# can never alias or modify the scraped table held in df.
df1 = df.copy()

In [7]:
# Preview only the first rows: display.max_rows is unlimited in this notebook,
# so a bare `df1` would render the whole frame.
df1.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [8]:
# Percentage of missing values per column (0.0 means the column is complete).
df1.isnull().mean().mul(100)

Postal Code      0.0
Borough          0.0
Neighbourhood    0.0
dtype: float64

In [9]:
# Row labels of the postal codes whose borough is recorded as 'Not assigned'.
indexNames = df1.loc[df1['Borough'].eq('Not assigned')].index

In [10]:
# Keep only the rows with an assigned borough. Filtering by value (instead of
# dropping the labels in indexNames in place) makes the cell idempotent: re-running
# it no longer raises a KeyError for labels that were already removed.
df1 = df1[df1['Borough'] != 'Not assigned'].copy()

In [11]:
# Preview the cleaned frame; display.max_rows is unlimited in this notebook,
# so a bare `df1` would render every remaining row.
df1.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [12]:
# (rows, columns) of df1 after the 'Not assigned' boroughs were removed.
df1.shape

(103, 3)

Checkpoint 2

In [13]:
# Load the latitude/longitude of each postal code from the local CSV file.
long_lat = pd.read_csv("Geospatial_Coordinates.csv")
# Preview only the first rows; display.max_rows is unlimited in this notebook,
# so a bare `long_lat` would render the whole table.
long_lat.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [14]:
# Attach Latitude/Longitude to each postal code: index the coordinate table by
# 'Postal Code' and look every df1 row up against it (a left join on df1).
df_final = df1.join(
    other=long_lat.set_index('Postal Code'),
    on='Postal Code',
    how='left',
)

In [15]:
# (rows, columns) of the frame produced by joining df1 with the coordinates.
df_final.shape

(103, 5)

In [16]:
# Percentage of missing values per column; a non-zero Latitude/Longitude would
# mean some postal code found no match in the coordinate table.
df_final.isnull().mean().mul(100)

Postal Code      0.0
Borough          0.0
Neighbourhood    0.0
Latitude         0.0
Longitude        0.0
dtype: float64

In [17]:
# Final dataset for further operations. Preview only the first rows:
# display.max_rows is unlimited in this notebook, so a bare `df_final`
# would render every row.
df_final.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
9,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
11,M3B,North York,Don Mills,43.745906,-79.352188
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


Checkpoint 3 – Clustering the neighborhoods in Toronto

In [18]:
# Report the number of distinct boroughs and the number of rows in df_final.
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
    df_final['Borough'].nunique(dropna=False),
    len(df_final),
))

The dataframe has 10 boroughs and 103 neighborhoods.


In [25]:

# Resolve the borough name to coordinates with the Nominatim geocoder.
address = 'Etobicoke, canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message instead
# of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Etobicoke City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Etobicoke City are 43.6435559, -79.5656326.


In [26]:
# Base map centred on the latitude/longitude geocoded for Etobicoke.
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of df_final, each with a "neighbourhood, borough" popup.
for lat, lng, borough, neighborhood in df_final[
        ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_newyork)

# Render the finished map as the cell output.
map_newyork

In [27]:
# Foursquare API credentials are read from the environment so that no secret is
# stored in the notebook source or echoed into its saved output.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # my Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # my Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    # Fail here with an actionable message rather than later with an API error.
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment '
        'variables before running this cell.')
VERSION = '20180604'  # sent as the `v` query parameter of the Foursquare request
LIMIT = 30  # sent as the `limit` query parameter; reassigned in the request cell
print('Your credentails:')
# Echo only masked values so the credentials never end up in the notebook output.
print('CLIENT_ID: ' + '*' * len(CLIENT_ID))
print('CLIENT_SECRET: ' + '*' * len(CLIENT_SECRET))

Your credentails:
CLIENT_ID: [REDACTED]
CLIENT_SECRET: [REDACTED]


In [28]:
# Query Foursquare's venues/explore endpoint around the geocoded coordinates.
radius = 700  # sent as the `radius` query parameter
LIMIT = 100  # sent as the `limit` query parameter
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    latitude,
    longitude,
    radius,
    LIMIT)
# Bound the wait so an unreachable host fails fast instead of hanging the cell, and
# surface HTTP error statuses before trying to decode the body as JSON.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

ConnectionError: HTTPSConnectionPool(host='api.foursquare.com', port=443): Max retries exceeded with url: /v2/venues/explore?&client_id=[REDACTED]&client_secret=[REDACTED]&v=20180604&ll=43.6435559,-79.5656326&radius=700&limit=100 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001CFE2DAF0C8>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))