# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

I will use the BeautifulSoup library to parse the table of postal codes from Wikipedia.

In [210]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import folium
from geopy.geocoders import Nominatim

In [149]:
# Wikipedia page "List of postal codes of Canada: M"; it is downloaded and parsed below.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [150]:
# Download the page with requests, then create a BeautifulSoup object for further operations.
# The timeout stops the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() turns an HTTP error (404, 5xx, ...) into an immediate exception
# instead of letting an error page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
page_content = response.text
soup = BeautifulSoup(page_content, 'html.parser')

In [124]:
# Locate the postal-code table. The CSS selector matches on the individual classes,
# so the lookup keeps working if the <table> carries extra classes besides these two.
table = soup.select_one('table.wikitable.sortable')
if table is None:
    # Fail here with a clear message rather than with an AttributeError in the parsing cell.
    raise ValueError('No table with classes "wikitable sortable" found on the page')

In [165]:
# "tr" tag indicates a row. The first row that contains <th> cells supplies the column
# names; every following row is appended to `data` as one {column name: cell text} dict.
rows = table.find_all('tr')
headers = []
data = []
parsed_headers = False
for row in rows:
    header_cells = row.find_all('th')
    if not parsed_headers and header_cells:
        # get_text().strip() removes the surrounding whitespace/newline and also works
        # when the cell wraps its text in child tags (where .string would be None).
        headers = [cell.get_text().strip() for cell in header_cells]
        parsed_headers = True  # a later row with <th> cells must not extend the headers
        continue

    cells = row.find_all('td')  # looked up once per row instead of once per column
    if not parsed_headers or len(cells) < len(headers):
        # Not a complete data row (no header seen yet, or too few cells): skip it
        # instead of recording an empty dict or failing with an IndexError.
        continue
    data.append({name: cell.get_text().strip() for name, cell in zip(headers, cells)})
print(f'{len(data)} rows of data')

180 rows of data


In [167]:
# Create a DataFrame from the parsed records (one dict per table row) and preview
# the first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [169]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]
df

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [176]:
# Sanity checks on the cleaned table: postal codes must be unique and no
# neighborhood may still be marked "Not assigned".
if df['Postal Code'].is_unique:
    print('OK --- No duplicate postal codes')
else:
    print('Not all postal codes are unique')

# Count the offending rows once and reuse the number in the message.
unassigned_count = df['Neighborhood'].eq('Not assigned').sum()
if unassigned_count == 0:
    print('OK --- No "Not assigned" Neighborhoods')
else:
    print(f'{unassigned_count} not assigned neighborhoods')

OK --- No duplicate postal codes
OK --- No "Not assigned" Neighborhoods


## Getting coordinates of postal codes

In [188]:
# Read the coordinates CSV behind this short link (fetched over the network on every
# run). The preview below shows Postal Code, Latitude and Longitude columns.
geocodes = pd.read_csv('http://cocl.us/Geospatial_data')

In [189]:
# Display the downloaded coordinates table.
geocodes

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [193]:
# Merge the coordinates into the neighborhood table and report the row count
# before and after, so any rows lost by the join are visible.
print(f'Initial shape {df.shape[0]}')
df = (
    # Dropping coordinate columns left by a previous run keeps the cell re-runnable
    # (otherwise a second run would produce Latitude_x / Latitude_y columns).
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore')
      # Explicit key and join type: the result no longer depends on which column names
      # the two frames happen to share. validate= raises if a postal code is duplicated
      # on either side, which would silently multiply rows.
      .merge(geocodes, on='Postal Code', how='inner', validate='one_to_one')
)
print(f'After merge shape {df.shape[0]}')

Initial shape 103
After merge shape 103


In [198]:
# Count the missing values in each column of the merged table.
df.isna().sum()

Postal Code     0
Borough         0
Neighborhood    0
Latitude        0
Longitude       0
dtype: int64

In [200]:
# Display the merged table (postal code, borough, neighborhood, coordinates).
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# Exploration

In [217]:
# Select region for analysis: keep every row whose Borough name contains `region`.
region = 'Toronto'
# regex=False matches `region` literally, so a name containing regex metacharacters
# cannot change the meaning of the filter. na=False treats a missing Borough as
# "no match" instead of breaking the boolean mask. .copy() makes `data` an independent
# frame, so later modifications cannot trigger SettingWithCopyWarning.
data = df[df['Borough'].str.contains(region, regex=False, na=False)].copy()
data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [218]:
address = 'Toronto'

# Look up the coordinates of `address`; they are used as the centre of the map below.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; fail with a clear
    # message instead of an AttributeError on the attribute access below.
    raise ValueError(f'Could not geocode address {address!r}')
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [219]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per selected row, with a "neighborhood, borough" popup
for lat, lng, borough, neighborhood in zip(data['Latitude'], data['Longitude'], data['Borough'], data['Neighborhood']):
    popup_text = '{}, {}'.format(neighborhood, borough)
    # parse_html is an argument of Popup, so it is passed only here; the copy that
    # used to be passed to CircleMarker was not a marker option and has been removed.
    popup = folium.Popup(popup_text, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# the map object is the cell's last expression so the notebook renders it
map_toronto