## Toronto Neighborhood Analysis

In [3]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install folium

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [7]:
from bs4 import BeautifulSoup
import re
from urllib.request import urlopen

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import numpy as np
import json # library to handle JSON files
import requests # library to handle requests

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
print('Libraries imported.')

Libraries imported.


In [8]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = urlopen(url)
html_bytes = page.read()
html = html_bytes.decode("utf-8")
soup = BeautifulSoup(html, 'html.parser')

In [10]:
postal_codes = []
boroughs = []
neighborhoods = []

table = soup.find("table", {"class" : "wikitable sortable"})
rows = table.find_all("tr")
for row in rows:
    cells = row.find_all("td")
    if(len(cells) == 3):        
        postal_codes.append(re.sub("\n", "", cells[0].text))
        boroughs.append(re.sub("\n", "", cells[1].text))
        neighborhoods.append(re.sub("\n", "", cells[2].text))

dict = {"PostalCode" : postal_codes, "Borough" : boroughs, "Neighborhood" : neighborhoods}
toronto_df = pd.DataFrame(dict)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### 2. Transform Data

In [11]:
toronto_clean_df = toronto_df[toronto_df['Borough'] != "Not assigned"]
toronto_clean_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [26]:
toronto_clean_df.shape

(103, 3)

## Explore Data

In [12]:
toronto_clean_df['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Mississauga          1
Name: Borough, dtype: int64

In [10]:
# Same neighborhood sharing one Postal code - 1 postal code to many neighborhoods
# same neighborhood having different postal codes - n postal code to 1 neighborhood
toronto_clean_df['Neighborhood'].value_counts()

Downsview                                                                                                                                 4
Don Mills                                                                                                                                 2
Little Portugal, Trinity                                                                                                                  1
Humberlea, Emery                                                                                                                          1
Business reply mail Processing Centre, South Central Letter Processing Plant Toronto                                                      1
Toronto Dominion Centre, Design Exchange                                                                                                  1
Davisville                                                                                                                                1
Bathurst Manor, Wils

In [13]:
num_boroughs = len(toronto_clean_df['Borough'].unique())
num_neighborhood = len(toronto_clean_df['Neighborhood'].unique())
print("We have {} boroughs and {} neighborhoods".format(num_boroughs, num_neighborhood))

We have 10 boroughs and 99 neighborhoods


### Add Latitude and Longitude values for Toronto boroughs and neighborhoods. This is done using Geocoder from Python Package

In [None]:
# import geocoder

# def get_geocoder_lat_long(code):
#     print(code)
#     # initialize your variable to None
#     lat_lng_coords = None
#     # loop until you get the coordinates
#     while(lat_lng_coords is None):
#         g = geocoder.google('{}, Toronto, Ontario'.format(code))
#         lat_lng_coords = g.latlng
#     latitude = lat_lng_coords[0]
#     longitude = lat_lng_coords[1]
#     return latitude,longitude

# for i in range(0,len(toronto_clean_df)):
#     toronto_clean_df['Latitude'][i],toronto_clean_df['Longitude'][i]=get_geocoder_lat_long(toronto_clean_df.iloc[i]['PostalCode'])
    
# toronto_clean_df.head()

M3A


In [19]:
import geocoder

def get_geocoder_lat_long(code):
    print(code)
    lat_long_coords = None
    while(lat_long_coords is None):
        print("inside while")
        g = geocoder.google('{}, Toronto, Ontario'.format(code))    
        print("found g")
        lat_lng_coords = g.latlng
        print("found coordinates lat : {}, long : {}".format(lat_lng_coords[0], lat_lng_coords[1]))
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    return latitude,longitude

for i in range(0,len(toronto_clean_df)):
    print(i)
    toronto_clean_df['Latitude'][i],toronto_clean_df['Longitude'][i]=get_geocoder_lat_long(toronto_clean_df.iloc[i]['PostalCode'])

0
M3A
inside while


KeyboardInterrupt: 

In [20]:
geo_url = "http://cocl.us/Geospatial_data"
# url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
geo_df = pd.read_csv(geo_url)

### Merge toronto neighborhood data with latitude and longitude data

In [21]:
# toronto_merged = manhattan_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_merged = pd.merge(toronto_clean_df, geo_df, left_on='PostalCode', right_on='Postal Code')
toronto_merged.drop("Postal Code", axis=1, inplace=True)
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [23]:
toronto_merged.columns

Index(['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'], dtype='object')

In [24]:
sorted_df = toronto_merged.sort_values(by='Borough', ascending=True, axis=0)
sorted_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
86,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049
79,M4S,Central Toronto,Davisville,43.704324,-79.38879
83,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
68,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307
74,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678


In [25]:
sorted_df['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Mississauga          1
Name: Borough, dtype: int64