# Part 1:

# Import packages

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Get Wikipedia Page Source

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M"
# and parse it. The timeout keeps the cell from hanging indefinitely, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# parsing an error page as if it were the real article.
wiki_response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
wiki_response.raise_for_status()
html_doc = wiki_response.text
soup = BeautifulSoup(html_doc, 'html.parser')

# Scrape HTML to Get Postal Code, Borough, and Neighborhood Lists

In [84]:
# Collect the text of the first three cells of every data row of the first
# table on the page into three parallel lists.
postalCodeL = []
boroughL = []
neighborhoodL = []

postal_table = soup.find('table')
if postal_table is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise ValueError('No <table> element found in the downloaded page')

for line in postal_table.find_all('tr'):
    cells = line.find_all('td')
    # Rows without <td> cells are skipped. Requiring all three cells (instead
    # of "at least one") guarantees the indexing below cannot raise IndexError
    # on a short row.
    if len(cells) >= 3:
        # Strip surrounding whitespace from every field so later exact
        # comparisons (e.g. against 'Not assigned') and postal-code matching
        # are not defeated by a stray newline or space.
        postalCodeL.append(cells[0].text.strip())
        boroughL.append(cells[1].text.strip())
        neighborhoodL.append(cells[2].text.replace("\n", "").strip())

# Add PostalCode, Borough, and Neighborhood Lists into a Pandas DataFrame & Remove Rows with a Not Assigned Borough

In [110]:
# Assemble the three scraped lists into one frame, one column per list.
# `columns=` pins the column order explicitly.
toronto_df = pd.DataFrame(
    {
        'PostalCode': postalCodeL,
        'Borough': boroughL,
        'Neighborhood': neighborhoodL,
    },
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

# Keep only the rows whose borough is something other than 'Not assigned'.
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned']
toronto_df.head(n=12)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


# Remove Duplicate PostalCode By Grouping Neighborhoods by PostalCode & Borough

In [112]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood is the comma-joined text of the group's neighborhoods.
by_code_and_borough = toronto_df.groupby(['PostalCode', 'Borough'], as_index=False)
toronto_df = by_code_and_borough.agg({'Neighborhood': ','.join})
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Replace 'Not assigned' Neighborhoods with the Borough Name

In [113]:
# Boolean mask selecting the rows whose neighborhood is the placeholder text.
na_neigh_rows = toronto_df['Neighborhood'].eq('Not assigned')
# For exactly those rows, use the borough name as the neighborhood; every
# other row keeps its existing value.
toronto_df['Neighborhood'] = toronto_df['Neighborhood'].mask(na_neigh_rows, toronto_df['Borough'])
toronto_df.loc[na_neigh_rows]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


# Clean up and Show Shape of Data Frame

In [117]:
# Take an independent snapshot of the cleaned frame. A plain assignment would
# only create a second name for the same object, so any later in-place change
# to `toronto_df` would silently change `toronto_df_cleaned` as well.
toronto_df_cleaned = toronto_df.copy()
print(toronto_df_cleaned.shape)
toronto_df_cleaned.head()

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Part 2:

# Load .csv file with latitude and longitude coordinates for Toronto Neighborhoods

In [118]:
# Download the postal-code coordinate table and save it as
# 'Geospatial_Coordinates.csv' in the working directory.
# Using requests (already imported at the top of the notebook) instead of a
# quiet shell `wget` means the cell works where wget is not installed, and
# raise_for_status() stops the notebook on a failed download instead of
# printing 'Downloaded!' regardless of the outcome.
geo_response = requests.get('http://cocl.us/Geospatial_data', timeout=30)
geo_response.raise_for_status()
with open('Geospatial_Coordinates.csv', 'wb') as csv_file:
    csv_file.write(geo_response.content)
print('Downloaded!')

Downloaded!


In [119]:
# Load the downloaded coordinate table into a DataFrame.
coor = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')

In [120]:
# Preview the first five rows of the coordinate table.
coor.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Set Indexes of the two data frames to Postal Code Columns and then merge them

In [125]:
# Index both frames by their postal-code column so they can be aligned on it.
toronto_df_2, coor_2 = (
    toronto_df_cleaned.set_index('PostalCode'),
    coor.set_index('Postal Code'),
)
# Place the two frames side by side, keeping only postal codes present in both.
toronto_df_coor = pd.concat(
    objs=[toronto_df_2, coor_2],
    axis='columns',
    join='inner',
)

# Reset Index 

In [123]:
# Move the postal codes out of the index into a regular 'PostalCode' column.
# The guard makes this cell safe to re-run: once the column exists, naming the
# index 'PostalCode' and resetting again would raise because the column cannot
# be inserted twice. Building a new frame (rather than inplace=True) also keeps
# the step free of hidden in-place mutation.
if 'PostalCode' not in toronto_df_coor.columns:
    toronto_df_coor = toronto_df_coor.rename_axis('PostalCode').reset_index()

# Print Shape and first 12 rows of merged dataframes

In [126]:
# Report the (rows, columns) size of the merged frame, then show its first
# twelve rows.
merged_shape = toronto_df_coor.shape
print(merged_shape)
toronto_df_coor.head(n=12)

(103, 4)


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476
M1J,Scarborough,Scarborough Village,43.744734,-79.239476
M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


# Part 3:

In [128]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.19.0-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  24.91 MB/s
geopy-1.19.0-p 100% |################################| Time: 0:00:00  35.84 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  54.72 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  29.08 MB/s
vincent-0.4.4- 100% |###################

# Get the Coordinates for Toronto

In [133]:
# Look up the latitude/longitude of Toronto with the Nominatim geocoder.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message here instead of an AttributeError on `.latitude` below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


# Create Map of Toronto Using Latitude and Longitude Values and Adding All Neighborhood Markers to the Map

In [141]:
# Base map centred on the Toronto coordinates obtained above.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Draw one small circle marker per row of the merged frame; its popup reads
# "<neighborhood>, <borough>".
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for row in toronto_df_coor[marker_columns].itertuples(index=False):
    popup_text = '{}, {}'.format(row.Neighborhood, row.Borough)
    marker = folium.CircleMarker(
        [row.Latitude, row.Longitude],
        radius=3,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto