# Part 1: Use the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe like the one shown below...

In [1]:
import requests
import lxml.html as lh
import pandas as pd
import html5lib

In [2]:
from pandas.io.html import read_html
page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

wikitables = read_html(page,  attrs={"class":"wikitable"})

print ("Extracted {num} wikitables".format(num=len(wikitables)))

Extracted 1 wikitables


In [3]:
wikitables[0]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Not assigned
8,M8A,Not assigned,Not assigned
9,M9A,Downtown Toronto,Queen's Park


In [4]:
file_name = './my_file.csv'
wikitables[0].to_csv(file_name)

In [5]:
!find . -type f -name "*csv"

./my_file.csv


In [6]:
df = pd.read_csv(file_name)

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighbourhood
0,0,M1A,Not assigned,Not assigned
1,1,M2A,Not assigned,Not assigned
2,2,M3A,North York,Parkwoods
3,3,M4A,North York,Victoria Village
4,4,M5A,Downtown Toronto,Harbourfront


In [8]:
df.drop(df.index[0])
df_no = df.drop(df.index[0])
df_no

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighbourhood
1,1,M2A,Not assigned,Not assigned
2,2,M3A,North York,Parkwoods
3,3,M4A,North York,Victoria Village
4,4,M5A,Downtown Toronto,Harbourfront
5,5,M6A,North York,Lawrence Heights
6,6,M6A,North York,Lawrence Manor
7,7,M7A,Queen's Park,Not assigned
8,8,M8A,Not assigned,Not assigned
9,9,M9A,Downtown Toronto,Queen's Park
10,10,M1B,Scarborough,Rouge


In [9]:
df_no.drop(df_no.index[df_no['Borough'] == 'Not assigned'], inplace = True)

df_no = df_no.reset_index(drop=True)

In [10]:
df_no.head(10)

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Harbourfront
3,5,M6A,North York,Lawrence Heights
4,6,M6A,North York,Lawrence Manor
5,7,M7A,Queen's Park,Not assigned
6,9,M9A,Downtown Toronto,Queen's Park
7,10,M1B,Scarborough,Rouge
8,11,M1B,Scarborough,Malvern
9,13,M3B,North York,Don Mills North


In [11]:
df_no.loc[df_no['Neighbourhood'] == 'Not assigned', 'Neighbourhood'] = df_no['Borough']

In [12]:
df_no.head(10)

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Harbourfront
3,5,M6A,North York,Lawrence Heights
4,6,M6A,North York,Lawrence Manor
5,7,M7A,Queen's Park,Queen's Park
6,9,M9A,Downtown Toronto,Queen's Park
7,10,M1B,Scarborough,Rouge
8,11,M1B,Scarborough,Malvern
9,13,M3B,North York,Don Mills North


In [13]:
df_no.groupby('Postcode').count()

Unnamed: 0_level_0,Unnamed: 0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M1B,2,2,2
M1C,3,3,3
M1E,3,3,3
M1G,1,1,1
M1H,1,1,1
M1J,1,1,1
M1K,3,3,3
M1L,3,3,3
M1M,3,3,3
M1N,2,2,2


In [14]:
df_no = df_no.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(','.join).reset_index()
df_no.columns = ['Postcode','Borough','Neighbourhood']
df_no.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [15]:
df_no.shape

(103, 3)

In [16]:
file_name = './tor.csv'
df_no.to_csv(file_name)

In [17]:
!find . -type f -name "*csv"

./my_file.csv
./tor.csv


## Part 2: Now that you have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.

In [18]:
import numpy as np 
import pandas as pd 

print('Libraries imported.')

Libraries imported.


In [19]:
df_latlon = pd.read_csv('http://cocl.us/Geospatial_data')

In [20]:
df_latlon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
df_tor = pd.read_csv(file_name)

In [22]:
df_tor.head()

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighbourhood
0,0,M1B,Scarborough,"Rouge,Malvern"
1,1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,3,M1G,Scarborough,Woburn
4,4,M1H,Scarborough,Cedarbrae


In [23]:
df_latlon = df_latlon.rename(columns = {'Postal Code' : 'Postcode'})

In [24]:
df_tll = pd.merge(df_tor, df_latlon, on='Postcode')
df_tll.head()

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [25]:
file_name = './tll.csv'
df_tll.to_csv(file_name)

## Part 3: Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. It is up to you.

In [27]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    altair-4.0.0               |             py_0         606 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         704 KB

The following NEW packages will be INSTALLED:

    altair:  4.0.0-py_0 conda-forge
    branca:  0.3.1-py_0 conda-forge
    folium:  0.5.0-py_0 conda-forge
    vincent: 0.4.4-py_1 conda-forge


Down