### Introduction   
This notebook scrapes the Wikipedia Page [List of postal codes of Canada: M](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M) for the Toronto postal codes and then using the Foursquare API clusters the neighborhoods based on venues.

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

!conda install -c conda-forge beautifulsoup4 --yes 
from bs4 import BeautifulSoup # crawl wikipedia website

!conda install -c conda-forge geocoder --yes
import geocoder

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.18.1-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  26.90 MB/s
geopy-1.18.1-p 100% |################################| Time: 0:00:00  10.68 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00   9.19 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  36.34 MB/s
vincent-0.4.4- 100% |###################

In [2]:
import requests

Requesting Wikipedia Data and parsing it with beautifulSoup

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url)
html = BeautifulSoup(r.text, 'lxml')

Extracting the table data and adding it to a dataframe

In [4]:
postalcodes_table = html.find('table',{'class':'wikitable sortable'})

columns = ['Postcode', 'Borough', 'Neighbourhood']

postalcodes_arr=[]
for row in postalcodes_table.findAll('tr'):
    row_data = []
    for r in row.findAll('td'):
        row_data.append(r.get_text().replace('\n', ''))
    if len(row_data) != 0:
        postalcodes_arr.append(row_data)

postalcodes_df = pd.DataFrame(postalcodes_arr, columns=columns)

postalcodes_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Processing the DataFrame
* dropping 'Not assigned' values for Borough


In [5]:
# Removes rows with 'Not Assigned' Boroughs
postalcodes_df.drop(postalcodes_df.index[postalcodes_df['Borough'] == 'Not assigned'], inplace = True)

#Reset index
postalcodes_df.reset_index(drop=True, inplace=True)

#Replaces 'Not Assigned' Neighborhood names with Borough name
for index, row in postalcodes_df.iterrows():
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] = row['Borough']
        
#Aggregates the Neighborhoods with a comma in between
postalcodes_df = postalcodes_df.groupby(['Postcode','Borough']).agg(", ".join).reset_index()

In [6]:
postalcodes_df.shape
postalcodes_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
postal_code_coords = []
for postal_code in postalcodes_df['Postcode'].unique:
    # initialize your variable to None
    lat_lng_coords = None
    
    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng
    
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    postal_code_coords = [postal_code, latitude, longitude]
    

NameError: name 'postal_code' is not defined

In [8]:
postalcodes_df['Postcode'].unique

<bound method Series.unique of 0      M1B
1      M1C
2      M1E
3      M1G
4      M1H
5      M1J
6      M1K
7      M1L
8      M1M
9      M1N
10     M1P
11     M1R
12     M1S
13     M1T
14     M1V
15     M1W
16     M1X
17     M2H
18     M2J
19     M2K
20     M2L
21     M2M
22     M2N
23     M2P
24     M2R
25     M3A
26     M3B
27     M3C
28     M3H
29     M3J
30     M3K
31     M3L
32     M3M
33     M3N
34     M4A
35     M4B
36     M4C
37     M4E
38     M4G
39     M4H
40     M4J
41     M4K
42     M4L
43     M4M
44     M4N
45     M4P
46     M4R
47     M4S
48     M4T
49     M4V
50     M4W
51     M4X
52     M4Y
53     M5A
54     M5B
55     M5C
56     M5E
57     M5G
58     M5H
59     M5J
60     M5K
61     M5L
62     M5M
63     M5N
64     M5P
65     M5R
66     M5S
67     M5T
68     M5V
69     M5W
70     M5X
71     M6A
72     M6B
73     M6C
74     M6E
75     M6G
76     M6H
77     M6J
78     M6K
79     M6L
80     M6M
81     M6N
82     M6P
83     M6R
84     M6S
85     M7A
86     M7R
87     M7Y
8