### Import necessary modules

In [1]:
import requests
from bs4 import BeautifulSoup as bs

import json # library to handle JSON files

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import geocoder

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium # map rendering library

import warnings
warnings.filterwarnings("ignore")

### use the requests module to get the html content at the wiki page and store the results in the variable 'html'

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html = requests.get(url)
html.text[:100]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title'

### parse the html using BeautifulSoup 

In [3]:
soup = bs(html.content,'html.parser')
soup.text[:100]

'\n\n\n\nList of postal codes of Canada: M - Wikipedia\ndocument.documentElement.className = document.docu'

### Extract the table information from the Beautiful soup object

In [4]:
table = soup.find('table')
table.text[:100]

'\n\nPostcode\nBorough\nNeighbourhood\n\n\nM1A\nNot assigned\nNot assigned\n\n\nM2A\nNot assigned\nNot assigned\n\n\nM'

### create a dataframe from the table variable

In [5]:
df = pd.read_html(str(table))[0]
df.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### grab the first row of the dataframe in order to set them as the columns

In [6]:
cols = df.loc[0]
cols

0         Postcode
1          Borough
2    Neighbourhood
Name: 0, dtype: object

### update the dataframe by setting the correct values for the column names

In [7]:
df2 = df.loc[1:]
df2.columns = cols
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


### Only grab those rows that have a Borough assigned

In [8]:
df3 = df2[df2['Borough'] != 'Not assigned']
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


### Check to see how many Neighbourhood entries with a value of 'Not assigned' can be assigned the corresponding 'Borough' value.  There is only one: Queen's Park.

In [9]:
df3['Borough'][df3.Neighbourhood == 'Not assigned'].value_counts()

Queen's Park    1
Name: Borough, dtype: int64

### Check the remaining unassigned Neighbourhood

In [10]:
df3[df3['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood
9,M7A,Queen's Park,Not assigned


### Set the Neighbourhood "Not Assigned" value to corresponding "Borough" value

In [11]:
mask = df3['Neighbourhood'] == 'Not assigned'
df3.loc[mask,'Neighbourhood'] = df3.loc[mask,'Borough']

### Confirm that the change took effect

In [12]:
df3[df3['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood


### Group the Neighbourhoods by Postcode and Borough
- apply the "join" method, using ", " to the Neighbourhood column in each group to create a single string of comma separated values of all the neighbourhoods in that the group.

In [13]:
df_groups = df3.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x: ", ".join(x)).to_frame().reset_index()
df_groups

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Print the shape of the Dataframe to show the number of rows and columns

In [14]:
df_groups.shape

(103, 3)

### Create a new dataframe that is a copy of the DF from part 1 to perform the operations required for part 2

In [15]:
df4 = df_groups.copy()

### Get the latitude and longitude
- attempt to use the geocoder module

In [16]:
import geocoder

#### use the first postal code to test

In [17]:
postal_code = df4.loc[0,'Postcode']
postal_code

'M1B'

#### get the lat and long for the M1B postal code

In [18]:
g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
lat_lng_coords = g.latlng

#### assign the lat and long values to their repsective variables
- Note: the geocoder module is not working on this single example

In [19]:
latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

TypeError: 'NoneType' object is not subscriptable

### Try 10 attempts to get the geocoder module to work
- note: the geocoder module fails after 10 attempts

In [20]:
geocoder.google('{}, Toronto, Ontario'.format(postal_code))

<[OVER_QUERY_LIMIT] Google - Geocode [empty]>

In [21]:
postal_code = 'M1V'
postal_code

'M1V'

In [22]:
# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
counter = 1
while(counter < 11):
    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng
    print(counter)
    counter+=1

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

print(counter)
print('Latitude {}'.format(latitude))
print('Longitude {}'.format(longitude))

1
2
3
4
5
6
7
8
9
10


TypeError: 'NoneType' object is not subscriptable

### Conclusion: the geocoder module is too unreliable, therefore use the supplied CSV of latitude and longitude values for each Postal Code

### Load the latitude and longitude data stored in the supplied csv

In [23]:
df_coordinates = pd.read_csv('../notebooks/data/Geospatial_Coordinates.csv')
df_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### rename the CSV file column name "Postal Code" to match the "Postcode" column name from the wikipage

In [24]:
df_coordinates.rename(columns={'Postal Code':df4.columns[0]},inplace=True)
df_coordinates.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the latitude and longitude dataframes on the "Postcode" column

In [25]:
df5 = df4.merge(df_coordinates,on='Postcode')
df5

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


### Check for Null values to see if merge did not work anywhere

In [26]:
df5.isnull().sum()

Postcode         0
Borough          0
Neighbourhood    0
Latitude         0
Longitude        0
dtype: int64

### Explore and cluster the neighborhoods in Toronto
- start by finding only those 

In [27]:
toronto_idx = df5.Borough.str.contains("Toronto")
toronto_df = df5.loc[toronto_idx,:]
toronto_df.Borough.value_counts()

Downtown Toronto    18
Central Toronto      9
West Toronto         6
East Toronto         5
Name: Borough, dtype: int64

In [28]:
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [None]:
# create column indicating these are neighborhoods for the all Toronto boroughs
# move this column to be placed after the Postcode Column
toronto_df['Greater_City_Area'] = 'Toronto'
toronto_df.head()

In [None]:
# create column of list of neighborhoods for each postcode
toronto_df['Neighbourhood_List'] = toronto_df['Neighbourhood'].str.split(",")
toronto_df.head()

In [None]:
# create a temp_df to that contains a row entry for each of the neighbourhoods in each postcode
# this will be merged with the 
temp_df = toronto_df['Neighbourhood_List'].apply(lambda x: pd.Series(x)).stack().to_frame('Unique_Neighbourhoods')
temp_df.head()

In [None]:
# set the indices to match that of the toronto_df
temp_df.reset_index(level=1,drop=True,inplace=True)
temp_df.head()

In [None]:
print('The dataframe has {} boroughs and {} neighborhoods in the greater Toronto area.'.format(
        len(toronto_df['Borough'].unique()),
        toronto_df.shape[0]
    )
)

In [None]:
# join the toronto_df and the temp_df on the index values to get a unique row for each neighbourhood
toronto_df = toronto_df.join(temp_df)

In [None]:
# remove the list of Neighbourhood and Neighbourhood_List columns
drop_cols = ['Neighbourhood','Neighbourhood_List']
toronto_df.drop(drop_cols,axis=1,inplace=True)
toronto_df.head()

In [None]:
# clean up and organize the dataframe
# rename latitude and longitude to post_lat and post_long to distinguish from neighbourhood 
# lat and long which will be called neighbourhood_lat and neighbourhood_long
toronto_df.head()

In [None]:
# get the lat and long for each neighbourhood in "Unique_Neighbourhoods
neighbourhoods = toronto_df.Unique_Neighbourhoods.str.strip().values.tolist()
len(neighbourhoods)

In [None]:
neighbourhoods.count("The Beaches")

In [None]:
address = 'The Beaches, Toronto, ON'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of {} are {}, {}.'.format(address,latitude, longitude))

In [None]:
def get_neighbourhood_coordinates(row):
    address = row['Unique_Neighbourhoods']
    geolocator = Nominatim()
    flag = True
    latitude = -1
    longitude = -1
    counter = 1
    while flag:
        try:
            location = geolocator.geocode(address)
            latitude = location.latitude
            longitude = location.longitude
            flag = False
            print('The geographical coordinates of {} are {}, {}.'.format(address,latitude, longitude))
            print('It took attempts {} to get these coordianates.'.format(counter))
            return (latitude,longitude)
            
        except:
            print(counter)
            counter +=1
            

In [None]:
toronto_df['coordinates'] = toronto_df.apply(get_neighbourhood_coordinates,axis=1)

In [None]:
# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
counter = 1
while(counter < 11):
    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng
    print(counter)
    counter+=1

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

print(counter)
print('Latitude {}'.format(latitude))
print('Longitude {}'.format(longitude))

In [None]:
# get the lat and long for Toronto
address = 'Toronto, ON'

#geolocator = Nominatim()
#location = geolocator.geocode(address)
latitude = 49.7849789
longitude = -101.6689683
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

In [None]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)
map_newyork

In [None]:
latitude = 43.6757898
longitude = -79.4025603

In [None]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)
map_newyork

In [None]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork