### 1. This is the code for the Capstone Project - Coursera (IBM ML With Python)

In [5]:
# start by importing the relevant libraries
import pandas as pd
from  bs4 import BeautifulSoup as bs
import json
import requests
from pandas.io.json import json_normalize


### 2. Set the url and read the data from the website

In [6]:
# Source page holding the table of Toronto ("M") postal codes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch through the `url` variable so the address is defined in exactly one place.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of letting an error
# page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

# Parse the HTML so the table rows can be walked in the next section.
data_list = bs(data, 'html.parser')

### 3. Set up the dataframe

In [7]:
# Column headers of the resulting DataFrame, in table order.
postcode_columns = ['Postcode', 'Borough', 'Neighbourhood']

# Collect one dict per usable table row and build the DataFrame once at the end.
# (DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.)
postcode_records = []
for row in data_list.find('table').find_all('tr'):
    cells = row.find_all('td')

    # Rows without <td> cells (e.g. the header row) carry no data; rows with
    # fewer than three cells cannot supply all three columns, so skip them too.
    if len(cells) < 3:
        continue

    # Ignore all rows whose Borough is "Not assigned".
    if 'Not assigned' in cells[1].text:
        continue

    postcode_records.append({
        'Postcode': cells[0].text,
        'Borough': cells[1].text,
        'Neighbourhood': cells[2].text,
    })

df_list = pd.DataFrame(postcode_records, columns=postcode_columns)


### 4. Remove the newline (`\n`) characters from the data elements

In [8]:
# The scraped cell text carries newline characters. Turn every newline into a
# space (so embedded line breaks do not glue words together), then trim leading
# and trailing whitespace so values do not keep a stray trailing space.
# Both steps are idempotent, so re-running this cell is safe.
df_list = (
    df_list
    .replace(r'\n', ' ', regex=True)
    .replace(r'^\s+|\s+$', '', regex=True)
)


### 5. Print the dataframe shape as per instructions

In [9]:
# Report the table size as a (rows, columns) tuple.
print((len(df_list.index), len(df_list.columns)))

(103, 3)


### 6. Before we start converting the postcodes into latitude and longitude, we must install and import the required packages

In [10]:
!pip install pgeocode
import pgeocode
import folium
from geopy.geocoders import Nominatim
import numpy



### 7. Now find the longitude and latitude information

In this logic, we first obtain the location details for Toronto. As we process the postcodes, some are returned as NaN, which causes issues for the folium mapping commands. So, while inelegant, this code defaults to the Toronto coordinates for any postcode that returns NaN.

In this case I have used pgeocode rather than geocode or even the Excel sheet provided. This is simply to practise an alternative approach to achieving the same result.

In [19]:
# Geocode the city itself first. Its coordinates are used later to centre the
# map, and here as the fallback for any postcode that pgeocode cannot resolve
# (NaN coordinates cause problems for the folium markers).
address = 'Toronto'
gl = Nominatim(user_agent="govinda")
loc = gl.geocode(address)
if loc is None:
    # geocode() returned no match; without it there are no fallback coordinates.
    raise RuntimeError('Could not geocode {!r}; no fallback coordinates available.'.format(address))
lat = loc.latitude
long = loc.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(lat, long))

# Look up latitude and longitude for each postcode using pgeocode.
nomi = pgeocode.Nominatim('ca')

# Collect one record per postcode and build the DataFrame once at the end
# (DataFrame.append was removed in pandas 2.0 and is quadratic inside a loop).
pcode_records = []
for postcode in df_list['Postcode']:
    out = nomi.query_postal_code(postcode)

    # Read the result by field name rather than by position (previously
    # out[9] / out[10]), so the lookup does not depend on the field order.
    pc_lat = out['latitude']
    pc_long = out['longitude']

    # Unresolved postcodes come back as NaN: fall back to the Toronto coordinates.
    if pd.isna(pc_lat) or pd.isna(pc_long):
        pc_lat, pc_long = lat, long

    pcode_records.append({'Postcode': postcode, 'Long': pc_long, 'Lat': pc_lat})

pcodes = pd.DataFrame(pcode_records, columns=['Postcode', 'Long', 'Lat'])


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### 8. Now merge the two DataFrames

In [20]:
# Attach the looked-up coordinates to the scraped table. A left join on
# 'Postcode' keeps every row of df_list.
df_list_full = pd.merge(df_list, pcodes, how='left', on='Postcode')

# Print the first rows to confirm that the coordinate columns were added.
print(df_list_full.head(5))


  Postcode            Borough                                 Neighbourhood  \
0     M3A         North York                                     Parkwoods    
1     M4A         North York                              Victoria Village    
2     M5A   Downtown Toronto                     Regent Park, Harbourfront    
3     M6A         North York              Lawrence Manor, Lawrence Heights    
4     M7A   Downtown Toronto   Queen's Park, Ontario Provincial Government    

      Long      Lat  
0 -79.3300  43.7545  
1 -79.3148  43.7276  
2 -79.3626  43.6555  
3 -79.4504  43.7223  
4 -79.3889  43.6641  


### 9. Now we plot the initial map to show the locations identified above. A different fill colour is used for practice only.

In [25]:
# Create a map of Toronto centred on the city coordinates (lat, long).
map_toronto = folium.Map(location=[lat, long], zoom_start=10)

# Add one circle marker per row. The loop variables are deliberately NOT named
# `lat`/`long`: reusing `lat` here would overwrite the Toronto latitude, so
# re-running this cell would centre the map on the last marker instead.
for marker_lat, marker_lng, borough, neighborhood in zip(
        df_list_full['Lat'],
        df_list_full['Long'],
        df_list_full['Borough'],
        df_list_full['Neighbourhood']):
    # Popup text shown when the marker is clicked: "<neighbourhood>, <borough>".
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [marker_lat, marker_lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#000000',
        fill_opacity=0.7).add_to(map_toronto)

# Last expression of the cell: renders the map inline.
map_toronto
