## Week 3 of the Capstone Project for the IBM Professional Data Science Certificate - Coursera

##### Import necessary libraries

In [57]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim 
import folium as f
print ('Pandas,Numpy & BeatufiulSoup have been imported!')

Pandas,Numpy & BeatufiulSoup have been imported!


Use the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe like the one shown below:

In [58]:
# Download the Wikipedia page listing the Toronto ("M") postal codes and parse its HTML.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
sourceweb = response.text
soup = BeautifulSoup(sourceweb, 'html.parser')

##### Getting the required information from the website without any further editing.

In [59]:
# Collect the three columns of the postal-code table into parallel lists
# (entry i of each list comes from the same table row).
postalCodeList = []
boroughList = []
neighborhoodList = []

# Reads the first <table> on the page and takes the first three <td> cells of
# each row as (postal code, borough, neighborhood), in that order.
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    # Rows without <td> cells (e.g. header rows built from <th>) are skipped;
    # requiring three cells also prevents an IndexError on short rows.
    if len(cells) >= 3:
        # Strip surrounding whitespace/newlines from every cell, not only the
        # last one, so later comparisons and joins on these values are exact.
        postalCodeList.append(cells[0].text.strip())
        boroughList.append(cells[1].text.strip())
        neighborhoodList.append(cells[2].text.strip())

##### Build the dataframe from the lists scraped from the website - 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [60]:
# Pair each column name with its scraped list, then build the dataframe from
# the resulting name -> values mapping (one dataframe row per table row).
toronto_neighorhood = list(zip(
    ('PostalCode', 'Borough', 'Neighborhood'),
    (postalCodeList, boroughList, neighborhoodList),
))
toronto_df = pd.DataFrame(dict(toronto_neighorhood))
toronto_df.head()

Unnamed: 0,Borough,Neighborhood,PostalCode
0,Not assigned,Not assigned,M1A
1,Not assigned,Not assigned,M2A
2,North York,Parkwoods,M3A
3,North York,Victoria Village,M4A
4,Downtown Toronto,Harbourfront,M5A


##### Remove rows with "Not assigned" under Borough and group neighborhoods sharing the same postal code

In [61]:
# Keep only the rows whose borough is assigned, renumbering the index from 0.
toronto_df_dropna = (
    toronto_df
    .loc[toronto_df['Borough'].ne('Not assigned')]
    .reset_index(drop=True)
)

# Collapse to one row per (PostalCode, Borough); the values of the remaining
# column(s) within each group are joined into a single comma-separated string.
toronto_df_grouped = (
    toronto_df_dropna
    .groupby(['PostalCode', 'Borough'], as_index=False)
    .agg(','.join)
)
toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


##### Making unassigned neighborhoods the same as their respective borough. 

In [62]:
# Boolean mask of the rows whose neighborhood is the literal 'Not assigned'.
na_neigh_rows = toronto_df_grouped['Neighborhood'].eq('Not assigned')
# On those rows, replace the neighborhood with the borough from the same row;
# every other row keeps its existing neighborhood value.
toronto_df_grouped['Neighborhood'] = toronto_df_grouped['Neighborhood'].mask(
    na_neigh_rows, toronto_df_grouped['Borough']
)

##### Printing the number of rows of the dataframe after cleaning the data.

In [63]:
# Take an explicit copy so the final frame is an independent object: with a
# plain assignment both names would refer to the same dataframe, and an
# in-place change made through one name would silently alter the other.
toronto_df_final = toronto_df_grouped.copy()
# (rows, columns) of the cleaned dataframe.
toronto_df_final.shape

(103, 3)

## Part 2

##### Getting latitude and longitude from the website specified as source

In [64]:
# Load the postal-code -> coordinates lookup table from the course-provided CSV.
source2 = "http://cocl.us/Geospatial_data/Geospatial_Coordinates.csv"
coor = pd.read_csv(source2)
# Column names are assigned positionally so the key column matches the
# 'PostalCode' column of the scraped dataframe. The former
# rename({'Postal Code': 'PostalCode'}, inplace=True) that followed was a no-op
# (no 'Postal Code' column exists after this assignment) and has been removed.
coor.columns = ['PostalCode', 'Latitude', 'Longitude']
coor.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


##### Creating a new dataframe by merging toronto_df_final with coor; the result has 5 columns (PostalCode, Borough, Neighborhood, Latitude and Longitude) and 103 rows

In [65]:
# Left-join the coordinates onto the cleaned table by postal code: every row of
# toronto_df_final is kept and gains the Latitude/Longitude of its PostalCode.
df_LALO = toronto_df_final.merge(coor, on='PostalCode', how='left')
df_LALO.head(30)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


## Part 3

In [66]:
# Look up the coordinates of Toronto, used below as the centre of the maps.
# Nominatim needs an application-specific user_agent: omitting it triggers a
# deprecation warning in geopy 1.x and a configuration error in geopy 2.x.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
loc = geolocator.geocode('Toronto')
# geocode() returns None when the service finds no match; fail with a clear
# message rather than an AttributeError on the next line.
if loc is None:
    raise ValueError("Geocoding 'Toronto' returned no result")
lat = loc.latitude
long = loc.longitude
print('The coordinates of the city of Toronto are Lat:{}, Long:{}.'.format(lat, long))

  if __name__ == '__main__':


The coordinates of the city of Toronto are Lat:43.653963, Long:-79.387207.


In [70]:
# Overview map centred on the Toronto coordinates, with one circle marker per
# row of df_LALO; clicking a marker shows that row's neighborhood text.
plot_map = f.Map(location=[lat, long], zoom_start=9)
for lt, lng, label in zip(df_LALO['Latitude'], df_LALO['Longitude'], df_LALO['Neighborhood']):
    # parse_html is a Popup argument, so it is set only here; it was also being
    # passed to CircleMarker, which has no such parameter.
    popup = f.Popup(label, parse_html=True)
    f.CircleMarker(
        [lt, lng],
        radius=8,
        popup=popup,
        color='green',
        fill=True,
        fill_opacity=0.4,
    ).add_to(plot_map)
plot_map

#### The map is too hard to read, so I zoom in more and change the parameters to make it easier on the eye.

In [68]:
# Closer view of the same data: higher zoom level and larger markers, again
# with one circle marker per row of df_LALO and the neighborhood text as popup.
plot_map = f.Map(location=[lat, long], zoom_start=12)
for lt, lng, label in zip(df_LALO['Latitude'], df_LALO['Longitude'], df_LALO['Neighborhood']):
    # parse_html is a Popup argument, so it is set only here; it was also being
    # passed to CircleMarker, which has no such parameter.
    popup = f.Popup(label, parse_html=True)
    f.CircleMarker(
        [lt, lng],
        radius=10,
        popup=popup,
        color='green',
        fill=True,
        fill_opacity=0.4,
    ).add_to(plot_map)
plot_map

##### We can now see all the different boroughs and their respective neighborhoods marked using their respective Latitude and Longitude.

![Toronto Map](/Map%20Toronto.JPG)