##### First we import all libraries required for this notebook

In [24]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests
from bs4 import BeautifulSoup

##### Read the Toronto postal code data from wiki
##### Use the BeautifulSoup library to scrape the data

In [25]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of silently
# parsing an error page.
toronto_neigh = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
toronto_neigh.raise_for_status()

# Keep the raw HTML and a parsed tree; the tree is what gets searched for the table.
toronto_neigh_text = toronto_neigh.text
toronto_neigh_soup = BeautifulSoup(toronto_neigh_text, "lxml")

##### Create dataframe with 3 columns using pandas. 

In [26]:
# Let pandas display up to 999 rows before truncating the output.
pd.set_option("display.max_rows", 999)

# Empty dataframe with the three target columns.
column_names = ["PostalCode", "Borough", "Neighborhood"]
toronto_nhoods = pd.DataFrame(columns=column_names)

##### Scrape the data by reading the main table using its class name.
##### Read all rows. If a row does not contain exactly 3 `td` cells, ignore it, as it will not contain relevant data.
##### Check if value of Borough is 'Not assigned'. If it is, then ignore this row.
##### If the value of Borough is assigned but the value of Neighborhood is 'Not assigned', then assign the value of Borough to Neighborhood for this row.
##### FILL the dataframe with data for 3 columns

In [27]:
# Locate the postal-code table by its CSS classes and fail with a clear message
# if it is not there (e.g. the page layout changed).
table_main = toronto_neigh_soup.find('table', class_='wikitable sortable')
if table_main is None:
    raise ValueError("Could not find a 'wikitable sortable' table in the page")

# Not every parser/page provides an explicit <tbody>; fall back to the table itself.
tablebody_main = table_main.find('tbody')
if tablebody_main is None:
    tablebody_main = table_main
tablerows_all = tablebody_main.find_all('tr')

# Collect rows in a plain list and build the dataframe once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
records = []
for row in tablerows_all:
    cells = row.find_all('td')

    # Rows without exactly three <td> cells carry no postal-code data.
    if len(cells) != 3:
        continue

    # Compare on stripped cell text so surrounding whitespace/newlines
    # cannot defeat the 'Not assigned' checks.
    postal_code, borough, neighborhood = (cell.text.strip() for cell in cells)

    # Skip postal codes that have no borough.
    if borough == "Not assigned":
        continue

    # A neighborhood that is not assigned takes the borough's name.
    if neighborhood == "Not assigned":
        neighborhood = borough

    records.append({'PostalCode': postal_code,
                    'Borough': borough,
                    'Neighborhood': neighborhood})

toronto_nhoods = pd.DataFrame(records, columns=column_names)
    
    
    

##### Verify the data

In [29]:
# Display up to the first 50 rows of the dataframe as a sanity check.
toronto_nhoods.head(50)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


##### We want to group the data by Postalcode, Borough and then concatenate (join) Neighborhood column so that we get a unique row for each Postalcode/Borough 

In [30]:
# Group the neighborhood names by (PostalCode, Borough), join each group's
# names with commas, then turn the group keys back into ordinary columns.
nhoods_by_code = toronto_nhoods.groupby(['PostalCode', 'Borough'])['Neighborhood']
toronto_nhood_grouped = nhoods_by_code.apply(','.join).reset_index()

# Show the first rows of the grouped result.
toronto_nhood_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


##### Use the `shape` attribute to get the number of rows and columns in the dataframe
##### There are 103 rows and 3 columns

In [31]:
# (number of rows, number of columns) of the grouped dataframe.
toronto_nhood_grouped.shape

(103, 3)