### Installing necessary libraries for scraping

In [1]:
#installing the required libraries

!conda install -c conda-forge beautifulsoup4 --yes
!conda install -c conda-forge html5lib --yes
!conda install -c conda-forge lxml --yes

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.11

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    soupsieve-1.9.2            |           py36_0          59 KB  conda-forge
    beautifulsoup4-4.8.0       |           py36_0         144 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         202 KB

The following NEW packages will be INSTALLED:

    soupsieve:      1.9.2-py36_0 conda-forge

The following packages will be UPDATED:

    beautifulsoup4: 4.6.3-py37_0             --> 4.8.0-py36_0 conda-forge


Downloading and Extracting Packages
soupsieve-1.9.2      | 59 KB     |

In [51]:
#importing the libraries

import requests
import pandas as pd
from bs4 import BeautifulSoup

In [58]:
# Download the Wikipedia page that lists the "M" postal codes and parse it.
#
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as if it were the article.
# NOTE(review): the page is fetched live, so the parsed table depends on the
# article's current layout.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source_code = response.text
code_text = BeautifulSoup(source_code, 'lxml')


In [59]:
# Get the first table on the page and its rows, where the data is stored.
table = code_text.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

# `col` holds every <tr> of the table; `row` drops the first one
# (presumably the header row, which has no <td> cells).
col = table.find_all('tr')
row = col[1:]

# The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
column_names = ['PostalCode', 'Borough', 'Neighborhood']

# Collect one record per table row and build the dataframe once at the end.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration, and DataFrame.append was removed in pandas 2.0.
records = []
for table_row in row:
    cells = table_row.find_all('td')

    # Skip rows that do not carry all three data cells instead of raising IndexError.
    if len(cells) < len(column_names):
        continue

    # Convert the HTML cells to plain text. strip() removes the trailing '\n'
    # without cutting a real character when a cell has no trailing newline.
    postal, borough, neighbor = (cell.text.strip() for cell in cells[:3])

    records.append({'PostalCode': postal,
                    'Borough': borough,
                    'Neighborhood': neighbor})

# Passing `columns` keeps the expected column order even when no rows were found.
neighborhoods = pd.DataFrame(records, columns=column_names)

neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [60]:
# Keep only the rows whose borough is assigned: rows with a 'Not assigned'
# borough are dropped and the index is renumbered from 0.

neighborhoods = (
    neighborhoods
    .loc[neighborhoods['Borough'].ne('Not assigned')]
    .reset_index(drop=True)
)
neighborhoods.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [61]:
# One postal code area can contain several neighborhoods. Collapse those rows
# into a single row per (PostalCode, Borough) pair, with the neighborhood
# names joined by ', '.

neighborhoods = (
    neighborhoods
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(lambda names: ', '.join(names))
    .reset_index()
)
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [62]:
# A row that has a borough but a 'Not assigned' neighborhood takes the borough
# name as its neighborhood.

unassigned = neighborhoods['Neighborhood'].eq('Not assigned')
neighborhoods.loc[unassigned, 'Neighborhood'] = neighborhoods.loc[unassigned, 'Borough']
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [64]:
#Show the dimensions of the final dataframe. `.shape` is an attribute (not a method) holding (number of rows, number of columns).

neighborhoods.shape

(103, 3)