<h2> Web scraping using pandas and BeautifulSoup </h2>
by: <b>Nur Cahyo Nugroho </b>

<h4>Import all necessary libraries </h4>

In [72]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

<h4> Call BeautifulSoup </h4>

In [None]:
# Request the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)

# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error page.
res.raise_for_status()

# Parse the response body with the lxml parser.
soup = BeautifulSoup(res.content, 'lxml')

<h4>Using BeautifulSoup to get HTML content from the website for web scraping</h4>
<ul>
    <li>get the first table (index [0]), which holds all records related to postal codes </li>
    <li>ignore header rows (th)</li>
    <li>loop over the records in tr and td, and apply all the logic before appending each record to the list </li>
</ul>

In [73]:
# Get the first table in the HTML (index 0); it contains the postal code data.
tables = soup.find_all('table')
if not tables:
    raise ValueError('No <table> element found in the parsed page')
table = tables[0]

# Placeholder text the page uses for a missing borough / neighborhood.
NOT_ASSIGNED = 'Not assigned'

# Final result: one [postal code, borough, neighborhood(s)] list per record.
postcode_list = []

# Index of the records already in postcode_list, keyed by (postal code, borough),
# so a repeated key is found directly instead of rescanning the whole list.
# The values are the very same list objects stored in postcode_list.
records_by_key = {}

# Loop over all rows of the table.
for tr in table.find_all('tr'):
    # Ignore header rows (rows that contain <th> cells).
    if tr.find_all('th'):
        continue

    # Keep every cell as its own list item. Joining the cells with ',' and
    # splitting again would break any cell whose text itself contains a comma.
    cells = [td.get_text().strip('\n') for td in tr.find_all('td')]

    # A usable row needs postal code, borough and neighborhood.
    if len(cells) < 3:
        continue

    postal_code, borough, neighborhood = cells[:3]

    # Skip records whose borough is 'Not assigned'.
    if NOT_ASSIGNED in borough:
        continue

    # A 'Not assigned' neighborhood falls back to the borough name. This is
    # applied to every record, including the first one and merged ones.
    if NOT_ASSIGNED in neighborhood:
        neighborhood = borough

    key = (postal_code, borough)
    existing_rec = records_by_key.get(key)
    if existing_rec is None:
        # First time this (postal code, borough) pair is seen: add a new record.
        rec = [postal_code, borough, neighborhood]
        records_by_key[key] = rec
        postcode_list.append(rec)
    else:
        # Already present: append the neighborhood to the existing record.
        existing_rec[2] += ',' + neighborhood

postcode_list

[['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront,Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights,Lawrence Manor'],
 ['M7A', "Queen's Park", "Queen's Park"],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge,Malvern'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens,Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson,Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M9B',
  'Etobicoke',
  'Cloverdale,Islington,Martin Grove,Princess Gardens,West Deane Park'],
 ['M1C', 'Scarborough', 'Highland Creek,Rouge Hill,Port Union'],
 ['M3C', 'North York', 'Flemingdon Park,Don Mills South'],
 ['M4C', 'East York', 'Woodbine Heights'],
 ['M5C', 'Downtown Toronto', 'St. James Town'],
 ['M6C', 'York', 'Humewood-Cedarvale'],
 ['M9C',
  'Etobicoke',
  'Bloordale Gardens,Eringate,Markland Wood,Old Burnhamthorpe'],
 ['M1E', 'Scarborough', 'Guildwood,Morni

<h4> Create dataframe from the list </h4>

In [74]:
# Column labels, in the same order as the fields of each scraped record.
column_name = ['Postal Code', 'Borough', 'Neighborhood']

# Build the dataframe from the list of records.
canada_postcode_df = pd.DataFrame(data=postcode_list, columns=column_name)

# Preview the first ten rows.
canada_postcode_df.head(n=10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


<h4> Display the shape of dataframe</h4>

In [75]:
# Show the dataframe's dimensions as a (rows, columns) tuple.
canada_postcode_df.shape

(103, 3)