# Web scraping the Canada Post "Find a Post Office" search results for post office addresses

### *Andrew Leung - Feb 02, 2019*

## Libraries

In [1]:
from bs4 import BeautifulSoup
import os
import pandas as pd
import requests as requests

# Extension for BeautifulSoup that adds code highlighting when a soup object is displayed.
# Install it if needed: pip install "ipython-beautifulsoup[bs4,notebook]"
# NOTE(review): configure_ipython_beautifulsoup is not imported in this notebook; it is
# presumably made available by loading the extension -- confirm against the extension docs.
%load_ext soup
configure_ipython_beautifulsoup(show_html=True, 
                                show_css=False, 
                                show_js=False)

Monkey patch BeautifulSoup with custom rendering
See `configure_ipython_beautifulsoup?` for configuration information
Push 'BeautifulSoup' from 'bs4' into current context
Push 'urlopen' from 'urllib.request' into current context
Push 'p' shortcut into current context
Push 'requests' into current context


## URL of search results for Canada Post

In [5]:
# Single-page example: the results list for one latitude/longitude pair.
url = 'https://www.canadapost.ca/cpotools/apps/fpo/personal/findPostOfficeList?lat=43.8004476&lng=-79.27473520000001'

# A timeout stops the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error (4xx/5xx) instead of silently parsing an error page later.
response.raise_for_status()

# Peek at the start of the payload to confirm HTML came back.
print(response.text[:500])

<!DOCTYPE html >
<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
    <head>

        <meta name="msvalidate.01" content="1AC9A4E083D8F5E10E160861745E4E0F" />
        <meta name="y_key" content="36a5b4f0d2c3995c" />
            <meta http-equiv="Content-Type" content="application/xhtml+xml, text/xml, text/html; charset=utf-8" />
            <meta http-equiv="content-language" content="en-CA" />
            <title>Canada Post - Find a Post Office - Results List</title>
    <meta http-equiv="


### Examine the full HTML response

In [11]:
# Parse the response body with Python's built-in HTML parser, then print it
# through .prettify() so nested tags are indented.
html_soup = BeautifulSoup(markup=response.text, features='html.parser')
print(html_soup.prettify())


<!DOCTYPE html >
<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta content="1AC9A4E083D8F5E10E160861745E4E0F" name="msvalidate.01"/>
  <meta content="36a5b4f0d2c3995c" name="y_key"/>
  <meta content="application/xhtml+xml, text/xml, text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="en-CA" http-equiv="content-language"/>
  <title>
   Canada Post - Find a Post Office - Results List
  </title>
  <meta content="post office, personal, postal outlet, post office Canada, find a post office, locate a post office, post office finder, locator, locations, hours, hours of operation, postal outlet, postal outlets, post office locations" http-equiv="keywords"/>
  <meta content="Find a post office near you using our online tool. Enter your address or postal code and get nearby post offices, outlets, maps and hours." http-equiv="description"/>
  <meta content="Canada Post" name="categories"/>
  <meta content="Post office locator, find nearest post office and po

### Examine rendered webpage

In [13]:
# Display the soup object as the cell's last expression (rather than print)
# so the BeautifulSoup extension's syntax-highlighted rendering is used.
html_soup



  copy = BeautifulSoup(string_representation(soup))


## Identifying patterns and parsing necessary content

In [57]:
# Post office name of the first entry in the "results" div.
# Inside the <li>, contents[2] is the text node right after the hidden counter <span>.
post_name = html_soup.find('div', attrs={'id': 'results'}).find('li').contents[2].strip()

In [58]:
# Street address of the first entry in the "results" div.
# contents[4] is the text node that follows the first <br/> inside the <li>.
address = html_soup.find('div', attrs={'id': 'results'}).find('li').contents[4].strip()

In [81]:
# City, province and postal code of the first entry share one text node
# (contents[6], after the second <br/>); non-breaking spaces are turned into commas.
city_postal = (
    html_soup.find('div', attrs={'id': 'results'})
    .find('li')
    .contents[6]
    .strip()
    .replace("\xa0", ",")
)

In [108]:
# Parse city and province: drop the last 9 characters of the combined string
# and trim surrounding whitespace.
# NOTE(review): assumes the postal code plus its separator always occupy
# exactly the trailing 9 characters -- confirm against the page format.
city =city_postal[:-9].strip()

In [109]:
# Parse postal code: keep the last 8 characters of the combined string and
# trim surrounding whitespace.
# NOTE(review): assumes the postal code always sits within the trailing
# 8 characters -- confirm against the page format.
postal = city_postal[-8:].strip()

In [110]:
# Rows are collected as a list of dicts (one dict per post office); here the
# list starts out holding the single entry parsed above.
df_list = [
    {
        'Post Office Name': post_name,
        'Address': address,
        'City': city,
        'Postal Code': postal,
    }
]

In [111]:
# Build the output DataFrame from the collected rows, with an explicit column order.
df_out = pd.DataFrame(
    data=df_list,
    columns=['Post Office Name', 'Address', 'City', 'Postal Code'],
)

In [112]:
# Show the DataFrame built from the first entry only.
df_out

Unnamed: 0,Post Office Name,Address,City,Postal Code
0,TICKETS AND MORE,1571 SANDHURST CIRCLE UNIT 161,SCARBOROUGH ON,M1V 1V0


In [114]:
# Collect every individual entry within the "results" div; each post office
# is contained in its own <li> tag.
postal_results = html_soup.find('div', attrs={'id': 'results'}).find_all(name='li')

In [124]:
# Sanity check: print every <li> entry with a 1-based counter.
# enumerate() replaces the hand-maintained counter, and the loop variable is
# named `entry` so it does not overwrite the `postal` postal-code string
# assigned in an earlier cell.
for count, entry in enumerate(postal_results, start=1):
    print(count, entry)

1 <li> <span class="hidden"> 1 </span>TICKETS AND MORE
                  <br/>1571 SANDHURST CIRCLE UNIT 161
                  <br/>SCARBOROUGH ON   M1V 1V0   
                  <!--br/>
                                    <h:outputText value="0.0 km"/>
                                    <br/-->
<br/>Post Office
                  <p><a href="/cpotools/apps/fpo/personal/findPostOfficeDetail?outletId=0000104697">View details and directions</a></p>
</li>
2 <li> <span class="hidden"> 2 </span>FINCH MIDLAND PHARMACY
                  <br/>4190 FINCH AVE E
                  <br/>SCARBOROUGH ON   M1S 4T0   
                  <!--br/>
                                    <h:outputText value="0.0 km"/>
                                    <br/-->
<br/>Post Office
                  <p><a href="/cpotools/apps/fpo/personal/findPostOfficeDetail?outletId=0000357693">View details and directions</a></p>
</li>
3 <li> <span class="hidden"> 3 </span>MAIN DRUG MART
                  <br/>1711 MCCOWAN RD
  

## Final script and clean output table

In [133]:
def parse_post_office(entry):
    """Turn one result <li> tag into a row dict for the output table.

    Parameters
    ----------
    entry : bs4 Tag
        One <li> element from the "results" div. Its children are read by
        position: contents[2] is the name, contents[4] the street address and
        contents[6] the combined city / province / postal code text.

    Returns
    -------
    dict
        Keys 'Post Office Name', 'Address', 'City' and 'Postal Code'.

    Raises
    ------
    IndexError
        If the <li> has fewer children than the positions read here.
    """
    post_name = entry.contents[2].strip()
    address = entry.contents[4].strip()
    # Non-breaking spaces in the combined text node are replaced with commas.
    city_postal = entry.contents[6].strip().replace("\xa0", ",")
    # Fixed-width split: postal code from the trailing 8 characters, city and
    # province from everything before the trailing 9.
    # NOTE(review): assumes the postal code width never varies -- confirm.
    city = city_postal[:-9].strip()
    postal_code = city_postal[-8:].strip()
    return {'Post Office Name': post_name,
            'Address': address,
            'City': city,
            'Postal Code': postal_code}


# Parse every result entry. The loop variable is no longer reassigned to the
# postal-code string inside its own loop body.
df_list = [parse_post_office(entry) for entry in postal_results]

df_out = pd.DataFrame(df_list, columns= ['Post Office Name','Address','City','Postal Code'])

In [134]:
# Final table with all post offices returned from the query.
df_out

Unnamed: 0,Post Office Name,Address,City,Postal Code
0,TICKETS AND MORE,1571 SANDHURST CIRCLE UNIT 161,SCARBOROUGH ON,M1V 1V0
1,FINCH MIDLAND PHARMACY,4190 FINCH AVE E,SCARBOROUGH ON,M1S 4T0
2,MAIN DRUG MART,1711 MCCOWAN RD,SCARBOROUGH ON,M1S 2Y0
3,A AND W PHARMACY,250 ALTON TOWERS CIR,SCARBOROUGH ON,M1V 3Z0
4,SCARBOROUGH STN D,280 PROGRESS AVE,SCARBOROUGH ON,M1P 2Z0
5,REXALL DRUGSTORE #0919,3607 SHEPPARD AVE E,SCARBOROUGH ON,M1T 3L0
6,SHOPPERS DRUG MART #0860,300 BOROUGH DR,SCARBOROUGH ON,M1P 4P0
7,SHOPPERS DRUG MART #0875,2355 WARDEN AVENUE,SCARBOROUGH ON,M1T 1V0
8,SHOPPERS DRUG MART #0980,2900 WARDEN AVE,SCARBOROUGH ON,M1W 2S0
9,SHOPPERS DRUG MART #1314,1235 MCCOWAN RD,SCARBOROUGH ON,M1H 3K0
