In [2]:
import requests
from bs4 import BeautifulSoup
import bs4
import pandas as pd

In [3]:
url_en = "http://www.veterans.gc.ca/eng/contact/map"
url_fr = "http://www.veterans.gc.ca/fra/contact/map"

resp_en = requests.get(url_en)
resp_fr = requests.get(url_fr)

text_en = resp_en.text
text_fr = resp_fr.text

In [11]:
soup_en = BeautifulSoup(text_en, 'html.parser')
soup_fr = BeautifulSoup(text_fr, 'html.parser')

In [12]:
def get_data_from_details(details):
    office_name_w_whitespace = details.summary.text
    office_name = office_name_w_whitespace.strip()
    p = details.find("p")
    address = ', '.join([x for x in p.contents if type(x) != bs4.element.Tag])    
    a = details.find("a")    
    latLngStr = a.attrs['href'].split('(')[-1].split(')')[0].split(',')[:2]   
    lat = float(latLngStr[0]) 
    lng = float(latLngStr[1])
    return {
        'office_name': office_name,
        'address': address,
        'lat': lat,
        'lng': lng
           }

In [13]:
office_data_en = []
office_data_fr = []

for details in soup_en.find_all('details'):
    try:
        office_data_en.append(get_data_from_details(details))
    except:
        print('exception!')
        
for details in soup_fr.find_all('details'):
    try:
        office_data_fr.append(get_data_from_details(details))
    except:
        print('exception!')

In [29]:
office_data_en_2 = [x for x in office_data_en if 'office' in x['office_name'].lower()]
office_data_fr_2 = [x for x in office_data_fr if 
                    (
                        'bureau' in x['office_name'].lower() 
                        and 'pensions' not in x['office_name'].lower() 
                        and 'BSJP' not in x['office_name']
                        and 'CISP' not in x['office_name']
                    )
                   ]

In [30]:
len(office_data_fr_2)

38

In [35]:
coords_en = [(x['lat'], x['lng']) for x in office_data_en_2]
coords_fr = [(x['lat'], x['lng']) for x in office_data_fr_2]

In [41]:
assert(len(set(coords_en + coords_fr)) == len(set(coords_en)))
assert(len(set(coords_en + coords_fr)) == len(set(coords_fr)))      

In [42]:
# we can use coords as the id to match english and french entries

In [43]:
office_data = {}
for x in office_data_en_2:
    coords = (x['lat'], x['lng'])
    if coords not in office_data:
        office_data[coords] = {}
    office_data[coords]['name_en'] = x['office_name']
    office_data[coords]['address_en'] = x['address']
    office_data[coords]['lng'] = x['lng']
    office_data[coords]['lat'] = x['lat']
    
    
for x in office_data_fr_2:
    coords = (x['lat'], x['lng'])
    if coords not in office_data:
        office_data[coords] = {}
    office_data[coords]['name_fr'] = x['office_name']
    office_data[coords]['address_fr'] = x['address']

In [47]:
df = pd.DataFrame.from_dict(list(office_data.values()))

In [49]:
df.head()

Unnamed: 0,address_en,address_fr,name_en,name_fr
0,"Bantrel Tower , 700 6, Avenue Southwest, 7, ...","Tour Bantrel, 700, 6, Sud-ouest, 7, étage, C...",Calgary Office,Bureau de Calgary
1,"940 Canada Place, 9700 Jasper Avenue, Edmonton...","940 Canada Place, 9700, avenue Jasper, Edmonto...",Edmonton Office,Bureau d'Edmonton
2,"471 Queensway Avenue,, 3, Floor, Suite 313, K...","471, avenue Queensway,, 3, étage, bureau 313,...",Kelowna Office,Bureau de Kelowna
3,"60 Nanaimo Avenue West , Penticton, BC","60, avenue Nanaimo Ouest , Penticton (Colombie...",Penticton Office,Bureau de Penticton
4,"299 Victoria Street, 4, Floor, Suite 435, Pri...","299, rue Victoria, 4, étage, bureau 435, Prin...",Prince George Office,Bureau de Prince George


In [50]:
df.to_csv("vac_office_data.csv")