In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
url = 'https://www.canada.ca/en/health-canada/corporate/transparency/corporate-management-reporting/evaluation.html'  #first page to scrape
response = requests.get(url)
content = response.content
soup = BeautifulSoup(content, 'html.parser') #load html of the main site

hrefs = []
names = []

In [3]:
eval_patterns = [
    'evaluation/results-',
    'evaluation/summary',
    'evaluation-reports/',
    'evaluations/',
    'reporting/summary',
    'reporting/evaluation',
    'publications.gc.ca/'
]
def eval_url(url, patterns=eval_patterns):
    for pattern in patterns:
        if pattern in url:
            return 'https://www.canada.ca/' + url
    return False

In [4]:
for link in soup.find_all('a'): #iterate through all links of the mainsite url (for each year)
    href = link.get('href') # get a list of sub-urls for reports for each year
    if href and (href := eval_url(href)):
        hrefs.append(href)
        names.append(link.text)

hrefs = list(set(hrefs)) #remove duplicates

In [6]:
import os
import glob
def clear_folder(folder):
    for file in os.listdir(folder):
        file_path = os.path.join(folder, file)
        if os.path.isfile(file_path):
            os.unlink(file_path)
base_dir = '../../data/raw'
clear_folder(base_dir)

In [7]:
from bs4 import BeautifulSoup
import requests

def load_full_text(url):
    # Make a request to the URL and retrieve the HTML content
    response = requests.get(url)
    content = response.content

    # Parse the HTML content
    soup = BeautifulSoup(content, 'html.parser')

    # Extract all the text
    section_identifier = 'main[property="mainContentOfPage"][resource="#wb-main"][typeof="WebPageElement"][class="container"]'
    section = soup.select_one(section_identifier) #select texts only from a specific section
    all_text = section.get_text()
    return all_text

success = 0
for i, url in enumerate(hrefs):
    name = names[i]
    #get content
    try:
        full_text = load_full_text(url)
    except Exception as e:
        print(f'{str(i).zfill(3)} - Failed to load "{url}"')
        print(e)
        continue
    #write content
    try:
        fp = open(f'{base_dir}/hc_{str(i).zfill(3)}_{name}.txt', 'w') #write each evaluation report into a separate .txt file
        fp.write(full_text)
        fp.close()
        success += 1
        print(f'{str(i).zfill(3)} - Successfully wrote "{name}"')
    except Exception as e:
        print(f'{str(i).zfill(3)} - Failed to write "{name}"')
        print(e)
        fp.close()
        if os.path.exists(fp.name):
            os.remove(fp.name)
print(f'Successfully wrote {success}/{len(hrefs)} files')

000 - Successfully wrote "At a Glance - Departmental Evaluation Plan for Health Canada, 2022-23 to 2026-27"
001 - Successfully wrote "At a Glance: Departmental Evaluation Plan for Health Canada 2021-22 to 2025-26"
002 - Successfully wrote "At a Glance: Departmental Evaluation Plan for Health Canada 2020-21 to 2024-25"
003 - Successfully wrote "At a Glance- Departmental Evaluation Plan for the Public Health Agency of Canada and Health Canada, 2019-20 to 2023-24"
004 - Successfully wrote "Results at a glance"
005 - Successfully wrote "Evaluation report"
006 - Successfully wrote "Results at a glance"
007 - Successfully wrote "Evaluation report"
008 - Successfully wrote "Results at a glance"
009 - Successfully wrote "Evaluation Report"
010 - Successfully wrote "Results at a glance"
011 - Successfully wrote "Evaluation Report"
012 - Failed to write "Results at a glance"
'charmap' codec can't encode character '\u2212' in position 243849: character maps to <undefined>
013 - Successfully wrote