In [1]:
from io import BytesIO
import os
import pathlib
import re
import shutil
import time

from bs4 import BeautifulSoup
from lxml import html
import pandas as pd
import rarfile
import requests as rq

In [2]:
# Base address of the HCP website and the listing page filtered on the
# "Annuaires statistiques du Maroc" tag.
HCP_HOST = "https://www.hcp.ma"
ANNUAIRES_STATISTIQUES_ENDPOINT = "downloads/?tag=Annuaires+statistiques+du+Maroc"
ANNUAIRES_STATISTIQUES_URL = f"{HCP_HOST}/{ANNUAIRES_STATISTIQUES_ENDPOINT}"

# Local folders: root of the data area and the sub-folder receiving the
# extracted archives.
DATA_PATH = pathlib.Path('/dataplatform_lab') / 'lab' / 'dwh_data'
EXTRACTS_PATH = DATA_PATH / 'extracts'

In [3]:
def get_io_annuaire(url:str, timeout:float=60) -> rarfile.RarFile:
    """Download the RAR archive at ``url`` and open it from memory.

    Args:
        url: Address of the archive to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the notebook forever.

    Returns:
        A ``rarfile.RarFile`` built on an in-memory buffer holding the
        response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = rq.get(url, timeout=timeout)
    # Stop on an HTTP error here; otherwise the error page would be handed to
    # RarFile and fail later with a misleading "not a RAR file" style error.
    response.raise_for_status()
    return rarfile.RarFile(BytesIO(response.content))

def extract_annuaire(url, extraction_path:pathlib.Path):
    """Download the RAR archive at ``url`` and extract it into ``extraction_path``.

    Args:
        url: Address of the archive, forwarded to ``get_io_annuaire``.
        extraction_path: Destination folder; handed to ``RarFile.extractall``
            as a POSIX-style string.
    """
    # Use the archive as a context manager so it is closed even when the
    # extraction raises.
    with get_io_annuaire(url) as rar_archive:
        rar_archive.extractall(extraction_path.as_posix())

def get_year_from_filename(filename):
    """Return the four-digit year that follows the word "année" in ``filename``.

    Only the first occurrence is returned, as a string. An ``IndexError`` is
    raised when the text contains no "année <4 digits>" sequence.
    """
    # With a single group in the pattern, findall yields the captured years.
    years_found = re.findall(r'année (?P<year>\d{4})', filename)
    return years_found[0]

def get_all_annuaires_urls(timeout:float=60):
    """Scrape the HCP listing page for the Excel-format yearbook downloads.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument.

    Returns:
        A dict mapping the year found in each link text (a four-digit string)
        to the download URL, built as ``HCP_HOST`` followed by the link's
        ``href``. If several links carry the same year, the last one wins.

    Raises:
        requests.HTTPError: If the listing page answers with an error status.
        IndexError: If a matching link text contains no "année <year>" part.
    """
    response = rq.get(ANNUAIRES_STATISTIQUES_URL, timeout=timeout)
    # Without this check an error page would be parsed as HTML, no link would
    # match, and the caller would silently get an empty dict.
    response.raise_for_status()
    tree = html.fromstring(response.content)
    # Keep only the links whose text mentions "(Format Excel)".
    ls_file_elements = tree.xpath("//div[@class='titre_fichier']/a[contains(text(),'(Format Excel)')]")
    dc_annuaires = {
        get_year_from_filename(e.text):HCP_HOST+e.get('href') for e in ls_file_elements
    }
    return dc_annuaires

def extract_all_annuaires():
    """Download and extract every listed yearbook that is not on disk yet.

    For each year returned by ``get_all_annuaires_urls`` the archive is
    extracted into ``EXTRACTS_PATH / "Annuaire Statistique <year>"``. A year
    whose folder already exists is skipped. Progress is reported with
    ``print``.

    Because the presence of the folder is the only "already done" marker, a
    folder left behind by a failed or interrupted extraction is removed before
    the error is re-raised, so the next run retries that year instead of
    skipping it.
    """
    dc_annuaires = get_all_annuaires_urls()

    for year, url in dc_annuaires.items():
        annuaire_folder_name = f"Annuaire Statistique {year}"
        annuaire_path = pathlib.Path(EXTRACTS_PATH, annuaire_folder_name)
        if annuaire_path.exists():
            print(f'Annuaire Statistique {year} already exists in path {annuaire_path}')
            continue

        print(f'Extracting Annuaire Statistique {year} from URL {url}')
        try:
            extract_annuaire(url, annuaire_path)
        except BaseException:
            # BaseException also covers a kernel interrupt (KeyboardInterrupt).
            # The folder did not exist before this attempt, so anything there
            # now is a partial extraction and is safe to discard.
            shutil.rmtree(annuaire_path, ignore_errors=True)
            raise

In [5]:
# Run the full job: fetch the listing, then extract every yearbook whose
# folder is not already present under EXTRACTS_PATH.
extract_all_annuaires()

Extracting Annuaire Statistique 2022 from URL https://www.hcp.ma/file/235856/
Extracting Annuaire Statistique 2021 from URL https://www.hcp.ma/file/235429/
Extracting Annuaire Statistique 2020 from URL https://www.hcp.ma/file/235427/
Extracting Annuaire Statistique 2019 from URL https://www.hcp.ma/file/235425/
