In [None]:
import re
import sqlite3
from pathlib import Path
from datetime import date
import requests
import pandas as pd
from bs4 import BeautifulSoup, Tag
from types_ import JobFilter

In [None]:
# Search filters; they are comma-joined into the `filterSearch` query value of the listing request.
# NOTE(review): presumably "internships posted within the last 7 days" — confirm against the site's filter ids.
FILTER: JobFilter = [
    'jobType_INTERNSHIP',
    'postingDate_7',
]

# SQL script read once from disk; it is run with `executescript` in the database-loading cell.
CREATE_TABLE_QUERY = Path('../sql/tables.sql').read_text()

In [None]:
def get_job_listing(
        filters: JobFilter | None = None,
        timeout: float = 30.0,
    ) -> list[Tag]:
    """This function fetches job listings from the BMW Group website.
    Args:
        filters (JobFilter | None): A list of filters to apply to the job search.
            If None (or empty), no filters are applied.
        timeout (float): Seconds to wait for the server before giving up.
    Returns:
        list[Tag]: One Tag per listed job, i.e. every ``div`` with the class
            ``grp-jobfinder__wrapper`` found in the response.
    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds."""

    url = (
        'https://www.bmwgroup.jobs/'
        'en/_jcr_content/main/layoutcontainer_5337/jobfinder30.jobfinder_table.content.html'
    )

    # Append the query in a separate step: a conditional expression placed after
    # the concatenated literals applies to the WHOLE string and would leave the
    # URL empty whenever no filters are given.
    if filters:
        url += f'?filterSearch={",".join(filters)}'

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout = timeout)
    # Fail loudly instead of parsing an error page as if it were a job listing.
    response.raise_for_status()

    html = response.content.decode('utf-8')

    return BeautifulSoup(html, 'html.parser').find_all('div', class_ = 'grp-jobfinder__wrapper')

In [None]:
def get_job_page(link: str, timeout: float = 30.0) -> str:
    """Simply fetches the job page content from the given link.
    Args:
        link (str): The URL of the job page.
        timeout (float): Seconds to wait for the server before giving up.
    Returns:
        str: The HTML content of the job page, decoded as UTF-8.
    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds."""

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(link, timeout = timeout)
    # Fail here instead of handing an error page to the HTML parsing step.
    response.raise_for_status()

    return response.content.decode('utf-8')

In [None]:
# Alias used in signatures to signal that a string holds HTML markup.
Html = str

def get_job_description(job_page: Html) -> Html:
    """This function extracts the job description from the job page HTML, but keeps the formatting.
    Args:
        job_page (Html): The HTML content of the job page.
    Returns:
        str: The formatted job description as a html string, with ``class`` and
            ``itemprop`` attributes removed and newlines, tabs and carriage
            returns stripped out.
    Raises:
        ValueError: If the page does not contain the description container."""

    description = BeautifulSoup(job_page, 'html.parser').find('div', class_ = 'container-layout container no-top-spacing no-bottom-spacing')

    # `find` returns None when nothing matches; report that explicitly instead
    # of failing later with an opaque AttributeError on None.
    if description is None:
        raise ValueError('Job description container not found in job page.')

    # Remove all attributes that are not needed, since only the formatted HTML is kept.
    for tag in description.find_all(True):
        tag.attrs.pop('class', None)
        tag.attrs.pop('itemprop', None)

    return (
        description.decode_contents()
        .replace('\n', '')
        .replace('\t', '')
        .replace('\r', '')
        .strip()
    )

In [None]:
def get_job_publishing_date(job_infos: Tag) -> date:
    """Extracts the job publishing date from the job information section.
    Args:
        job_infos (Tag): A BeautifulSoup Tag object containing the job information.
    Returns:
        date: The date when the job was published, as a datetime.date object.
    Raises:
        ValueError: If the publication cell is missing or its text holds no
            date in DD.MM.YYYY form."""

    cell = job_infos.find('div', class_ = 'grp-jobfinder__cell-publication')
    # `find` returns None when nothing matches; name the problem instead of
    # failing with an AttributeError on None.
    if cell is None:
        raise ValueError('Publication cell not found in job listing.')

    # The cell text may carry more than the date itself, so pick out the date part.
    match = re.search(r'\d{2}\.\d{2}\.\d{4}', cell.text)
    if match is None:
        raise ValueError(f'No publication date found in {cell.text.strip()!r}.')

    return pd.to_datetime(match.group(), format = '%d.%m.%Y').date()

## Extracting job data

In [None]:
# Fixed column set: the frame keeps this schema even when the listing is empty,
# so the database cell can always select `job_portal_id`.
JOB_COLUMNS = [
    "job_portal_id",
    "job_link",
    "job_title",
    "job_description",
    "job_posting_date",
    "job_type",
    "job_field",
    "job_city",
]

# Collect one record per listed job, then build the DataFrame in a single step.
job_records = []
for job in get_job_listing(FILTER):
    # Look the anchor and its reference cell up once; several fields read from them.
    anchor = job.find('a')
    refno = anchor.find('div', class_='grp-jobfinder-cell-refno')

    link = 'https://www.bmwgroup.jobs/' + anchor.get('href')
    html = get_job_page(link)

    job_records.append(
        {
            "job_portal_id":    job.get('data-job-id'),
            "job_link":         link,
            "job_title":        anchor.get('aria-label'),
            "job_description":  get_job_description(html),
            "job_posting_date": get_job_publishing_date(job),
            "job_type":         refno.get('data-job-type'),
            "job_field":        refno.get('data-job-field'),
            "job_city":         refno.get('data-job-location'),
        }
    )

jobs = pd.DataFrame(job_records, columns = JOB_COLUMNS)
display(jobs)

## Loading jobs into the database

In [None]:
# sqlite3's connection context manager only commits (or rolls back) the
# transaction; it does not close the connection, so close it explicitly.
connection = sqlite3.connect('../data/jobs.db')
try:
    with connection:

        connection.executescript(CREATE_TABLE_QUERY)

        # Portal ids that are already stored in the `jobs` table.
        already_exists = pd.read_sql('SELECT job_portal_id FROM jobs', connection)
        already_exists = already_exists['job_portal_id'].astype(int)

        # Keep only the scraped jobs whose portal id is not stored yet.
        jobs = jobs[
            ~jobs['job_portal_id'].astype(int)
            .isin(already_exists)
        ]

        jobs.to_sql(
            name = 'jobs',
            con = connection,
            if_exists = 'append',
            index = False,
        )
finally:
    connection.close()