In [100]:
import re
import sqlite3
from pathlib import Path
from datetime import date
import requests
import pandas as pd
from bs4 import BeautifulSoup, Tag
from settings import BASE_DIR
from types_ import JobFilter

In [None]:
FILTER: JobFilter = ['jobType_INTERNSHIP', 'postingDate_7']
CREATE_TABLE_QUERY = Path('../sql/tables.sql').read_text()

In [102]:
def get_job_listing(
        # row_index: int = 1,
        filters: JobFilter | None = None,
    ) -> BeautifulSoup:
    """"""

    url = (
        'https://www.bmwgroup.jobs/'
        'en/_jcr_content/main/layoutcontainer_5337/jobfinder30.jobfinder_table.content.html?'
        f'filterSearch={",".join(filters)}' if filters else ''
    )

    response = requests.get(url)

    html = response.content.decode('utf-8')

    return BeautifulSoup(html, 'html.parser').find_all('div', class_ = 'grp-jobfinder__wrapper')

In [103]:
def get_job_page(link: str) -> str:
    """"""

    return requests.get(link).content.decode('utf-8')

In [104]:
Html = str

def get_job_description(job_page: Html) -> str:
    """TODO: Extract job description with all the bullets and page formatting, but as text."""

    description = (
        job_page
        .replace('\n', '')
        .replace('\t', '')
        .replace('\r', '')
        .replace('  ', '')
        .strip()
    )

    description = (
        BeautifulSoup(description, 'html.parser')
        .find('div', class_ = 'container-layout container no-top-spacing no-bottom-spacing')
        .text
    )

    return description

In [105]:
def get_job_publishing_date(job_infos: Tag) -> date:
    """"""

    date_str = job_infos.find('div', class_ = 'grp-jobfinder__cell-publication').text.strip()
    date_str = re.search(r'\d{2}\.\d{2}\.\d{4}', date_str).group()

    return pd.to_datetime(date_str, format = '%d.%m.%Y').date()

## Extracting jobs data

In [106]:
jobs = list()
for job in get_job_listing(FILTER):
    job: Tag = job

    link = 'https://www.bmwgroup.jobs/' + job.find('a').get('href')
    html = get_job_page(link)

    jobs.append(
        {
            "job_portal_id":    job.get('data-job-id'),
            "job_link":         link,
            "job_page_source":  html,
            "job_title":        job.find('a').get('aria-label'),
            "job_description":  get_job_description(html),
            "job_posting_date": get_job_publishing_date(job),
            "job_type":         job.find('a').find('div', class_='grp-jobfinder-cell-refno').get('data-job-type'),
            "job_field":        job.find('a').find('div', class_='grp-jobfinder-cell-refno').get('data-job-field'),
            "job_city":         job.find('a').find('div', class_='grp-jobfinder-cell-refno').get('data-job-location'),
        }
    )

jobs = pd.DataFrame(jobs)
display(jobs)

Unnamed: 0,job_portal_id,job_link,job_page_source,job_title,job_description,job_posting_date,job_type,job_field,job_city
0,160441,https://www.bmwgroup.jobs//en/jobfinder/job-de...,"\n\n\n\n\n<!DOCTYPE HTML>\n<html lang=""en"">\n<...",Praktikant Social Media und Website (w/m/x),EIN GUTES PRAKTIKUM IST PRAKTISCH NIE THEORETI...,2025-06-06,Internship,External Corporate Communication,Munich
1,160337,https://www.bmwgroup.jobs//en/jobfinder/job-de...,"\n\n\n\n\n<!DOCTYPE HTML>\n<html lang=""en"">\n<...",Praktikant Versuch Antriebsentwicklung BMW M (...,EIN GUTES PRAKTIKUM IST PRAKTISCH NIE THEORETI...,2025-06-06,Internship,Drive Train,Munich
2,160305,https://www.bmwgroup.jobs//en/jobfinder/job-de...,"\n\n\n\n\n<!DOCTYPE HTML>\n<html lang=""en"">\n<...",Praktikant Kommunikation Digital Experience (w...,EIN GUTES PRAKTIKUM IST PRAKTISCH NIE THEORETI...,2025-06-06,Internship,External Corporate Communication,Munich
3,160283,https://www.bmwgroup.jobs//en/jobfinder/job-de...,"\n\n\n\n\n<!DOCTYPE HTML>\n<html lang=""en"">\n<...",Praktikant Social Media Analyse (w/m/x),EIN GUTES PRAKTIKUM IST PRAKTISCH NIE THEORETI...,2025-06-06,Internship,External Corporate Communication,Munich
4,160279,https://www.bmwgroup.jobs//en/jobfinder/job-de...,"\n\n\n\n\n<!DOCTYPE HTML>\n<html lang=""en"">\n<...","Praktikant Social Media, Unternehmenskommunika...",EIN GUTES PRAKTIKUM IST PRAKTISCH NIE THEORETI...,2025-06-06,Internship,Internal Corporate Communication,Munich
...,...,...,...,...,...,...,...,...,...
741,146147,https://www.bmwgroup.jobs//en/jobfinder/job-de...,"\n\n\n\n\n<!DOCTYPE HTML>\n<html lang=""en"">\n<...",Praktikant Digitaler Zwilling für die Anwendun...,EIN GUTES PRAKTIKUM IST PRAKTISCH NIE THEORETI...,2024-12-13,Internship,Advanced Development/Research,Munich
742,135635,https://www.bmwgroup.jobs//en/jobfinder/job-de...,"\n\n\n\n\n<!DOCTYPE HTML>\n<html lang=""en"">\n<...",Praktikant Netzwerk- und Prozessplanung (w/m/x),EIN GUTES PRAKTIKUM IST PRAKTISCH NIE THEORETI...,2024-12-13,Internship,Drive Train,Munich
743,147976,https://www.bmwgroup.jobs//en/jobfinder/job-de...,"\n\n\n\n\n<!DOCTYPE HTML>\n<html lang=""en"">\n<...",Praktikant Förderprojekte (w/m/x),EIN GUTES PRAKTIKUM IST PRAKTISCH NIE THEORETI...,2024-12-11,Internship,Advanced Development/Research,Munich
744,148056,https://www.bmwgroup.jobs//en/jobfinder/job-de...,"\n\n\n\n\n<!DOCTYPE HTML>\n<html lang=""en"">\n<...",Praktikant Digitalisierung &amp; Softwareentwi...,THEORETISCH DIE BESTE ENTSCHEIDUNG. PRAKTISCH ...,2024-12-03,Internship,Drive Train,Steyr


# Loading to database

In [107]:
with sqlite3.connect('../data/jobs.db') as connection:

    connection.executescript(CREATE_TABLE_QUERY)

    already_exists = pd.read_sql('SELECT job_portal_id FROM jobs', connection)

    jobs = jobs[~jobs['job_portal_id'].isin(already_exists['job_portal_id'])]

    jobs.to_sql(
        name = 'jobs',
        con = connection,
        if_exists = 'append',
        index = False,
    )