In [None]:
import sqlite3
from pathlib import Path
import requests
import pandas as pd
from bs4 import BeautifulSoup, Tag
from settings import BASE_DIR
from types_ import JobFilter

In [None]:
# Facet values sent to the job finder through the `filterSearch` query parameter
# (presumably: internships only, posted within the last 7 days - confirm against the site).
FILTER: JobFilter = [
    'jobType_INTERNSHIP',
    'postingDate_7',
]

# SQL script executed against the database before loading; read once up front.
# NOTE: the path is relative to the notebook's working directory.
CREATE_TABLE_QUERY = Path('../sql/tables.sql').read_text()

# Extracting job list

In [None]:
def job_listing(
    row_index: int = 1,
    filters: JobFilter | None = None,
    timeout: float = 30,
) -> BeautifulSoup:
    """Fetch one block of the BMW Group job finder listing and parse it.

    Parameters
    ----------
    row_index
        Value sent as the ``rowIndex`` query parameter (presumably the offset
        of the first listing in the requested block of 10 - confirm).
    filters
        Optional facet values; when given they are comma-joined into the
        ``filterSearch`` query parameter. ``None`` or an empty list sends no
        filter at all.
    timeout
        Seconds to wait for the server before giving up.

    Returns
    -------
    BeautifulSoup
        The response body decoded as UTF-8 and parsed with ``html.parser``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """

    # Keep the argument untouched; build the query fragment under its own name.
    filter_query = f'&filterSearch={",".join(filters)}' if filters else ''

    response = requests.get(
        url = (
            'https://www.bmwgroup.jobs/'
            'en/_jcr_content/main/layoutcontainer_5337/jobfinder30.jobfinder_table.content.html'
            f'?rowIndex={row_index}'
            '&blockCount=10'
            + filter_query
        ),
        # Without a timeout requests waits indefinitely on a stalled connection.
        timeout = timeout,
    )

    # Fail loudly instead of parsing an error page as if it were a listing.
    response.raise_for_status()

    html = response.content.decode('utf-8')

    return BeautifulSoup(html, 'html.parser')

## Finding the count of available jobs

In [None]:
# Query with the same filters the scraping loop uses, so the counter read from
# this page describes the listing that is actually paginated.
response = job_listing(filters = FILTER)

In [None]:
# The listing container exposes a `data-counter` attribute
# (presumably the total number of matching jobs - confirm against the page).
jobfinder_table = response.find('div', class_ = 'grp-jobfinder__table')
count = jobfinder_table.get('data-counter')

In [None]:
# Ceiling division by the block size (10 matches `blockCount` in job_listing):
# a final, partially filled block still needs its own request. Floor division
# would drop it and yield 0 pages whenever fewer than 10 jobs are available.
quantity_of_pages = (int(count) + 9) // 10

## Extracting relevant job data

In [None]:
def get_job_page(link: str, timeout: float = 30) -> str:
    """Download a job posting page and return its HTML source.

    Parameters
    ----------
    link
        Absolute URL of the job posting.
    timeout
        Seconds to wait for the server before giving up.

    Returns
    -------
    str
        The response body decoded as UTF-8.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """

    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(link, timeout = timeout)

    # Surface HTTP errors here rather than handing an error page to the parser.
    response.raise_for_status()

    return response.content.decode('utf-8')

In [None]:
Html = str

def get_job_description(job_page: Html) -> str:
    """Extract the plain-text description from a job page.

    The description is the text of the ``div`` whose class attribute is
    ``container-layout container no-top-spacing no-bottom-spacing``. Text nodes
    are joined with a single space and every run of whitespace (newlines, tabs,
    repeated spaces) is collapsed to one space.

    Parameters
    ----------
    job_page
        HTML source of the job posting.

    Returns
    -------
    str
        The whitespace-normalised description text.

    Raises
    ------
    ValueError
        If the page contains no description container.
    """

    # Parse the untouched markup: deleting newlines / double spaces from raw
    # HTML can fuse attributes of multi-line tags and glue neighbouring words.
    container = (
        BeautifulSoup(job_page, 'html.parser')
        .find('div', class_ = 'container-layout container no-top-spacing no-bottom-spacing')
    )

    if container is None:
        raise ValueError('job description container not found in job page')

    # Normalise whitespace on the extracted text, where it cannot damage markup.
    return ' '.join(container.get_text(separator = ' ').split())

In [None]:
# Column order of the resulting frame; passing it explicitly keeps the columns
# present even when no job was scraped.
JOB_COLUMNS = [
    'job_portal_id',
    'job_link',
    'job_page_source',
    'job_title',
    'job_description',
    'job_posting_date',
    'job_type',
    'job_field',
    'job_city',
]

job_records = []
job_listed: Tag  # type hint only, for editor support inside the loop

# Walk every result page (the former unconditional `break` after the first
# page looked like a debugging leftover and limited the run to one page).
for page in range(quantity_of_pages):
    bmw_job_listing = job_listing(page * 10, FILTER).find_all('div', class_ = 'grp-jobfinder__wrapper')

    # An empty block means the listing is exhausted; skip the remaining requests.
    if not bmw_job_listing:
        break

    for job_listed in bmw_job_listing:
        # Look these elements up once instead of re-searching for every field.
        anchor = job_listed.find('a')
        ref_cell = anchor.find('div', class_ = 'grp-jobfinder-cell-refno')

        link = 'https://www.bmwgroup.jobs/' + anchor.get('href')
        html = get_job_page(link)
        description = get_job_description(html)

        job_records.append(
            {
                "job_portal_id":    job_listed.get('data-job-id'),
                "job_link":         link,
                "job_page_source":  html,
                "job_title":        anchor.get('aria-label'),
                "job_description":  description,
                "job_posting_date": None,
                "job_type":         ref_cell.get('data-job-type'),
                "job_field":        ref_cell.get('data-job-field'),
                "job_city":         ref_cell.get('data-job-location'),
            }
        )

jobs = pd.DataFrame(job_records, columns = JOB_COLUMNS)

# Loading to database

In [None]:
# sqlite3's connection context manager only commits or rolls back the
# transaction; it does not close the connection, so close it explicitly.
connection = sqlite3.connect('../data/jobs.db')
try:
    with connection:

        # Run the table-definition script first so the `jobs` table can be queried.
        connection.executescript(CREATE_TABLE_QUERY)

        already_exists = pd.read_sql('SELECT job_portal_id FROM jobs', connection)

        # Keep only postings whose portal id is not stored yet.
        # NOTE(review): `isin` compares values exactly - confirm the column type
        # declared in tables.sql matches the scraped ids.
        jobs = jobs[~jobs['job_portal_id'].isin(already_exists['job_portal_id'])]

        jobs.to_sql(
            name = 'jobs',
            con = connection,
            if_exists = 'append',
            index = False,
        )
finally:
    connection.close()