In [None]:
import ast

from pathlib import Path

import requests
import pandas as pd

from jinja2 import Template
from bs4 import BeautifulSoup

from sqlalchemy import create_engine

from settings import BASE_DIR, GITHUB_TOKEN

# Extracting jobs

In [None]:
# Query-string fragment appended to every job finder request URL.
FILTER = "&filterSearch=jobType_INTERNSHIP"

In [None]:
# Request one block of 10 rows from the job finder table endpoint, with
# FILTER appended to the query string.
response = requests.get(
    url=''.join(
        [
            'https://www.bmwgroup.jobs/',
            'en/_jcr_content/main/layoutcontainer_5337/jobfinder30.jobfinder_table.content.html?',
            'rowIndex=1',
            '&blockCount=10',
            FILTER,
        ]
    )
)

In [None]:
# Decode the raw response body as UTF-8 text.
html = str(response.content, encoding='utf-8')

In [None]:
# Parse the fetched markup with Python's built-in HTML parser.
soup = BeautifulSoup(html, features='html.parser')

In [None]:
# Read the 'data-counter' attribute of the job finder table element.
# The value is kept exactly as returned by .get() (no int conversion here).
results_table = soup.find('div', class_='grp-jobfinder__table')
count = results_table.get('data-counter')

In [None]:
# Page through the job finder table and collect one record per listed job.
page_size = 10

# Ceiling division: a final, partially filled page must be requested too.
# Rounding down (int(count / 10)) silently drops up to 9 jobs, and fetches
# nothing at all when fewer than 10 jobs match.
page_count = (int(count) + page_size - 1) // page_size

jobs = list()
for req in range(page_count):
    # NOTE(review): paging starts at rowIndex=0 here, while the initial
    # counter request uses rowIndex=1 -- confirm whether the endpoint is
    # 0- or 1-based.
    response = requests.get(
        url = (
            'https://www.bmwgroup.jobs/'
            'en/_jcr_content/main/layoutcontainer_5337/jobfinder30.jobfinder_table.content.html?'
            f'rowIndex={req * page_size}'
            f'&blockCount={page_size}' +
            FILTER
        )
    )

    html = response.content.decode('utf-8')

    soup = BeautifulSoup(html, 'html.parser')

    for job in soup.find_all('div', class_='grp-jobfinder__wrapper'):
        # Look up the anchor and its reference cell once per job instead of
        # re-searching the subtree for every field.
        anchor = job.find('a')
        ref_cell = anchor.find('div', class_='grp-jobfinder-cell-refno')

        jobs.append(
            {
                "title": anchor.get('aria-label'),
                "link": 'https://www.bmwgroup.jobs' + anchor.get('href'),
                "field": ref_cell.get('data-job-field'),
                "entity": ref_cell.get('data-job-legal-entity'),
                "city": ref_cell.get('data-job-location'),
                "type": ref_cell.get('data-job-type')
            }
        )

# One row per job, with columns title, link, field, entity, city, type.
jobs = pd.DataFrame(jobs)

# Extracting descriptions

In [None]:
# Download the page behind every job link and keep its UTF-8 decoded HTML
# next to the link it was fetched from (one HTTP request per row of `jobs`).
descriptions = pd.DataFrame(
    [
        {
            "link": jobs.loc[row, 'link'],
            "html": requests.get(jobs.loc[row, 'link']).content.decode('utf-8'),
        }
        for row in jobs.index
    ]
)

In [None]:
# Remove newlines, tabs, carriage returns and double spaces from the raw
# HTML (in that order), then trim leading/trailing whitespace.
cleaned_html = descriptions['html']
for token in ('\n', '\t', '\r', '  '):
    cleaned_html = cleaned_html.str.replace(token, '')

descriptions['html'] = cleaned_html.str.strip()

# Transforming data

In [None]:
# Left-join the downloaded pages onto the job listings using the 'link'
# column, so every job row is kept.
job_descriptions = pd.merge(jobs, descriptions, how='left', on='link')

# Both source frames are removed from the namespace once merged.
del jobs, descriptions

In [None]:
def _extract_description(html):
    """Return the text of the description container found in ``html``.

    Returns None when ``html`` is not a string or when the page has no
    ``div`` carrying the expected class string, so that one unexpected page
    does not abort the extraction for every other job.
    """
    if not isinstance(html, str):
        return None

    container = BeautifulSoup(html, 'html.parser').find(
        'div',
        class_='container-layout container no-top-spacing no-bottom-spacing'
    )
    # find() yields None when nothing matches; calling .text on it would raise.
    return container.text if container is not None else None


# Append a 'description' column (as the last column) holding the extracted text.
job_descriptions.insert(
    loc = len(job_descriptions.columns),
    column = 'description',
    value = job_descriptions['html'].apply(_extract_description)
)

In [None]:
# The raw HTML is no longer needed once the description text is extracted.
job_descriptions = job_descriptions.drop(columns=['html'])

# Saving to Excel and loading to database

In [None]:
# Write the final table to an Excel workbook, without the index column.
# The path is relative to the current working directory.
job_descriptions.to_excel(
    excel_writer="../data/bmw_jobs.xlsx",
    index=False,
)

In [None]:
# SQLite database file located under BASE_DIR/data.
engine = create_engine(f"sqlite:///{BASE_DIR}/data/bmw_jobs.db")

# Write the table without its index, replacing any existing
# "job_descriptions" table.
job_descriptions.to_sql(
    name="job_descriptions",
    con=engine,
    if_exists="replace",
    index=False,
)