In [4]:
import torch
import requests
import transformers
import pandas as pd
import huggingface_hub as hh

from bs4 import BeautifulSoup

from settings import BASE_DIR, HUGGINGFACE_TOKEN

# Step 1: Extracting all jobs

In [5]:
# Query-string fragment appended to every job-finder request URL below.
# NOTE(review): presumably restricts results to internship postings — confirm
# against the site's filter parameters.
FILTER = '&filterSearch=jobType_INTERNSHIP'

In [6]:
# Fetch one page of job-finder results; the following cells read the
# result counter from this response.
response = requests.get(
    url = (
        'https://www.bmwgroup.jobs/'
        'en/_jcr_content/main/layoutcontainer_5337/jobfinder30.jobfinder_table.content.html?'
        'rowIndex=1'
        '&blockCount=10' +
        FILTER
    ),
    # Without a timeout, requests waits indefinitely on a stalled connection.
    timeout = 30
)
# Stop here on an HTTP error instead of parsing an error page further down.
response.raise_for_status()

In [7]:
# Turn the raw response bytes into text, decoding them as UTF-8.
html = str(response.content, encoding = 'utf-8')

In [8]:
# Parse the fetched markup with the 'html.parser' backend.
soup = BeautifulSoup(markup = html, features = 'html.parser')

In [9]:
# Locate the results table and read its 'data-counter' attribute value.
# NOTE(review): presumably the total number of matching postings — the
# pagination loop below relies on that interpretation.
results_table = soup.find('div', class_ = 'grp-jobfinder__table')
count = results_table.get('data-counter')

In [10]:
# Display the value read from the 'data-counter' attribute.
count

'925'

In [11]:
# Walk the paginated job-finder table and collect one record per posting.
PAGE_SIZE = 10  # rows requested per page (the blockCount query parameter)
JOB_COLUMNS = ['title', 'link', 'field', 'entity', 'city', 'type']

job_records = list()
# Step through the offsets 0, 10, 20, ... up to the counter value. The
# previous int(count / 10) page count rounded down, so the final, partially
# filled page was never requested.
for row_index in range(0, int(count), PAGE_SIZE):
    response = requests.get(
        url = (
            'https://www.bmwgroup.jobs/'
            'en/_jcr_content/main/layoutcontainer_5337/jobfinder30.jobfinder_table.content.html?'
            f'rowIndex={row_index}'
            f'&blockCount={PAGE_SIZE}' +
            FILTER
        ),
        # Without a timeout, requests waits indefinitely on a stalled connection.
        timeout = 30
    )
    # A failed page would otherwise yield zero wrappers and silently drop jobs.
    response.raise_for_status()

    html = response.content.decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')

    for job in soup.find_all('div', class_='grp-jobfinder__wrapper'):
        # Look the anchor and its metadata div up once instead of per field.
        anchor = job.find('a')
        details = anchor.find('div', class_='grp-jobfinder-cell-refno')
        job_records.append(
            {
                "title": anchor.get('aria-label'),
                "link": 'https://www.bmwgroup.jobs' + anchor.get('href'),
                "field": details.get('data-job-field'),
                "entity": details.get('data-job-legal-entity'),
                "city": details.get('data-job-location'),
                "type": details.get('data-job-type')
            }
        )

# Explicit columns keep the frame's schema stable even when no jobs were found.
jobs = pd.DataFrame(job_records, columns = JOB_COLUMNS)

# Step 2: Extracting all job descriptions

In [12]:
# Download the detail page of every job and keep its HTML keyed by link.
description_records = list()
for link in jobs['link']:
    # Fetch this row's own link; the previous code always requested the link
    # of row 0, so every description was a copy of the first job's page.
    # The timeout prevents a single stalled download from hanging the run.
    page = requests.get(link, timeout = 30)
    # HTTP error responses are not raised here: their body is stored as-is.
    description_records.append({
        "link": link,
        "html": page.content.decode('utf-8')
    })

# Explicit columns keep 'link' and 'html' present even when there are no jobs.
descriptions = pd.DataFrame(description_records, columns = ['link', 'html'])

In [13]:
# Remove layout whitespace from the downloaded HTML: newlines, tabs,
# carriage returns and double spaces are deleted in that order, then the
# leading and trailing whitespace of each document is trimmed.
html_compact = descriptions['html']
for token in ('\n', '\t', '\r', '  '):
    html_compact = html_compact.str.replace(token, '')
descriptions['html'] = html_compact.str.strip()

In [14]:
# Attach each job's HTML to its metadata row. The left join on 'link'
# keeps every row of `jobs`, including those without a matching description.
job_descriptions = pd.merge(jobs, descriptions, how = 'left', on = 'link')

# Drop the two intermediate frames; only the merged table is used from here on.
del jobs, descriptions

In [16]:
# Write the merged table to an Excel workbook, omitting the index column.
# NOTE(review): Excel caps cell text at 32,767 characters; full-page HTML may
# exceed that — confirm the export keeps the complete 'html' values.
output_path = BASE_DIR / 'data/bmw_jobs.xlsx'
job_descriptions.to_excel(output_path, index = False)