In [None]:
import re
import sqlite3
from contextlib import closing
from datetime import date
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup, Tag

from types_ import JobFilter
from utils import get_job_description

In [None]:
# SQL script read from disk once; the loading cell runs it with executescript()
# before touching the `jobs` table.
CREATE_TABLE_QUERY = Path('../sql/tables.sql').read_text()

# Selects the portal ids of stored rows whose link starts with the Ferrari
# careers site prefix, so rows already loaded can be filtered out.
FILTER_QUERY = (
    "SELECT job_portal_id FROM jobs "
    "WHERE job_link LIKE 'https://jobs.ferrari.com/%'"
)

In [None]:
def get_job_listing(filters: JobFilter | None = None) -> list[Tag]:
    """Fetch the job listings from the Ferrari careers search page.

    Args:
        filters (JobFilter | None): Accepted for interface compatibility but
            currently ignored; the search page is always requested unfiltered.

    Returns:
        list[Tag]: The ``<li>`` elements found inside the
            ``<ul class="container job-list">`` element of the page.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an error status code.
        ValueError: If the page does not contain the job list element.
    """

    # Without a timeout a stalled server would hang the notebook forever.
    response = requests.get('https://jobs.ferrari.com/search/', timeout = 30)
    # Fail on 4xx/5xx here instead of trying to parse an error page.
    response.raise_for_status()

    html = response.content.decode('utf-8')

    job_list = BeautifulSoup(html, 'html.parser').find('ul', class_ = 'container job-list')
    if job_list is None:
        raise ValueError(
            "Could not find the 'container job-list' <ul> element in the search page."
        )

    return job_list.find_all('li')

In [None]:
def get_job_portal_id(tag: Tag) -> int:
    """Extracts the numeric job portal ID from a job listing element.

    Args:
        tag (Tag): A BeautifulSoup Tag object containing the job listing.

    Returns:
        int: The first run of digits found in the tag's ``data-focus-tile``
            attribute.

    Raises:
        ValueError: If the attribute is missing or contains no digits.
    """

    attribute = tag.get('data-focus-tile')
    match = re.search(r'\d+', str(attribute))

    if match is None:
        # Report the offending value instead of failing on `None.group()`.
        raise ValueError(
            f"No job portal ID found in 'data-focus-tile' attribute: {attribute!r}"
        )

    return int(match.group())

In [None]:
def get_job_posting_date(tag: Tag) -> date:
    """Reads the posting date shown on a job listing.
    Args:
        tag (Tag): A BeautifulSoup Tag object containing the job information.
    Returns:
        date: The parsed content of the ``section-field date`` div,
            as a datetime.date object."""

    date_cell = tag.find('div', class_ = 'section-field date fontcolorb6a533a1')

    # Drop the literal 'Date' text (presumably the field label) and the
    # whitespace around what remains.
    raw_date = date_cell.text.strip().replace('Date', '').strip()

    posted_at = pd.to_datetime(raw_date)
    return posted_at.date()

In [None]:
def get_job_field(tag: Tag) -> str | None:
    """Extracts the job field from the job information section.
    Args:
        tag (Tag): A BeautifulSoup Tag object containing the job information.
    Returns:
        str: The job field as a string."""

    try:
        field = tag.find('div', class_ = 'section-field dept fontcolorb6a533a1').text
        field = str(field).strip().replace('Department', '').strip()

    except AttributeError:
        field = None

    return field

In [None]:
def get_job_city(tag: Tag) -> str | None:
    """Extracts the job city from the job information section.
    Args:
        tag (Tag): A BeautifulSoup Tag object containing the job information.
    Returns:
        str: The job city as a string."""

    try:
        city = tag.find('div', class_ = 'section-field location fontcolorb6a533a1').text
        city = str(city).strip().replace('Location', '').strip()

    except AttributeError:
        city = None

    return city

In [None]:
def get_job_title(tag: Tag) -> str | None:
    """Extracts the job title from the job listing.
    Args:
        tag (Tag): A BeautifulSoup Tag object containing the job listing.
    Returns:
        str: The job title as a string."""

    try:
        title = tag.find('a', class_ = 'jobTitle-link fontcolorb6a533a1').text.strip()

    except AttributeError:
        title = None


    return title


## Extracting jobs data

In [None]:
# Collect one record per listed job, then build the frame in a single step.
job_records = []
for job in get_job_listing():

    # NOTE(review): assumes every <li> carries a `data-url` attribute; if it is
    # missing, `job.get` returns None and this concatenation raises TypeError.
    link = 'https://jobs.ferrari.com/search' + job.get('data-url')

    job_records.append(
        {
            "job_portal_id":    get_job_portal_id(job),
            "job_link":         link,
            "job_title":        get_job_title(job),
            "job_description":  get_job_description(link),
            "job_posting_date": get_job_posting_date(job),
            "job_type":         None,
            "job_field":        get_job_field(job),
            "job_city":         get_job_city(job),
        }
    )

# Passing the columns explicitly keeps them present even when no job was
# listed; otherwise the frame would have no columns and the later
# `jobs['job_portal_id']` lookup would raise KeyError.
jobs = pd.DataFrame(
    job_records,
    columns = [
        "job_portal_id",
        "job_link",
        "job_title",
        "job_description",
        "job_posting_date",
        "job_type",
        "job_field",
        "job_city",
    ],
)
display(jobs)

## Loading jobs into the database

In [None]:
# sqlite3's own connection context manager only commits (or rolls back) the
# transaction; it does not close the connection. closing() guarantees the
# connection is released, while the inner `connection` context keeps the
# commit/rollback behavior.
with closing(sqlite3.connect('../data/jobs.db')) as connection, connection:

    # Run the SQL script loaded from tables.sql (presumably creating the
    # `jobs` table when it does not exist yet) before querying it.
    connection.executescript(CREATE_TABLE_QUERY)

    # Portal ids of the Ferrari postings that are already stored.
    already_exists = pd.read_sql(FILTER_QUERY, connection)
    already_exists = already_exists['job_portal_id']

    # Keep only the postings that are not in the database yet.
    jobs = jobs[~jobs['job_portal_id'].isin(already_exists)]

    jobs.to_sql(
        name = 'jobs',
        con = connection,
        if_exists = 'append',
        index = False,
    )