In [4]:
import time
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.webdriver import WebDriver



In [6]:
def get_page_links(driver: WebDriver, wttj_url: str, links: list):
    """Append the job-offer URLs found on one search-results page to ``links``.

    Args:
        driver: Selenium driver used to load the page.
        wttj_url: URL of the results page to open.
        links: List that is extended in place with every ``href`` found.

    Returns:
        None. Results are delivered by mutating ``links``.

    A timeout while waiting for the result items is reported with a printed
    message and leaves ``links`` untouched for that page.
    """
    driver.get(wttj_url)
    # Fixed pause kept from the original flow; presumably it lets the
    # client-side result list render before polling starts - TODO confirm.
    time.sleep(5)

    try:
        # The wait returns the located elements, so no second lookup is needed.
        job_offers = WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".ais-Hits-list-item"))
        )
    except TimeoutException:
        print(f"Timed out waiting for job offers to load on {wttj_url}")
        return

    for job in job_offers:
        try:
            link = job.find_element(By.TAG_NAME, "a").get_attribute('href')
        except NoSuchElementException:
            print("Failed to find link for one of the job offers.")
            continue
        # get_attribute returns None when the anchor has no href; storing it
        # would later produce an unusable URL.
        if link:
            links.append(link)

In [7]:

# Walk result pages 1 through 11 of the search and gather every offer link.
driver = webdriver.Chrome()

links = []
try:
    for i in range(1, 12):
        wttj_url = f'https://www.welcometothejungle.com/fr/jobs?refinementList%5Boffices.country_code%5D%5B%5D=FR&query=%22data%20engineer%22&page={i}'

        get_page_links(driver, wttj_url, links)
finally:
    # Release the browser and its driver process even if a page fails;
    # nothing later in the notebook reuses this driver.
    driver.quit()

In [11]:
# Uncomment to persist the collected links; a later cell reads 'output/jobs_links.csv' back in.
# pd.DataFrame(links, columns=['link']).to_csv('output/jobs_links.csv', index=False)

In [12]:
class Company:
    """Plain record of the company facts attached to a job offer.

    Every attribute starts as ``None`` and is meant to be filled in afterwards.
    """

    def __init__(self):
        # Fields are created in a fixed order, which is also the order
        # ``vars(instance)`` reports them in.
        for field_name in (
            "name",
            "sector",
            "employees",
            "creation_year",
            "turnover",
            "mean_age",
        ):
            setattr(self, field_name, None)

class JobOffer:
    """Plain record of the fields collected for one job offer.

    Every attribute starts as ``None`` and is meant to be filled in afterwards.
    """

    def __init__(self):
        # Fields are created in a fixed order, which is also the order
        # ``vars(instance)`` reports them in.
        for field_name in (
            "title",
            "company",
            "contract_type",
            "location",
            "salary",
            "remote_type",
            "starting_date",
            "require_experience",
            "education",
            "description",
            "profil_experience",
            "publication_date",
            "url_direct_offer",
        ):
            setattr(self, field_name, None)

class JobScraper:
    """Scrape a single job-offer page using a dedicated Chrome session.

    Constructing an instance starts a new ``webdriver.Chrome()``. Calling
    ``scrape_job_details`` loads the URL, fills ``self.company`` and
    ``self.job_offer``, and always shuts the browser down afterwards.

    NOTE(review): the CSS selectors use hashed-looking class names
    (e.g. ``sc-dQEtJz``); presumably they are generated by the site's build
    and may change - verify when results come back empty.
    """

    def __init__(self, url):
        """Start a Chrome session and prepare empty records for ``url``."""
        self.driver = webdriver.Chrome()
        self.url = url
        self.company = Company()
        self.job_offer = JobOffer()
        self.job_offer.url_direct_offer = url

    @staticmethod
    def _parse_company_facts(texts):
        """Map the ordered company fact texts to ``Company`` attribute values.

        The first three texts are taken positionally (sector, employees,
        creation year); the rest are matched on their French label. Missing
        entries are simply left out of the returned dict.
        """
        facts = dict(zip(("sector", "employees", "creation_year"), texts))
        if "creation_year" in facts:
            facts["creation_year"] = facts["creation_year"].replace('Créée en ', '')

        for information in texts[3:]:
            if "Chiffre d'affaires" in information:
                facts["turnover"] = information.replace("Chiffre d'affaires : ", '').strip()
            elif "Âge moyen" in information:
                facts["mean_age"] = information.replace('Âge moyen :', '').strip()
        return facts

    @staticmethod
    def _parse_job_facts(texts):
        """Map the ordered job fact texts to ``JobOffer`` attribute values.

        The first text is the contract type. The second is treated as the
        location unless it is the salary entry. Every text after the first is
        also matched on its French label.
        """
        facts = {}
        if texts:
            facts["contract_type"] = texts[0]
        if len(texts) > 1:
            facts["location"] = texts[1] if "Salaire" not in texts[1] else None

        for information in texts[1:]:
            if "Salaire" in information:
                facts["salary"] = information.replace('Salaire :\n', '')
            elif "Télétravail" in information:
                facts["remote_type"] = information
            elif "Début" in information:
                facts["starting_date"] = information.replace('Début :', '').strip()
            elif "Expérience" in information:
                facts["require_experience"] = information.replace('Expérience :', '').strip()
            elif "Éducation" in information:
                facts["education"] = information.replace('Éducation :', '').strip()
        return facts

    def scrap_company_info(self):
        """Fill ``self.company`` from the company fact elements and return it.

        If no fact element appears within 5 seconds, a message is printed and
        the record is returned unchanged.
        """
        try:
            company_elements = WebDriverWait(self.driver, 5).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.sc-dQEtJz.kiMwlt"))
            )
        except TimeoutException:
            # The wait raises instead of returning an empty list; convert that
            # so the "not found" branch below is actually reachable.
            company_elements = []

        if company_elements:
            texts = [element.text for element in company_elements]
            for field_name, value in self._parse_company_facts(texts).items():
                setattr(self.company, field_name, value)
        else:
            print("Company informations not found.")

        return self.company

    def get_description(self, section_id):
        """Return the text of the section whose ``data-testid`` is ``section_id``.

        Tries to click the "view more" button first. Returns ``None`` when the
        section is missing or any lookup fails.
        """
        section_selector = f"[data-testid='{section_id}']"
        try:
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, section_selector))
            )

            try:
                view_more_btn = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "[data-testid='view-more-btn']"))
                )
                # Click through JavaScript rather than a native click, as the
                # original flow did.
                self.driver.execute_script("arguments[0].click();", view_more_btn)
            except TimeoutException:
                # No expand button showed up; still read whatever the section
                # displays instead of discarding it.
                pass

            return self.driver.find_element(By.CSS_SELECTOR, section_selector).text
        except Exception:
            # Deliberate best-effort: a missing section must not abort the
            # rest of the offer.
            return None

    def get_publication_date(self):
        """Return the first 10 characters of the ``<time>`` element's ``datetime``.

        Returns ``None`` when the element or the attribute is unavailable.
        """
        try:
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "p.sc-ERObt.cvpaYF"))
            )
            datetime_value = self.driver.find_element(By.TAG_NAME, "time").get_attribute('datetime')
        except Exception:
            # Best-effort like the original, but without swallowing
            # KeyboardInterrupt/SystemExit as a bare ``except`` would.
            return None

        if datetime_value is None:
            return None
        # Presumably the attribute is an ISO timestamp, making this the
        # YYYY-MM-DD part - TODO confirm.
        return datetime_value[:10]

    def scrap_job_offer_info(self):
        """Fill ``self.job_offer`` from the current page and return it.

        The title and company-name lookups raise if their elements are absent.
        If no job fact element appears within 5 seconds, a message is printed
        and the remaining fields stay ``None``.
        """
        self.job_offer.publication_date = self.get_publication_date()

        self.job_offer.title = self.driver.find_element(By.CSS_SELECTOR, "h2.sc-ERObt.fMYXdq.wui-text").text

        self.company.name = self.driver.find_element(By.CSS_SELECTOR, "span.sc-ERObt.kkLHbJ.wui-text").text

        # vars() exposes the company's live attribute dict, not a copy.
        self.job_offer.company = vars(self.company)

        try:
            job_elements = WebDriverWait(self.driver, 5).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.sc-dQEtJz.iIerXh"))
            )
        except TimeoutException:
            job_elements = []

        if job_elements:
            # Read every text before the description clicks below, so the
            # elements are no longer touched once the page may have changed.
            texts = [element.text for element in job_elements]
            for field_name, value in self._parse_job_facts(texts).items():
                setattr(self.job_offer, field_name, value)

            self.job_offer.description = self.get_description('job-section-description')
            self.job_offer.profil_experience = self.get_description('job-section-experience')
        else:
            print("Job informations not found.")

        return self.job_offer

    def driver_get(self):
        """Load ``self.url`` in the browser without scraping anything."""
        self.driver.get(self.url)

    def scrape_job_details(self):
        """Load the page, scrape company and offer data, and close the browser.

        Any error, including a failed page load, is printed and the partially
        filled offer is returned. The driver is shut down in every case.
        """
        try:
            self.driver.get(self.url)
            self.scrap_company_info()
            self.scrap_job_offer_info()
        except Exception as e:
            print(f"Error occurred: {e}")
        finally:
            # quit() ends the whole session; close() would only close the
            # current window.
            self.driver.quit()

        return self.job_offer

In [13]:
# Read the offer links back from the CSV and keep them as a plain Python list.
links = pd.read_csv('output/jobs_links.csv').loc[:, 'link'].tolist()

In [15]:

def scrape_job(link):
    """Scrape one offer URL and return its fields as a plain dict.

    A fresh ``JobScraper`` is created for every link; the dict is the
    attribute mapping of the ``JobOffer`` it returns.
    """
    return vars(JobScraper(link).scrape_job_details())

# Spread the links over four joblib workers. tqdm wraps the task generator, so
# the bar follows tasks being handed to joblib.
job_offers = Parallel(n_jobs=4)(
    delayed(scrape_job)(offer_url)
    for offer_url in tqdm(links, desc="Scraping job offers")
)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Scraping job offers: 100%|██████████| 330/330 [10:33<00:00,  1.92s/it]


In [22]:
# Tabulate the scraped offers, tag where they came from, and write them to disk.
df_jobs = pd.DataFrame(job_offers).assign(source="wttj")
df_jobs.to_csv(
    'output/job_offers_wttj.csv',
    index=False,
    encoding='utf-8-sig',
)

In [23]:
# Show the assembled offers table as this cell's output.
df_jobs

Unnamed: 0,title,company,contract_type,location,salary,remote_type,debut_date,require_experience,education,description,profil_experience,publication_date,url_direct_offer,source
0,Consultant.e Data Engineer Expérimenté.e,"{'name': 'THE INFORMATION LAB', 'sector': 'Log...",CDI,Paris,45K à 60K €,Télétravail fréquent,,,,Descriptif du poste\nQui sommes-nous ?\nVérita...,,2024-01-17,https://www.welcometothejungle.com/fr/companie...,wttj
1,Data Engineer,"{'name': 'LESAFFRE', 'sector': 'Pharmaceutique...",CDI,Marcq En Baroeul,Non spécifié,Télétravail occasionnel,,> 5 ans,,Descriptif du poste\nLesaffre Digital & Data a...,Profil recherché\nThere is no such thing as a ...,2024-02-23,https://www.welcometothejungle.com/fr/companie...,wttj
2,Data Engineer - Transport,"{'name': 'MP DATA', 'sector': 'Intelligence ar...",CDI,Boulogne-Billancourt,Non spécifié,Télétravail fréquent,05 février 2024,,,Descriptif du poste\nL’environnement data vous...,Profil recherché\nIngénieur d’une grande école...,2024-02-23,https://www.welcometothejungle.com/fr/companie...,wttj
3,Data Engineer Spark Scala H/F,"{'name': 'SKIILS', 'sector': 'Intelligence art...",CDI,,Non spécifié,Télétravail fréquent,,,,Descriptif du poste\n🪄Tu as le pouvoir de crée...,"Profil recherché\nEn 1er, un savoir être qui c...",2024-02-23,https://www.welcometothejungle.com/fr/companie...,wttj
4,"Data Engineer Nearshore | Spark, Scala, Kafka H/F","{'name': 'SKIILS', 'sector': 'Intelligence art...",Freelance,,200 à 300 € par jour,Télétravail total,23 février 2024,> 2 ans,Bac +5 / Master,Descriptif du poste\nTu as le pouvoir de créer...,"Profil recherché\nProfil recherché\nEn 1er, un...",2024-02-23,https://www.welcometothejungle.com/fr/companie...,wttj
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,Data Product Owner H/F - CDI,"{'name': 'SIAXPERIENCE', 'sector': 'Design, Or...",CDI,Paris,Non spécifié,Télétravail fréquent,,,Bac +5 / Master,Descriptif du poste\nDescription de l’entrepri...,Profil recherché\nQualifications :\nDiplômé(e)...,2024-01-04,https://www.welcometothejungle.com/fr/companie...,wttj
326,Talent Acquisition Manager - Tech & Produit,"{'name': 'INDY', 'sector': 'FinTech / InsurTec...",CDI,,45K à 53K €,Télétravail fréquent,04 mars 2024,> 1 an,,Descriptif du poste\nNotre mission dans l’équi...,Profil recherché\nLe·a candidat·e idéal·e pour...,2024-02-19,https://www.welcometothejungle.com/fr/companie...,wttj
327,Architecte Cloud Azure H/F,"{'name': 'CAPGEMINI', 'sector': 'IT / Digital,...",CDI,Paris,Non spécifié,Télétravail non autorisé,,> 7 ans,Bac +5 / Master,Descriptif du poste\nDescription de l’entrepri...,Profil recherché\nDescription du profil :\n\nP...,2024-02-17,https://www.welcometothejungle.com/fr/companie...,wttj
328,Talent Acquisition Specialist,"{'name': 'STRANGEBEE', 'sector': 'Cybersécurit...",Temps partiel,Paris,Non spécifié,Télétravail fréquent,19 février 2024,,,Descriptif du poste\nDans le cadre de notre cr...,Profil recherché\nDe formation supérieure (bac...,2024-02-12,https://www.welcometothejungle.com/fr/companie...,wttj


In [None]:
# Reload the offers table from the CSV written earlier in the notebook.
# NOTE(review): the `company` column held dicts when it was written; after the
# CSV round-trip it presumably comes back as plain strings - confirm before parsing it.
df_jobs = pd.read_csv('output/job_offers_wttj.csv')
df_jobs