In [1]:
import time
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.webdriver import WebDriver

# Setup WebDriver
# service = Service(ChromeDriverManager().install())


In [3]:
def get_page_links(driver: WebDriver, wttj_url: str, links: list):
    driver.get(wttj_url)
    time.sleep(5)
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".ais-Hits-list-item"))
        )
        job_offers = driver.find_elements(By.CSS_SELECTOR, ".ais-Hits-list-item")

        for job in job_offers:
            try:
                link = job.find_element(By.TAG_NAME, "a").get_attribute('href')
                links.append(link)
            except NoSuchElementException:
                print("Failed to find link for one of the job offers.")
        
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".ais-Hits-list-item"))
        )
        
  
    except TimeoutException:
        print(f"Timed out waiting for job offers to load on {wttj_url}")

In [None]:
# path = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome()

links = []
for i in range (1, 12):
    wttj_url = f'https://www.welcometothejungle.com/fr/jobs?refinementList%5Boffices.country_code%5D%5B%5D=FR&query=%22data%20engineer%22&page={i}'

    get_page_links(driver, wttj_url, links)
    

# driver.quit()

In [4]:
class Company:
    def __init__(self):
        self.name = None
        self.sector = None
        self.employees = None
        self.creation_year = None
        self.turnover = None
        self.mean_age = None

class JobOffer:
    def __init__(self):
        self.title = None
        self.company = None
        self.contract_type = None
        self.location = None
        self.salary = None
        self.remote_type = None
        self.debut_date = None
        self.require_experience = None
        self.education = None
        self.description = None
        self.profil_experience = None

class JobScraper:
    def __init__(self, url):
        self.driver = webdriver.Chrome()
        self.url = url
        self.company = Company()
        self.job_offer = JobOffer()
        
    def scrap_company_info(self):
        WebDriverWait(self.driver, 5).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.sc-dQEtJz.kiMwlt"))
        )

        company_elements = self.driver.find_elements(By.CSS_SELECTOR, "div.sc-dQEtJz.kiMwlt")

        if company_elements:
            self.company.sector = company_elements[0].text
            self.company.employees = company_elements[1].text
            self.company.creation_year = company_elements[2].text.replace('Créée en ', '')
            
            
            for element in company_elements[3:]:
                information = element.text
                if "Chiffre d'affaires" in information:
                    self.company.turnover = information.replace("Chiffre d'affaires : ", '').strip()
                elif "Âge moyen" in information:
                    self.company.mean_age = information.replace('Âge moyen :', '').strip()
        else:
            print("Company informations not found.")
        
        return self.company
    
    def get_description(self, section_id):
        try:
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, f"[data-testid={section_id}]"))
            )
            
            view_more_btn = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "[data-testid='view-more-btn']"))
            )
            
            self.driver.execute_script("arguments[0].click();", view_more_btn)
        
            
            description = self.driver.find_element(By.CSS_SELECTOR, f"[data-testid='{section_id}']").text
            
            return description
        except Exception as e:
            return None        

    
    def scrap_job_offer_info(self):
        self.job_offer.title = self.driver.find_element(By.CSS_SELECTOR, "h2.sc-ERObt.fMYXdq.wui-text").text
            
        self.company.name = self.driver.find_element(By.CSS_SELECTOR, "span.sc-ERObt.kkLHbJ.wui-text").text
        
        self.job_offer.company = vars(self.company)
        
        WebDriverWait(self.driver, 5).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.sc-dQEtJz.iIerXh"))
        )
        
        
        job_elements = self.driver.find_elements(By.CSS_SELECTOR, "div.sc-dQEtJz.iIerXh")
        
        if job_elements:
                
            self.job_offer.contract_type = job_elements[0].text
            self.job_offer.location = job_elements[1].text
            self.job_offer.salary = job_elements[2].text.replace('Salaire :\n', '')
            self.job_offer.description = self.get_description('job-section-description')
            self.job_offer.profil_experience = self.get_description('job-section-experience')
            
            for element in job_elements[3:]:
                information = element.text
                if "Télétravail" in information:
                    self.job_offer.remote_type = information
                elif "Début" in information:
                    self.job_offer.debut_date = information.replace('Début :', '').strip()
                elif "Expérience" in information:
                    self.job_offer.require_experience = information.replace('Expérience :', '').strip()
                elif "Éducation" in information:
                    self.job_offer.education = information.replace('Éducation :', '').strip()
            
        else:
            print("Job informations not found.")
            
        return self.job_offer
            
    def driver_get(self):
        self.driver.get(self.url)
    
    def scrape_job_details(self):
        self.driver.get(self.url)
        
        try:
            self.scrap_company_info()
            self.scrap_job_offer_info()
            
        except Exception as e:
            print(f"Error occurred: {e}")
        finally:
            self.driver.close()

        return self.job_offer

In [6]:
links = pd.read_csv('jobs_links.csv')['link'].to_list()

In [7]:

def scrape_job(link):
    scraper = JobScraper(link)
    job_details = scraper.scrape_job_details()
    return vars(job_details)

job_offers = Parallel(n_jobs=4)(delayed(scrape_job)(link) for link in tqdm(links, desc="Scraping job offers"))

Scraping job offers: 100%|██████████| 200/200 [10:44<00:00,  3.22s/it]


In [None]:
df_jobs = pd.DataFrame(job_offers)
# df_jobs.to_csv('job_offers_wttj.csv', index=False)

In [2]:
df_jobs = pd.read_csv('output/job_offers_wttj.csv')
df_jobs

Unnamed: 0,title,company,contract_type,location,salary,remote_type,debut_date,require_experience,education,description,profil_experience
0,Consultant.e Data Engineer Expérimenté.e,"{'name': 'THE INFORMATION LAB', 'sector': 'Log...",CDI,Paris,45K à 60K €,Télétravail fréquent,,,,Descriptif du poste\nQui sommes-nous ?\nVérita...,
1,Alternance (Bac +5) - Data Engineer (F/H/X),"{'name': 'IADVIZE', 'sector': 'IT / Digital, É...",Alternance,Nantes,Non spécifié,Télétravail fréquent,,> 6 mois,,Descriptif du poste\nVous rejoindrez l'équipe ...,Profil recherché\nProfil recherché\nEn dernièr...
2,Margo Analytics - Data Engineer - H/F,"{'name': 'MARGO', 'sector': 'Logiciels, IT / D...",CDI,Paris,Non spécifié,Télétravail fréquent,,,Bac +5 / Master,Descriptif du poste\nMargo Analytics est l'ent...,
3,Data Engineer Junior H/F,"{'name': 'MEILLEURTAUX', 'sector': 'FinTech / ...",CDI,Paris,Non spécifié,Télétravail non autorisé,,,,Descriptif du poste\nVous souhaitez rejoindre ...,Profil recherché\nPourquoi êtes-vous notre TOP...
4,Data Engineer,"{'name': 'ADVANCED SCHEMA', 'sector': 'IT / Di...",CDI,Paris,Non spécifié,Télétravail non autorisé,,> 1 an,Bac +3,,
...,...,...,...,...,...,...,...,...,...,...,...
307,Data Engineer Senior - CDI - Paris ou Caen,"{'name': 'JAKALA', 'sector': 'Digital Marketin...",CDI,Salaire :\nNon spécifié,Télétravail fréquent,,,> 5 ans,Bac +5 / Master,Descriptif du poste\nAu sein de notre Data Lab...,Profil recherché\nDiplômé·e d’études supérieur...
308,Data Architect / Data Engineer - CDI - Paris,"{'name': 'JAKALA', 'sector': 'Digital Marketin...",CDI,Paris,Non spécifié,Télétravail fréquent,,> 5 ans,Bac +5 / Master,Descriptif du poste\nSOYHUCE est à la recherch...,"Profil recherché\nDiplôme Bac + 5, ingénieur o..."
309,Data Engineer GCP (H/F),"{'name': 'THALES', 'sector': 'Logiciels, Cyber...",CDI,Bordeaux,Non spécifié,Télétravail non autorisé,,,,Descriptif du poste\nQUI SOMMES-NOUS ?\nNos éq...,
310,STAGE - Data Engineer,"{'name': 'AXA', 'sector': 'Banque, Assurance, ...",Stage,Paris,Non spécifié,Télétravail fréquent,,,,Descriptif du poste\nLa mission d’AXA est de «...,Profil recherché\nPROFIL : Etudiant en école d...


In [None]:
# url = "https://www.welcometothejungle.com/fr/companies/datascientest/jobs/ingenieur-systeme-et-securite-h-f-poei_puteaux?q=de74fa0bc2ac97695ef455fe374debea&o=3602f6b9-b641-4835-9a0f-38c113331334"
# url =links[0]
# scraper = JobScraper(url)
# job_offer = JobOffer()
# company = Company()
# scraper.__init__(url)
# scraper.driver_get()
# job_details = scraper.get_description('job-section-description')
# vars(job_details)