<a href="https://colab.research.google.com/github/YacineBenameur/Data-Jobs-Analysis-in-France/blob/main/webscraping_and_dataset_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Specify the OpenAI API key

In [None]:
import os

# Prefer the OPENAI_API_KEY environment variable so the secret does not have to
# be stored in the notebook itself; otherwise fall back to the placeholder
# below, which must then be replaced by hand with a real key.
openai_api_key = os.environ.get("OPENAI_API_KEY", "YOUR_openai_api_key")

# Install and import the necessary libraries

In [None]:
from IPython.display import clear_output

# Install Selenium into the notebook runtime. The leading "!" is an IPython
# shell escape, so this line only works inside a notebook/IPython session.
!pip install selenium

# Remove the pip installation log from the cell output.
clear_output()

In [None]:
import os
import sys
import re
import warnings
import pickle
import json

# IPython utilities
from IPython.display import Image, display, clear_output

# Web scraping and automation
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# Data manipulation and numerical computing
import numpy as np
import pandas as pd

# Pydantic for data validation and typing
from pydantic import BaseModel, Field
from typing import List, Optional, Tuple

# OpenAI API
from openai import OpenAI

# Progress bar
from tqdm import tqdm

warnings.filterwarnings("ignore")





# Set up the headless browser and the search links

In [None]:
# Every search uses the same HelloWork results URL; only the keyword (`k`) and
# the autocomplete reference (`k_autocomplete`) vary from one search to another.
_SEARCH_URL_TEMPLATE = (
    "https://www.hellowork.com/fr-fr/emploi/recherche.html"
    "?k={keyword}"
    "&k_autocomplete=http%3A%2F%2Fwww.rj.com%2FCommun%2FPost%2F{autocomplete}"
    "&l=&l_autocomplete=&st=date&c=CDI&c=CDD&c=Independant&cod=all&d=all"
)


def _search_link(keyword, autocomplete="Analyste_donnees"):
    """Return the search URL for an already URL-encoded keyword."""
    return _SEARCH_URL_TEMPLATE.format(keyword=keyword, autocomplete=autocomplete)


data_scientist_link = _search_link("Data+scientist", "Data_scientist")
data_analyst_link = _search_link("Data+analyst")
data_engineer_link = _search_link("Data+engineer")
business_intelligence_link = _search_link("business+intelligence")
machine_learning_link = _search_link("machine+learning")
mlops_link = _search_link("mlops")

# Search result pages to crawl, one per job family
jobs_search_links = [
    data_scientist_link,
    data_analyst_link,
    data_engineer_link,
    business_intelligence_link,
    machine_learning_link,
    mlops_link,
]

# Configure Chrome to run without a visible window (headless mode)
chrome_options = Options()
for chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(chrome_flag)

# Get job offer URLs

In [None]:
# Use a set to store unique offer URLs across all searches
offer_urls = set()

for jobs_search_link in jobs_search_links:
    # Start a fresh browser session for this search link
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(jobs_search_link)

        # Accept cookies if prompted
        try:
            accept_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, 'hw-cc-notice-accept-btn'))
            )
            accept_button.click()
        except Exception as e:
            print("No cookie prompt found or unable to click accept button:", e)

        # Initialize variables for pagination
        page_number = 1

        # Iterate over pages until no new offers are found
        while True:
            # Wait for page content to load
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.tw-typo-s.tw-text-grey"))
                )
            except Exception as e:
                print(f"Timeout on page {page_number}: {e}")
                break

            # Extract job offer URLs: each matched element is resolved to its
            # enclosing result card, from which the offer title link is read.
            date_elements = driver.find_elements(By.XPATH, "//div[@class='tw-typo-s tw-text-grey']")
            new_urls = 0
            for date_element in date_elements:
                try:
                    parent_container = date_element.find_element(By.XPATH, "./ancestor::div[@data-cy='serpCard']")
                    link_element = parent_container.find_element(By.XPATH, ".//a[@data-cy='offerTitle']")
                    offer_url = link_element.get_attribute("href")
                except Exception as e:
                    # One malformed or stale card must not abort the whole crawl
                    print(f"Skipping a result card on page {page_number}: {e}")
                    continue

                # Add the URL to the set if it's new (ignore links without href)
                if offer_url and offer_url not in offer_urls:
                    offer_urls.add(offer_url)
                    new_urls += 1

            print(f"Page {page_number}: Found {new_urls} new offers.")

            # If no new URLs are found, break the loop
            if new_urls == 0:
                print(f"No new offers found. Stopping pagination at page {page_number}.")
                break

            # Increment the page number and navigate to the next page
            page_number += 1
            next_page_url = f"{jobs_search_link}&p={page_number}"
            try:
                driver.get(next_page_url)
            except Exception as e:
                print(f"Failed to navigate to page {page_number}: {e}")
                break
    finally:
        # Always close the browser for this search link, even when scraping
        # failed, so that no Chrome process is left running.
        driver.quit()

# Freeze the collected URLs into a list for the following cells
offer_urls = list(offer_urls)

print(f"Total unique job offers collected: {len(offer_urls)}")


# Get job offers text data
Fetch the full text of each offer page so that all the available information can be extracted later.

In [None]:
import requests
from bs4 import BeautifulSoup

# Maps each offer URL to the plain text of its page
offers_data = {}

# Reuse a single HTTP session so connections are pooled across requests
with requests.Session() as session:
    session.headers.update({"User-Agent": "Mozilla/5.0"})

    for offer_url in tqdm(offer_urls, desc="Fetching job offers"):
        try:
            # Without a timeout a single unresponsive server would block the
            # whole run indefinitely.
            response = session.get(offer_url, timeout=30)
            response.raise_for_status()

            # Keep only the visible text, one line per text node
            soup = BeautifulSoup(response.text, "lxml")
            page_text = soup.get_text(separator="\n", strip=True)

            offers_data[offer_url] = page_text
        except Exception as e:
            # Best effort: report the failure and continue with the next offer
            print(f"Error fetching {offer_url}: {e}")


In [None]:
# Serialize the scraped texts (URL -> page text) to disk with pickle
with open("offers_data.pkl", "wb") as pickle_file:
    pickle_file.write(pickle.dumps(offers_data))

print("Offers data saved to 'offers_data.pkl'")


In [None]:
# Display the number of entries in offers_data
len(offers_data)

In [None]:
# Reload the scraped texts from the file written by the save cell
# ("offers_data.pkl"), so the notebook also works on a clean run.
# NOTE: pickle can execute arbitrary code while loading; only load pickle
# files that you created yourself.
with open("offers_data.pkl", "rb") as f:
    offers_data = pickle.load(f)

# Extract structured data

In [None]:
from pydantic import BaseModel, Field
from typing import List, Optional, Tuple
from openai import OpenAI
import re

# Define the job offer schema using Pydantic
class JobOffer(BaseModel):
    """Structured representation of a single job offer.

    Only ``title``, ``company`` and ``location`` are required. Every other
    field defaults to ``None`` when the information is not available.
    """

    title: str
    company: str
    location: str
    department: Optional[str] = None
    # Explicit default: Pydantic v2 treats an Optional annotation without a
    # default as a required field, unlike the other optional fields here.
    contract_type: Optional[str] = None
    remote: Optional[str] = None
    sector: Optional[str] = None
    publication_date: Optional[str] = None
    experience_years: Optional[int] = Field(None, description="Years of experience required")
    education_level: Optional[int] = Field(None, description="Required education level (e.g., 5 for Bac+5)")
    # Salary bounds; `annual` tells whether they are yearly amounts
    salaire_min: Optional[float] = None
    salaire_max: Optional[float] = None
    annual: Optional[bool] = None
    tools: Optional[List[str]] = None
    concepts: Optional[List[str]] = None
    # Not part of the extracted data; filled in by the caller with the offer URL
    job_url: Optional[str] = None




# Function to query OpenAI API and process the response
def extract_job_info(job_description: str) -> JobOffer:
    """Extract structured job information from raw offer text with an LLM.

    Args:
        job_description: Full text of the job offer page.

    Returns:
        A ``JobOffer`` built from the JSON returned by the model.

    Raises:
        ValueError: If the model returns an empty response.
        Exception: Errors raised by the OpenAI client, or by Pydantic when the
            returned JSON is invalid or does not match the schema, propagate
            to the caller.
    """
    client = OpenAI(api_key=openai_api_key)

    # Define the prompt to instruct the model
    messages = [
        {"role": "developer", "content": "You are a helpful assistant for extracting structured data."},
        {
            "role": "user",
            "content": f"""
        Extract structured information from the following job description. Ensure the output strictly matches the JSON format below and adheres to these rules:
        - **Title**: Remove gender terms like H/F.
        - **Company Name**: Exact name of the company.
        - **Location**: Provide only the city name, e.g., for 'Issy-les-Moulineaux - 92', return 'Issy-les-Moulineaux'.
        - **Department**: Provide only the department number, e.g., for 'Issy-les-Moulineaux - 92', return '92'.
        - **Contract Type**: As described in the text.
        - **Remote Work Details**: Return one of the following: 'Remote', 'Hybrid', or 'Onsite'.
          - 'Remote': If fully remote.
          - 'Hybrid': If partially remote.
          - 'Onsite': If no remote work is mentioned.
        - **Sector**: Provide one generic word describing the industry, e.g., 'Finance', 'Technology', 'Healthcare'.
        - **Publication Date**: Format as 'DD/MM/YYYY'.
        - **Years of Experience Required**: Numeric only, e.g., 2.
        - **Education Level Required**: Numeric only, e.g., 5 for Bac+5.
        - **Salary**: Split into three fields:
          - `salaire_min`: Minimum salary value (float). If not available, set null.
          - `salaire_max`: Maximum salary value (float). If not available, set null.
          - `annual`: 'yes' if the salary is annual, 'no' if monthly, or `null` if not specified.
        - **Tools**: List all mentioned technical tools. Do not attempt to fill the list with placeholders or null values if fewer than 5 tools are available. Return an empty list `[]` if no tools are mentioned.
        - **Concepts**: Provide a maximum of 5 generic concepts (e.g., MLOps, NLP, CI/CD, OCR, Generative AI). Do not attempt to fill the list with placeholders or null values if fewer than 5 concepts are available

        Job Description:
        {job_description}

        Expected JSON Output:
        {{
            "title": "<string>",
            "company": "<string>",
            "location": "<string>",
            "department": "<string>",
            "contract_type": "<string>",
            "remote": "<string>",
            "sector": "<string>",
            "publication_date": "<string>",
            "experience_years": <int | null>,
            "education_level": <int | null>,
            "salaire_min": <float | null>,
            "salaire_max": <float | null>,
            "annual": "<yes | no | null>",
            "tools": <null | ["<string>", "<string>", ...]>,
            "concepts": <null | ["<string>", "<string>", ...]>
        }}
        Ensure the JSON keys, types, and values are correct. If any field is unavailable, set it to `null`.
        Tools and concepts a should only list available tools and concepts without adding placeholders or nulls.
        """,
        },
    ]

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
    )

    raw_content = response.choices[0].message.content
    if not raw_content:
        # Fail with a clear message instead of an AttributeError on None
        raise ValueError("The model returned an empty response.")

    # Unwrap an optional Markdown code fence (```json ... ```). A regex is used
    # because str.strip("```json") removes a *set of characters* from both
    # ends rather than the fence itself.
    fenced = re.search(r"```(?:json)?(.*)```", raw_content, flags=re.DOTALL | re.IGNORECASE)
    structured_data = (fenced.group(1) if fenced else raw_content).strip()

    # Convert the JSON string response into a Pydantic model.
    # Pydantic v2 renamed parse_raw() to model_validate_json(); support both.
    if hasattr(JobOffer, "model_validate_json"):
        job_offer = JobOffer.model_validate_json(structured_data)
    else:
        job_offer = JobOffer.parse_raw(structured_data)

    return job_offer


In [None]:
# Some tests for debugging
import random

# Pick one scraped offer at random. randrange() excludes its upper bound,
# whereas randint(0, len(...)) could return len(...) and raise an IndexError.
n = random.randrange(len(offers_data))
sample_url, job_description = list(offers_data.items())[n]

# Run the extraction on that single offer and show the result with its URL
job_info = extract_job_info(job_description)
print(job_info)
print(sample_url)

# Main loop of data extraction

In [None]:
import pandas as pd
from tqdm import tqdm
import json

# Initialize a list to store job offers (one dict per successfully parsed offer)
job_offers_list = []

# Iterate over all texts in offers_data with a tqdm progress bar
for offer_url, job_description in tqdm(offers_data.items(), desc="Processing job descriptions"):
    try:
        # Extract job info from the text
        job_offer = extract_job_info(job_description)

        # Convert the model to a plain dict. model_dump() is the Pydantic v2
        # name of the deprecated dict(); fall back to dict() on Pydantic v1.
        if hasattr(job_offer, "model_dump"):
            job_offer_data = job_offer.model_dump()
        else:
            job_offer_data = job_offer.dict()

        # Add the job URL to the job offer dictionary
        job_offer_data["job_url"] = offer_url

        # Append the job data to the list
        job_offers_list.append(job_offer_data)
    except Exception as e:
        # Best effort: report the failing offer and continue with the next one
        print(f"Error processing URL {offer_url}: {e}")

# Convert the list of job offers to a pandas DataFrame
job_offers_df = pd.DataFrame(job_offers_list)

# Save the DataFrame to a CSV file
job_offers_df.to_csv("job_offers.csv", index=False, encoding="utf-8-sig")

# Print the first few rows of the DataFrame
print(job_offers_df.head())


In [None]:
# Display the DataFrame of extracted job offers
job_offers_df

Unnamed: 0,title,company,location,department,contract_type,remote,sector,publication_date,experience_years,education_level,salaire_min,salaire_max,annual,tools,concepts,job_url
0,Contract Manager,Equans France,Guyancourt,78,CDI,Hybrid,BTP,16/12/2024,8.0,5.0,,,,[],,https://www.hellowork.com/fr-fr/emplois/595339...
1,Data Manager,S&You,La Talaudière,42,CDI,Onsite,Services,20/12/2024,7.0,4.0,60000.0,70000.0,True,"[Microsoft SSAS, SQL Server, Oracle, Power BI]","[Business Intelligence, ETL, KPI]",https://www.hellowork.com/fr-fr/emplois/584953...
2,Flying Technical Data Engineer - Avionics Syst...,Airbus,Toulouse,31,CDI,Hybrid,Aerospace,10/12/2024,0.0,5.0,,,,[],"[Aircraft Maintenance, Avionics Systems, Proje...",https://www.hellowork.com/fr-fr/emplois/593725...
3,Business Analyst Swift,Libellio,Paris,75,CDI,Hybrid,Finance,09/12/2024,7.0,5.0,55000.0,70000.0,True,"[Dataiku, PowerBI, JIRA]",,https://www.hellowork.com/fr-fr/emplois/581120...
4,Data Analyst Expérimenté,EY,La Défense,92,CDI,Hybrid,Technology,18/12/2024,,5.0,,,,"[Power BI, Tableau, SQL, R, Python]","[Data Analytics, Data Visualization, Big Data,...",https://www.hellowork.com/fr-fr/emplois/596462...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2795,Data Engineer,Ippon Technologies,Lille,59,CDI,Hybrid,Services,14/12/2024,,5.0,,,,"[Snowflake, Databricks, Matillion, DBT, Terraf...","[Data Engineering, Cloud, Big Data, Agile, Str...",https://www.hellowork.com/fr-fr/emplois/582732...
2796,Analyste Donnée Privée,Axa group operations,Paris,75,CDI,Hybrid,Finance,19/12/2024,5.0,5.0,,,,[],,https://www.hellowork.com/fr-fr/emplois/559700...
2797,Data Analyst Senior,Tessan,Paris,75,CDI,Hybrid,Healthcare,22/12/2024,5.0,5.0,,,,"[Tableau, Power BI, SQL, Python, R]","[Data Warehouse, Predictive Modeling, Statisti...",https://www.hellowork.com/fr-fr/emplois/586119...
2798,Business Analyst IT - Secteur Banque - Finance...,Econocom,Puteaux,92,CDI,Hybrid,Technology,25/11/2024,5.0,5.0,,,,[],,https://www.hellowork.com/fr-fr/emplois/587449...
