In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import time

[nltk_data] Downloading package punkt_tab to /Users/adri/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/adri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Importing keys

from dotenv import load_dotenv
load_dotenv()

import os
GEM_KEY = os.environ.get("GEM_KEY")
DEEP_KEY = os.environ.get("DEEP_APIKEY")
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")

# Beautiful Soup!

In [6]:
### Rate limiting function.

# Function to scrape the job descriptions.
def remove_stopwords(text):
    """Remove stopwords from a given text using NLTK."""
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return ' '.join(filtered_tokens)

def scrape_jobs_with_descriptions(keywords, location, f_WT, pages_to_scrape, headers):
    keywords_encoded = quote(keywords)
    location_encoded = quote(location)
    jobs = []

    for page in range(pages_to_scrape):
        # LinkedIn URL for job search
        url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={keywords_encoded}&location={location_encoded}&f_WT={f_WT}&start={25 * page}"
        print(f"Scraping job list page: {url}")

        # Make a GET request
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch page {page + 1}: {response.status_code}")
            continue

        # Parse the HTML
        soup = BeautifulSoup(response.content, "html.parser")
        divs = soup.find_all("div", class_="base-card")

        for div in divs:
            try:
                # Extract job title, company, location
                title = div.find("h3", class_="base-search-card__title").text.strip()
                company = div.find("h4", class_="base-search-card__subtitle").text.strip()
                location = div.find("span", class_="job-search-card__location").text.strip()

                # Extract the job URL from the <a> tag with the class "base-card__full-link"
                job_link_tag = div.find("a", class_="base-card__full-link")
                job_url = job_link_tag["href"] if job_link_tag else "No URL found"

                # Fetch the job description from the job URL
                job_description = fetch_job_description(job_url, headers) if job_url != "No URL found" else "No description available"
                job_description = remove_stopwords(job_description)

                # Add job details to the list
                jobs.append({
                    "title": title,
                    "company": company,
                    "location": location,
                    "url": job_url,
                    "description": job_description
                })
            except Exception as e:
                print(f"Error parsing job: {e}")

        ### RATE LIMIT ADDED: Pause for 2 seconds after processing each page
        time.sleep(2)

    return jobs

def fetch_job_description(job_url, headers):
    """Fetch job description from individual job posting."""
    try:
        response = requests.get(job_url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch job page: {job_url}")
            return "Failed to fetch job description"
        soup = BeautifulSoup(response.content, "html.parser")
        description_div = soup.find("div", class_="show-more-less-html__markup")
        if description_div:
            return description_div.get_text(strip=True).replace("\n", " ")
        return "No description available"
    except Exception as e:
        print(f"Error fetching job description: {e}")
        return "Error fetching job description"

# Configuration
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
keywords = "Data Scientist"
location = "London"
f_WT = "2"  # Remote jobs
pages_to_scrape = 1  # Number of pages to scrape

# Run the scraper and store the results in a list of dictionaries. Each entry includes:
jobs_with_descriptions = scrape_jobs_with_descriptions(keywords, location, f_WT, pages_to_scrape, headers)

# Print job details
for job in jobs_with_descriptions:
    print(job)

Scraping job list page: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%20Scientist&location=London&f_WT=2&start=0
{'title': 'Data Scientist', 'company': 'Prolific', 'location': 'London, England, United Kingdom', 'url': 'https://uk.linkedin.com/jobs/view/data-scientist-at-prolific-4179910842?position=1&pageNum=0&refId=xHXMNW9Pvvzf%2FgSYQUOBIA%3D%3D&trackingId=CUYRKFx9XKvn4FjBiAZ7TA%3D%3D', 'description': "Data TeamProlificProlific another player AI space – architects human data infrastructure 's reshaping landscape AI development . world foundational AI technologies increasingly commoditized , 's quality diversity human-generated data truly differentiates products models.The roleWe 're looking Data Scientist strong analytical skills passion solving complex problems join team . 'll work cross-functionally product engineering teams , driving initiatives unlock power vast datasets . 'll significant autonomy design , build , deploy models , develop meas

In [None]:
### Original.

# Function to scrape the job descriptions.

def remove_stopwords(text):
    """Remove stopwords from a given text using NLTK."""
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return ' '.join(filtered_tokens)

def scrape_jobs_with_descriptions(keywords, location, f_WT, pages_to_scrape, headers):
    keywords_encoded = quote(keywords)
    location_encoded = quote(location)
    jobs = []

    for page in range(pages_to_scrape):
        # LinkedIn URL for job search
        url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={keywords_encoded}&location={location_encoded}&f_WT={f_WT}&start={25 * page}"
        print(f"Scraping job list page: {url}")

        # Make a GET request
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch page {page + 1}: {response.status_code}")
            continue

        # Parse the HTML
        soup = BeautifulSoup(response.content, "html.parser")
        divs = soup.find_all("div", class_="base-card")

        for div in divs:
            try:
                # Extract job title, company, location
                title = div.find("h3", class_="base-search-card__title").text.strip()
                company = div.find("h4", class_="base-search-card__subtitle").text.strip()
                location = div.find("span", class_="job-search-card__location").text.strip()

                # Extract the job URL from the <a> tag with the class "base-card__full-link"
                job_link_tag = div.find("a", class_="base-card__full-link")
                job_url = job_link_tag["href"] if job_link_tag else "No URL found"

                # Fetch the job description from the job URL
                job_description = fetch_job_description(job_url, headers) if job_url != "No URL found" else "No description available"
                job_description = remove_stopwords(job_description)


                # Add job details to the list
                jobs.append({
                    "title": title,
                    "company": company,
                    "location": location,
                    "url": job_url,
                    "description": job_description
                })
            except Exception as e:
                print(f"Error parsing job: {e}")

    return jobs

def fetch_job_description(job_url, headers):
    """Fetch job description from individual job posting."""
    try:
        response = requests.get(job_url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch job page: {job_url}")
            return "Failed to fetch job description"
        soup = BeautifulSoup(response.content, "html.parser")
        description_div = soup.find("div", class_="show-more-less-html__markup")
        if description_div:
            return description_div.get_text(strip=True).replace("\n", " ")
        return "No description available"
    except Exception as e:
        print(f"Error fetching job description: {e}")
        return "Error fetching job description"

# Configuration
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
keywords = "Data Scientist"
location = "London"
f_WT = "2"  # Remote jobs
pages_to_scrape = 20  # Number of pages to scrape

# Run the scraper and stores it in a list of dictionaries. Each entry includes:
jobs_with_descriptions = scrape_jobs_with_descriptions(keywords, location, f_WT, pages_to_scrape, headers)

# Print job details
for job in jobs_with_descriptions:
    print(job)

Scraping job list page: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%20Scientist&location=London&f_WT=2&start=0
Scraping job list page: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%20Scientist&location=London&f_WT=2&start=25
Failed to fetch job page: https://uk.linkedin.com/jobs/view/principal-decision-scientist-applied-optimization-and-simulation-2025-uk-at-aimpoint-digital-4162470234?position=1&pageNum=2&refId=z%2BOXi99qogCAYKv71HPl%2Bg%3D%3D&trackingId=rIkMByoXGsmsQAL4MlDIrA%3D%3D
Failed to fetch job page: https://uk.linkedin.com/jobs/view/ai-developer-amp-pound-50-000-remote-at-tenth-revolution-group-4182366827?position=2&pageNum=2&refId=z%2BOXi99qogCAYKv71HPl%2Bg%3D%3D&trackingId=tfBePxkOMjWCyyrklBiZFQ%3D%3D
Failed to fetch job page: https://uk.linkedin.com/jobs/view/ai-software-engineer-at-siena-ai-4051717334?position=3&pageNum=2&refId=z%2BOXi99qogCAYKv71HPl%2Bg%3D%3D&trackingId=qeY9TV4kECdhUxg1ghb%2B

KeyboardInterrupt: 

In [7]:
len(word_tokenize((jobs_with_descriptions[0]['description'])))

430

In [8]:
len(jobs_with_descriptions)

10

In [9]:
from IPython.display import display, HTML

# Display the first job description in a styled block
description = jobs_with_descriptions[0]['description']
html_block = f"""
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; margin: 10px 0;
            font-family: Arial, sans-serif; line-height: 1.6; white-space: pre-wrap; background-color: #f9f9f9;">
    <h3 style="margin-top: 0; font-size: 16px; color: #333;">Job Description</h3>
    <p style="margin: 0;">{description}</p>
</div>
"""

display(HTML(html_block))

# Gemini

In [10]:
jobs_with_descriptions[0]['description']

"Data TeamProlificProlific another player AI space – architects human data infrastructure 's reshaping landscape AI development . world foundational AI technologies increasingly commoditized , 's quality diversity human-generated data truly differentiates products models.The roleWe 're looking Data Scientist strong analytical skills passion solving complex problems join team . 'll work cross-functionally product engineering teams , driving initiatives unlock power vast datasets . 'll significant autonomy design , build , deploy models , develop measurement frameworks , help influence decisions directly impact platform 's capabilities business strategy . data function supports multiple areas business , giving exposure diverse challenges allowing develop deep expertise specific domains based interests business priorities.What 'll bring roleExperience interest working human behavioral data , annotation/labeling systems , projects involving human feedback AI development evaluationExperienc

In [11]:
# let's try it with one job description

from google import genai

client = genai.Client(api_key=GEM_KEY)

# prompt = (
#     f"Extract the relevant hard skills and soft skills from the following job description: {jobs_with_descriptions[0]['description']}. "
#     "Return only a valid JSON object with exactly two keys: 'hard_skills' and 'soft_skills', where each key maps to an array of strings. "
#     "Ensure you only output the JSON object in the following format (example): {{\"hard_skills\": [\"skill1\", \"skill2\"], \"soft_skills\": [\"skillA\", \"skillB\"]}}. "
#     "Do not include any markdown formatting, triple backticks, or any extra text. Only output the JSON object."
# )

prompt = f'''Extract the relevant hard skills and soft skills from the following job description. For hard skills, make sure to include programming languages, libraries, technologies mentioned.
            Return only a valid JSON object with exactly two keys: 'hard_skills' and 'soft_skills', where each key maps to an array of strings. Ensure you only output the JSON object in the following format (example): {{\"hard_skills\": [\"skill1\", \"skill2\"], \"soft_skills\": [\"skillA\", \"skillB\"]}}. Do not include any markdown formatting, triple backticks, or any extra text. Only output the JSON object.)
            # Description: {jobs_with_descriptions[0]['description']}'''

In [12]:
# Available models:
# model='gemini-2.0-flash-lite-preview-02-05',

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=prompt)

response.text

'```json\n{\n  "hard_skills": [\n    "Python",\n    "R",\n    "SQL",\n    "AI/ML frameworks",\n    "modern data science stack",\n    "MLOps",\n    "experimental design",\n    "causal inference methods",\n    "statistical methods"\n  ],\n  "soft_skills": [\n    "analytical skills",\n    "communication",\n    "influence",\n    "prioritization",\n    "problem-solving",\n    "collaboration"\n  ]\n}\n```'

In [13]:
from google import genai
import json
from collections import Counter
import pandas as pd
import plotly.express as px

# Initialize the Gemini client
client = genai.Client(api_key=GEM_KEY)

def clean_json_output(response_text):
    """
    Remove markdown formatting (triple backticks and '```json' label)
    and return a clean JSON string.
    """
    response_text = response_text.strip()
    if response_text.startswith("```json"):
        response_text = response_text[len("```json"):].strip()
    if response_text.endswith("```"):
        response_text = response_text[:-len("```")].strip()
    return response_text

def batch_jobs(jobs, batch_size):
    """Yield successive batches from the jobs list."""
    for i in range(0, len(jobs), batch_size):
        yield jobs[i:i + batch_size]

extracted_skills = []

# ----- UPDATED CODE: Batch Size Calculation -----
# Assuming pages_to_scrape is 10 (as defined earlier) and jobs are evenly distributed:
jobs_per_page = len(jobs_with_descriptions) // pages_to_scrape
batch_size = jobs_per_page * 2  # Two pages per batch
# ----- END OF UPDATED CODE -----

for batch in batch_jobs(jobs_with_descriptions, batch_size):
    # Combine the job descriptions using a clear delimiter.
    descriptions = "\n---\n".join(job['description'] for job in batch)
    prompt = (
        "Below are several job descriptions separated by '---'. "
        "For each job description, extract the relevant hard skills and soft skills. "
        "For hard skills, include programming languages, libraries, and technologies mentioned. "
        "Return a JSON array where each element is an object with exactly two keys: "
        "'hard_skills' and 'soft_skills', mapping to arrays of strings. "
        "Only output the JSON array with no extra text or markdown formatting.\n\n" +
        descriptions
    )

    response = client.models.generate_content(
        # model='gemini-2.0-flash-lite-preview-02-05',
        model="gemini-2.0-flash",
        contents=prompt
    )

    cleaned = clean_json_output(response.text)

    try:
        # Expecting a JSON array with one element per job description in the batch
        batch_parsed = json.loads(cleaned)

        # Optional: Check if the parsed array has the same number of elements as the batch.
        if not isinstance(batch_parsed, list) or len(batch_parsed) != len(batch):
            print("Warning: The parsed output count does not match the number of job descriptions in the batch.")

        # Normalize skills to lowercase for consistency.
        for job_skills in batch_parsed:
            if "hard_skills" in job_skills:
                job_skills["hard_skills"] = [skill.lower() for skill in job_skills["hard_skills"]]
            if "soft_skills" in job_skills:
                job_skills["soft_skills"] = [skill.lower() for skill in job_skills["soft_skills"]]
        extracted_skills.extend(batch_parsed)
    except Exception as e:
        print("Error parsing batched JSON:", e)

# Aggregate the skills across all processed job descriptions.
hard_skills_counter = Counter()
soft_skills_counter = Counter()

for skills in extracted_skills:
    if "hard_skills" in skills:
        hard_skills_counter.update(skills["hard_skills"])
    if "soft_skills" in skills:
        soft_skills_counter.update(skills["soft_skills"])

# Create DataFrames for visualization.
df_hard = pd.DataFrame(hard_skills_counter.items(), columns=["Skill", "Frequency"])
df_soft = pd.DataFrame(soft_skills_counter.items(), columns=["Skill", "Frequency"])

print("Aggregated Hard Skills:")
print(df_hard)
print("\nAggregated Soft Skills:")
print(df_soft)


Aggregated Hard Skills:
                             Skill  Frequency
0                           python          6
1                                r          2
2                              sql          3
3                 ai/ml frameworks          1
4        modern data science stack          1
5    classical statistical methods          1
6   state-of-the-art ml techniques          1
7                            mlops          1
8                       statistics          1
9                     data science          1
10             operations research          1
11          statistical techniques          1
12     machine learning techniques          2
13                      clustering          2
14                      regression          1
15                  decision trees          1
16                      databricks          1
17                           spark          1
18                         pyspark          1
19                          pandas          2
20        

In [14]:
# Visualization with Plotly.
df_hard_sorted = df_hard.sort_values("Frequency", ascending=False)
df_soft_sorted = df_soft.sort_values("Frequency", ascending=False)

fig_hard = px.bar(
    df_hard_sorted,
    x="Skill",
    y="Frequency",
    title="Top Hard Skills",
    labels={"Skill": "Hard Skill", "Frequency": "Count"},
    color="Frequency",
    color_continuous_scale="Blues"
)
fig_hard.update_layout(xaxis_tickangle=-45)
fig_hard.show()

fig_soft = px.bar(
    df_soft_sorted,
    x="Skill",
    y="Frequency",
    title="Top Soft Skills",
    labels={"Skill": "Soft Skill", "Frequency": "Count"},
    color="Frequency",
    color_continuous_scale="Blues"
)
fig_soft.update_layout(xaxis_tickangle=-45)
fig_soft.show()

In [15]:
df_hard_sorted

Unnamed: 0,Skill,Frequency
0,python,6
2,sql,3
33,tensorflow,3
34,pytorch,3
13,clustering,2
1,r,2
20,numpy,2
19,pandas,2
12,machine learning techniques,2
37,artificial neural networks,1


In [16]:
df_hard_sorted[df_hard_sorted['Skill'].str.contains('aws', case=False, na=False)]

Unnamed: 0,Skill,Frequency


In [17]:
df_soft_sorted

Unnamed: 0,Skill,Frequency
1,communication,9
4,problem-solving,5
10,collaboration,5
0,analytical skills,2
22,writing,2
21,math,2
15,leadership,1
24,adaptability,1
23,strategic mindset,1
20,time management,1


# Qwen

# DeepSeek

In [18]:
# Let's try DeepSeek, especially it's output.

from openai import OpenAI

client = OpenAI(api_key=DEEP_KEY, base_url="https://api.deepseek.com")

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello"},
    ],
    stream=False
)

print(response.choices[0].message.content)

# API not working at the moment...

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

# OLD BELOW

In [None]:
# Trying the API!

import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
from huggingface_hub.inference_api import InferenceApi

# Set your Hugging Face API token and choose a model repository
MODEL_REPO_ID = 'google/flan-t5-base'

# Initialize the Inference API client
inference = InferenceApi(repo_id=MODEL_REPO_ID, token=HF_API_TOKEN)

# Function to scrape job postings with descriptions
def scrape_jobs_with_descriptions(keywords, location, f_WT, pages_to_scrape, headers):
    keywords_encoded = quote(keywords)
    location_encoded = quote(location)
    jobs = []

    for page in range(pages_to_scrape):
        url = (
            f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/"
            f"search?keywords={keywords_encoded}&location={location_encoded}&f_WT={f_WT}&start={25 * page}"
        )
        print(f"Scraping job list page: {url}")

        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch page {page + 1}: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        divs = soup.find_all("div", class_="base-card")

        for div in divs:
            try:
                title = div.find("h3", class_="base-search-card__title").text.strip()
                company = div.find("h4", class_="base-search-card__subtitle").text.strip()
                location = div.find("span", class_="job-search-card__location").text.strip()
                job_link_tag = div.find("a", class_="base-card__full-link")
                job_url = job_link_tag["href"] if job_link_tag else "No URL found"
                job_description = (
                    fetch_job_description(job_url, headers)
                    if job_url != "No URL found"
                    else "No description available"
                )
                jobs.append({
                    "title": title,
                    "company": company,
                    "location": location,
                    "url": job_url,
                    "description": job_description
                })
            except Exception as e:
                print(f"Error parsing job: {e}")
    return jobs

# Function to fetch the job description from the individual job posting
def fetch_job_description(job_url, headers):
    try:
        response = requests.get(job_url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch job page: {job_url}")
            return "Failed to fetch job description"
        soup = BeautifulSoup(response.content, "html.parser")
        description_div = soup.find("div", class_="show-more-less-html__markup")
        if description_div:
            return description_div.get_text(strip=True).replace("\n", " ")
        return "No description available"
    except Exception as e:
        print(f"Error fetching job description: {e}")
        return "Error fetching job description"

# Function to extract skills using the Hugging Face Inference API client wrapper
def extract_skills(job_description):
    prompt = (
        "Extract all relevant hard and soft skills from the following job description. "
        "Return the results as JSON with keys 'hard_skills' and 'soft_skills'.\n\n"
        f"Job Description: {job_description}"
    )
    response = inference(prompt)
    return response

# Configuration for scraping
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/91.0.4472.124 Safari/537.36"
}
keywords = "software engineer"
location = "Remote"
f_WT = "2"  # Remote jobs
pages_to_scrape = 1  # Adjust as needed

# Scrape job postings
jobs_with_descriptions = scrape_jobs_with_descriptions(keywords, location, f_WT, pages_to_scrape, headers)

# Extract skills for each job description using the client wrapper
for job in jobs_with_descriptions:
    skills = extract_skills(job["description"])
    job["extracted_skills"] = skills

# Print job details with the extracted skills
for job in jobs_with_descriptions:
    print(job)


Scraping job list page: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=software%20engineer&location=Remote&f_WT=2&start=0
{'title': 'Software Engineer', 'company': 'Microsoft', 'location': 'Bengaluru, Karnataka, India', 'url': 'https://in.linkedin.com/jobs/view/software-engineer-at-microsoft-4131147250?position=1&pageNum=0&refId=gwJNwjrnv3xMXSUwJOU0kg%3D%3D&trackingId=25mdpDh2rGiqMrW%2BLBFqXQ%3D%3D', 'description': "Have you ever imagined a world with an infinite amount of storage available and accessible to everyone? A place where everyone in the world can easily access their data from anywhere at any time via any means (e.g., mobile phones, tablets, PCs, smart devices, etc.). Did you ever desire a universally accessible storage system to record all the knowledge known to mankind or to store all the data collected from all the scientists in the world for them to collaborate upon? Do you want to be part of a team that strives to bring these to reality?A