In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt_tab to /Users/adri/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/adri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from dotenv import load_dotenv
load_dotenv()  # Loads variables from .env into os.environ

import os
GEM_KEY = os.environ.get("GEM_KEY")
DEEP_KEY = os.environ.get("DEEP_APIKEY")
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")

# Beautiful Soup!

In [4]:
# Function to scrape the job descriptions.

def remove_stopwords(text):
    """Remove stopwords from a given text using NLTK."""
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return ' '.join(filtered_tokens)

def scrape_jobs_with_descriptions(keywords, location, f_WT, pages_to_scrape, headers):
    keywords_encoded = quote(keywords)
    location_encoded = quote(location)
    jobs = []

    for page in range(pages_to_scrape):
        # LinkedIn URL for job search
        url = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={keywords_encoded}&location={location_encoded}&f_WT={f_WT}&start={25 * page}"
        print(f"Scraping job list page: {url}")

        # Make a GET request
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch page {page + 1}: {response.status_code}")
            continue

        # Parse the HTML
        soup = BeautifulSoup(response.content, "html.parser")
        divs = soup.find_all("div", class_="base-card")

        for div in divs:
            try:
                # Extract job title, company, location
                title = div.find("h3", class_="base-search-card__title").text.strip()
                company = div.find("h4", class_="base-search-card__subtitle").text.strip()
                location = div.find("span", class_="job-search-card__location").text.strip()

                # Extract the job URL from the <a> tag with the class "base-card__full-link"
                job_link_tag = div.find("a", class_="base-card__full-link")
                job_url = job_link_tag["href"] if job_link_tag else "No URL found"

                # Fetch the job description from the job URL
                job_description = fetch_job_description(job_url, headers) if job_url != "No URL found" else "No description available"
                job_description = remove_stopwords(job_description)


                # Add job details to the list
                jobs.append({
                    "title": title,
                    "company": company,
                    "location": location,
                    "url": job_url,
                    "description": job_description
                })
            except Exception as e:
                print(f"Error parsing job: {e}")

    return jobs

def fetch_job_description(job_url, headers):
    """Fetch job description from individual job posting."""
    try:
        response = requests.get(job_url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch job page: {job_url}")
            return "Failed to fetch job description"
        soup = BeautifulSoup(response.content, "html.parser")
        description_div = soup.find("div", class_="show-more-less-html__markup")
        if description_div:
            return description_div.get_text(strip=True).replace("\n", " ")
        return "No description available"
    except Exception as e:
        print(f"Error fetching job description: {e}")
        return "Error fetching job description"

# Configuration
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
keywords = "Data Scientist"
location = "Amsterdam"
f_WT = "2"  # Remote jobs
pages_to_scrape = 1  # Number of pages to scrape

# Run the scraper and stores it in a list of dictionaries. Each entry includes:
jobs_with_descriptions = scrape_jobs_with_descriptions(keywords, location, f_WT, pages_to_scrape, headers)

# Print job details
for job in jobs_with_descriptions:
    print(job)

Scraping job list page: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%20Scientist&location=Amsterdam&f_WT=2&start=0
{'title': 'React Developer (Freelance)', 'company': 'DevologyX', 'location': 'Amsterdam, North Holland, Netherlands', 'url': 'https://nl.linkedin.com/jobs/view/react-developer-freelance-at-devologyx-4172257107?position=1&pageNum=0&refId=2BDmXHbzFjXh27mP9UVA%2FQ%3D%3D&trackingId=xGv5h47UuV6y%2B5Ehnc9wPw%3D%3D', 'description': '12 Month Remote Contract , Possible Extension - Must EU Citizen , Occassional site visits Amsterdam ( Quarterly ) .We looking talented React Developer join client ’ insurance tech team.You responsible designing building high-performance , user-friendly web applications help insurers streamline claims processing , policy management , customer interactions.You ’ work alongside backend developers UX designers create seamless digital experiences insurance space.RESPONSIBILITIESDevelop responsive dynamic front-end ap

In [5]:
len(word_tokenize((jobs_with_descriptions[0]['description'])))

147

In [6]:
len(jobs_with_descriptions)

10

In [7]:
from IPython.display import display, HTML

# Display the first job description in a styled block
description = jobs_with_descriptions[0]['description']
html_block = f"""
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; margin: 10px 0;
            font-family: Arial, sans-serif; line-height: 1.6; white-space: pre-wrap; background-color: #f9f9f9;">
    <h3 style="margin-top: 0; font-size: 16px; color: #333;">Job Description</h3>
    <p style="margin: 0;">{description}</p>
</div>
"""

display(HTML(html_block))

# Gemini

In [8]:
jobs_with_descriptions[0]['description']

'12 Month Remote Contract , Possible Extension - Must EU Citizen , Occassional site visits Amsterdam ( Quarterly ) .We looking talented React Developer join client ’ insurance tech team.You responsible designing building high-performance , user-friendly web applications help insurers streamline claims processing , policy management , customer interactions.You ’ work alongside backend developers UX designers create seamless digital experiences insurance space.RESPONSIBILITIESDevelop responsive dynamic front-end applications using React.js.Collaborate UI/UX designers implement visually appealing interfaces.Optimize components maximum performance across devices browsers.Integrate frontend applications backend services using REST GraphQL APIs.Maintain high code quality unit testing code reviews.Work state management libraries Redux Zustand.REQUIREMENTSStrong experience React.js JavaScript/TypeScript.Proficiency HTML , CSS , responsive design principles.Experience integrating APIs using Axi

In [9]:
# let's try it with one job description

from google import genai

client = genai.Client(api_key=GEM_KEY)

# prompt = (
#     f"Extract the relevant hard skills and soft skills from the following job description: {jobs_with_descriptions[0]['description']}. "
#     "Return only a valid JSON object with exactly two keys: 'hard_skills' and 'soft_skills', where each key maps to an array of strings. "
#     "Ensure you only output the JSON object in the following format (example): {{\"hard_skills\": [\"skill1\", \"skill2\"], \"soft_skills\": [\"skillA\", \"skillB\"]}}. "
#     "Do not include any markdown formatting, triple backticks, or any extra text. Only output the JSON object."
# )

prompt = f'''Extract the relevant hard skills and soft skills from the following job description. For hard skills, make sure to include programming languages, libraries, technologies mentioned.
            Return only a valid JSON object with exactly two keys: 'hard_skills' and 'soft_skills', where each key maps to an array of strings. Ensure you only output the JSON object in the following format (example): {{\"hard_skills\": [\"skill1\", \"skill2\"], \"soft_skills\": [\"skillA\", \"skillB\"]}}. Do not include any markdown formatting, triple backticks, or any extra text. Only output the JSON object.)
            # Description: {jobs_with_descriptions[0]['description']}'''

In [10]:
# Available models:
# model='gemini-2.0-flash-lite-preview-02-05',

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=prompt)

response.text

'{"hard_skills": ["React.js", "JavaScript", "TypeScript", "HTML", "CSS", "REST APIs", "GraphQL", "Redux", "Zustand", "Webpack", "Vite", "Axios", "Fetch", "Git"], "soft_skills": ["Collaboration", "Communication", "Problem-solving"]}\n'

In [12]:
from google import genai
import json
from collections import Counter
import pandas as pd
import plotly.express as px

# Initialize the Gemini client
client = genai.Client(api_key=GEM_KEY)

def clean_json_output(response_text):
    """
    Remove markdown formatting (triple backticks and '```json' label)
    and return a clean JSON string.
    """
    response_text = response_text.strip()
    if response_text.startswith("```json"):
        response_text = response_text[len("```json"):].strip()
    if response_text.endswith("```"):
        response_text = response_text[:-len("```")].strip()
    return response_text

def batch_jobs(jobs, batch_size):
    """Yield successive batches from the jobs list."""
    for i in range(0, len(jobs), batch_size):
        yield jobs[i:i + batch_size]

extracted_skills = []

# ----- UPDATED CODE: Batch Size Calculation -----
# Assuming pages_to_scrape is 10 (as defined earlier) and jobs are evenly distributed:
jobs_per_page = len(jobs_with_descriptions) // pages_to_scrape
batch_size = jobs_per_page * 2  # Two pages per batch
# ----- END OF UPDATED CODE -----

for batch in batch_jobs(jobs_with_descriptions, batch_size):
    # Combine the job descriptions using a clear delimiter.
    descriptions = "\n---\n".join(job['description'] for job in batch)
    prompt = (
        "Below are several job descriptions separated by '---'. "
        "For each job description, extract the relevant hard skills and soft skills. "
        "For hard skills, include programming languages, libraries, and technologies mentioned. "
        "Return a JSON array where each element is an object with exactly two keys: "
        "'hard_skills' and 'soft_skills', mapping to arrays of strings. "
        "Only output the JSON array with no extra text or markdown formatting.\n\n" +
        descriptions
    )

    response = client.models.generate_content(
        # model='gemini-2.0-flash-lite-preview-02-05',
        model="gemini-2.0-flash",
        contents=prompt
    )

    cleaned = clean_json_output(response.text)

    try:
        # Expecting a JSON array with one element per job description in the batch
        batch_parsed = json.loads(cleaned)

        # Optional: Check if the parsed array has the same number of elements as the batch.
        if not isinstance(batch_parsed, list) or len(batch_parsed) != len(batch):
            print("Warning: The parsed output count does not match the number of job descriptions in the batch.")

        # Normalize skills to lowercase for consistency.
        for job_skills in batch_parsed:
            if "hard_skills" in job_skills:
                job_skills["hard_skills"] = [skill.lower() for skill in job_skills["hard_skills"]]
            if "soft_skills" in job_skills:
                job_skills["soft_skills"] = [skill.lower() for skill in job_skills["soft_skills"]]
        extracted_skills.extend(batch_parsed)
    except Exception as e:
        print("Error parsing batched JSON:", e)

# Aggregate the skills across all processed job descriptions.
hard_skills_counter = Counter()
soft_skills_counter = Counter()

for skills in extracted_skills:
    if "hard_skills" in skills:
        hard_skills_counter.update(skills["hard_skills"])
    if "soft_skills" in skills:
        soft_skills_counter.update(skills["soft_skills"])

# Create DataFrames for visualization.
df_hard = pd.DataFrame(hard_skills_counter.items(), columns=["Skill", "Frequency"])
df_soft = pd.DataFrame(soft_skills_counter.items(), columns=["Skill", "Frequency"])

print("Aggregated Hard Skills:")
print(df_hard)
print("\nAggregated Soft Skills:")
print(df_soft)


Aggregated Hard Skills:
           Skill  Frequency
0       react.js          1
1     javascript          2
2     typescript          1
3           html          1
4            css          1
..           ...        ...
68    clickhouse          1
69         linux          1
70           lxd          1
71         azure          1
72  google cloud          1

[73 rows x 2 columns]

Aggregated Soft Skills:
                         Skill  Frequency
0            responsive design          1
1          agile methodologies          1
2                collaboration          7
3                communication          9
4              problem-solving          6
5                     teamwork          3
6                 adaptability          3
7                   innovation          2
8              framework usage          1
9                        agile          1
10  microservices architecture          1
11                  pragmatism          1
12                   ownership          2
13  

In [13]:
# Visualization with Plotly.
df_hard_sorted = df_hard.sort_values("Frequency", ascending=False)
df_soft_sorted = df_soft.sort_values("Frequency", ascending=False)

fig_hard = px.bar(
    df_hard_sorted,
    x="Skill",
    y="Frequency",
    title="Top Hard Skills",
    labels={"Skill": "Hard Skill", "Frequency": "Count"},
    color="Frequency",
    color_continuous_scale="Blues"
)
fig_hard.update_layout(xaxis_tickangle=-45)
fig_hard.show()

fig_soft = px.bar(
    df_soft_sorted,
    x="Skill",
    y="Frequency",
    title="Top Soft Skills",
    labels={"Skill": "Soft Skill", "Frequency": "Count"},
    color="Frequency",
    color_continuous_scale="Blues"
)
fig_soft.update_layout(xaxis_tickangle=-45)
fig_soft.show()

In [14]:
df_hard_sorted

Unnamed: 0,Skill,Frequency
14,python,6
33,machine learning,4
12,llms,3
18,aws,3
24,postgresql,2
...,...,...
35,machine learning models,1
37,information retrieval,1
38,text classification,1
39,aws sagemaker,1


In [15]:
df_soft_sorted

Unnamed: 0,Skill,Frequency
3,communication,9
2,collaboration,7
4,problem-solving,6
5,teamwork,3
6,adaptability,3
7,innovation,2
12,ownership,2
17,autonomous,2
0,responsive design,1
30,feedback,1


# Qwen

# DeepSeek

In [16]:
# Let's try DeepSeek, especially it's output.

from openai import OpenAI

client = OpenAI(api_key=DEEP_KEY, base_url="https://api.deepseek.com")

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello"},
    ],
    stream=False
)

print(response.choices[0].message.content)

# API not working at the moment...

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

# OLD BELOW

In [None]:
# Trying the API!

import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
from huggingface_hub.inference_api import InferenceApi

# Set your Hugging Face API token and choose a model repository
MODEL_REPO_ID = 'google/flan-t5-base'

# Initialize the Inference API client
inference = InferenceApi(repo_id=MODEL_REPO_ID, token=HF_API_TOKEN)

# Function to scrape job postings with descriptions
def scrape_jobs_with_descriptions(keywords, location, f_WT, pages_to_scrape, headers):
    keywords_encoded = quote(keywords)
    location_encoded = quote(location)
    jobs = []

    for page in range(pages_to_scrape):
        url = (
            f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/"
            f"search?keywords={keywords_encoded}&location={location_encoded}&f_WT={f_WT}&start={25 * page}"
        )
        print(f"Scraping job list page: {url}")

        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch page {page + 1}: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        divs = soup.find_all("div", class_="base-card")

        for div in divs:
            try:
                title = div.find("h3", class_="base-search-card__title").text.strip()
                company = div.find("h4", class_="base-search-card__subtitle").text.strip()
                location = div.find("span", class_="job-search-card__location").text.strip()
                job_link_tag = div.find("a", class_="base-card__full-link")
                job_url = job_link_tag["href"] if job_link_tag else "No URL found"
                job_description = (
                    fetch_job_description(job_url, headers)
                    if job_url != "No URL found"
                    else "No description available"
                )
                jobs.append({
                    "title": title,
                    "company": company,
                    "location": location,
                    "url": job_url,
                    "description": job_description
                })
            except Exception as e:
                print(f"Error parsing job: {e}")
    return jobs

# Function to fetch the job description from the individual job posting
def fetch_job_description(job_url, headers):
    try:
        response = requests.get(job_url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch job page: {job_url}")
            return "Failed to fetch job description"
        soup = BeautifulSoup(response.content, "html.parser")
        description_div = soup.find("div", class_="show-more-less-html__markup")
        if description_div:
            return description_div.get_text(strip=True).replace("\n", " ")
        return "No description available"
    except Exception as e:
        print(f"Error fetching job description: {e}")
        return "Error fetching job description"

# Function to extract skills using the Hugging Face Inference API client wrapper
def extract_skills(job_description):
    prompt = (
        "Extract all relevant hard and soft skills from the following job description. "
        "Return the results as JSON with keys 'hard_skills' and 'soft_skills'.\n\n"
        f"Job Description: {job_description}"
    )
    response = inference(prompt)
    return response

# Configuration for scraping
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/91.0.4472.124 Safari/537.36"
}
keywords = "software engineer"
location = "Remote"
f_WT = "2"  # Remote jobs
pages_to_scrape = 1  # Adjust as needed

# Scrape job postings
jobs_with_descriptions = scrape_jobs_with_descriptions(keywords, location, f_WT, pages_to_scrape, headers)

# Extract skills for each job description using the client wrapper
for job in jobs_with_descriptions:
    skills = extract_skills(job["description"])
    job["extracted_skills"] = skills

# Print job details with the extracted skills
for job in jobs_with_descriptions:
    print(job)


Scraping job list page: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=software%20engineer&location=Remote&f_WT=2&start=0
{'title': 'Software Engineer', 'company': 'Microsoft', 'location': 'Bengaluru, Karnataka, India', 'url': 'https://in.linkedin.com/jobs/view/software-engineer-at-microsoft-4131147250?position=1&pageNum=0&refId=gwJNwjrnv3xMXSUwJOU0kg%3D%3D&trackingId=25mdpDh2rGiqMrW%2BLBFqXQ%3D%3D', 'description': "Have you ever imagined a world with an infinite amount of storage available and accessible to everyone? A place where everyone in the world can easily access their data from anywhere at any time via any means (e.g., mobile phones, tablets, PCs, smart devices, etc.). Did you ever desire a universally accessible storage system to record all the knowledge known to mankind or to store all the data collected from all the scientists in the world for them to collaborate upon? Do you want to be part of a team that strives to bring these to reality?A