# Scraping AI Job Board with Python
## ABB #6 - Session 1

Code authored by: Shaw Talebi

### imports

In [1]:
import json
from pathlib import Path

import json5
import pandas as pd
import requests
from bs4 import BeautifulSoup

### 1) extract job listing links

In [2]:
# URL of the website
job_board_url = "https://aijobs.ai/united-states"

# Send a GET request to the website; the timeout keeps a stalled
# connection from hanging the notebook indefinitely
response = requests.get(job_board_url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Get the HTML content
    html_content = response.text
else:
    # Stop here instead of only printing: the following cells need
    # html_content, and continuing would either raise a NameError or
    # silently reuse stale HTML left over from an earlier run
    raise RuntimeError(
        f"Failed to retrieve the page. Status code: {response.status_code}"
    )

In [3]:
# Build a BeautifulSoup tree from the downloaded HTML with the 'html.parser' backend
soup = BeautifulSoup(markup=html_content, features='html.parser')

In [4]:
# every job card is an <a> element with class jobcardStyle1 that carries an href attribute
job_card_list = soup.select("a.jobcardStyle1[href]")

# collect the link target of each card
job_url_list = [job_card["href"] for job_card in job_card_list]

In [5]:
# number of job URLs found on the first page
len(job_url_list)

26

#### repeat for the next 9 pages (pages 2–10)

In [6]:
def get_page_soup(url, timeout=30):
    """
    Downloads a web page and parses it into a BeautifulSoup object.

    Args:
        url (str): The URL of the page to fetch.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        BeautifulSoup: The page HTML parsed with 'html.parser'.

    Raises:
        requests.HTTPError: If the response status code is not 200.
        requests.RequestException: If the request itself fails
            (e.g. connection error or timeout).
    """
    # Send a GET request to the website; without a timeout a stalled
    # server would block the caller forever
    response = requests.get(url, timeout=timeout)

    # Check if the request was successful
    if response.status_code != 200:
        # Raise instead of printing and falling through: there is no HTML
        # to parse, and the old code failed later with UnboundLocalError
        raise requests.HTTPError(
            f"Failed to retrieve the page. Status code: {response.status_code}",
            response=response,
        )

    return BeautifulSoup(response.text, 'html.parser')

In [7]:
def extract_job_urls_page(soup):
    """
    Collects the job listing links found on one job board page.

    Args:
        soup (BeautifulSoup): Parsed HTML of a job board page.

    Returns:
        list: The href value of every <a> element with class jobcardStyle1
            that has an href attribute, in the order returned by the selector.
    """
    return [card["href"] for card in soup.select("a.jobcardStyle1[href]")]

In [8]:
# repeat for 9 pages
# page 1 was scraped above, so the ?page= query parameter runs from 2 to 10
for page_num in range(9):
    # fetch and parse the next results page
    soup = get_page_soup(f'{job_board_url}?page={page_num + 2}')

    # append that page's job urls to the running list
    job_url_list += extract_job_urls_page(soup)

In [9]:
# total number of job URLs after adding the 9 extra pages
len(job_url_list)

260

In [10]:
# display the collected job URLs
job_url_list

['https://aijobs.ai/job/ai-agent-engineer-13',
 'https://aijobs.ai/job/full-time-ai-developer-stealth-saas-project-ai-first-high-growth',
 'https://aijobs.ai/job/full-time-ai-engineer-for-long-term-role-in-self-funded-company',
 'https://aijobs.ai/job/online-data-research',
 'https://aijobs.ai/job/full-stack-software-engineer-13',
 'https://aijobs.ai/job/ai-prompt-engineer-part-time-flexible-schedule',
 'https://aijobs.ai/job/senior-software-engineer-agents',
 'https://aijobs.ai/job/lead-software-engineer-ml-backend',
 'https://aijobs.ai/job/staff-engineer-software-autonomy-applications-r3166-3',
 'https://aijobs.ai/job/ml-application-security-engineer',
 'https://aijobs.ai/job/solutions-engineering-senior-manager',
 'https://aijobs.ai/job/ml-engineer-llm-evaluation',
 'https://aijobs.ai/job/ml-engineer-llm-safety',
 'https://aijobs.ai/job/ml-engineer-llm-privacy',
 'https://aijobs.ai/job/ml-research-scientist-llm-safety',
 'https://aijobs.ai/job/staff-software-engineer-34',
 'https://

### 2) extract info from one listing

In [11]:
# fetch and parse a single listing: the second URL collected above (index 1)
sample_job_url = job_url_list[1]
soup = get_page_soup(sample_job_url)

#### pull json data

In [12]:
# Locate the <script type="application/ld+json"> element holding the structured job data
script_tag = soup.find('script', type='application/ld+json')

# Only parse when the page actually has a JSON-LD block
if script_tag:
    job_data = json5.loads(script_tag.string)

    # Pull out the fields of interest
    company_name = job_data['hiringOrganization']['name']
    job_title = job_data['title']
    job_description = job_data['description']
    salary_value = job_data['baseSalary']['value']
    salary_min = salary_value['minValue']
    salary_max = salary_value['maxValue']
    location = job_data['jobLocation']['address']['addressRegion']

    # Show a short summary (description truncated to its first 200 characters)
    summary_lines = [
        f"Company Name: {company_name}",
        f"Job Title: {job_title}",
        f"Job Description: {job_description[:200]}...",
        f"Salary Range: {salary_min} - {salary_max} USD",
        f"Location: {location}",
    ]
    print("\n".join(summary_lines))

Company Name: Tyron Humphris
Job Title: Full-Time AI Developer – Stealth SaaS Project (AI-First, High Growth)
Job Description: <p>We’re building a cutting-edge B2B SaaS platform that leverages AI agents, real-time data aggregation, and multi-source intelligence to surface high-value business insights. We’re looking for a self...
Salary Range: 250001 - 250001 USD
Location: Florida


### 3) extract info from all listings

In [13]:
def _as_dict(value):
    """Returns value if it is a dict, the first dict in it if it is a list, otherwise {}."""
    if isinstance(value, dict):
        return value
    if isinstance(value, list):
        return next((item for item in value if isinstance(item, dict)), {})
    return {}


def extract_job_info(url):
    """
    Extracts job information from a given job listing URL.

    Args:
        url (str): The URL of the job listing.

    Returns:
        dict: On success, a dictionary containing the following key-value pairs:
            - 'company_name' (str): Name of the hiring organization.
            - 'job_title' (str): Title of the job.
            - 'job_description' (str): Detailed description of the job.
            - 'salary_min' (float or str): Minimum salary offered for the job.
            - 'salary_max' (float or str): Maximum salary offered for the job.
            - 'location' (str): Region of the job location's address.
            A field is 'N/A' when its key (or any enclosing object) is absent
            from the JSON-LD data.

            On failure, a dictionary with a single 'error' key describing
            what went wrong; no exception is raised.
    """
    try:
        # get soup object
        soup = get_page_soup(url)

        # Find the script tag containing JSON-LD
        script_tag = soup.find('script', type='application/ld+json')

        if not script_tag:
            return {'error': 'No JSON-LD script found in the page'}

        # .string is None when the tag has more than one child node,
        # so fall back to the concatenated text
        raw_text = script_tag.string or script_tag.get_text(strip=True)
        job_data = json5.loads(raw_text)

        # Nested objects may be missing, explicitly null, or (for
        # jobLocation) a list; `.get(key, {})` only covers the first case
        # and fails on the others, so normalise each level to a dict
        organization = _as_dict(job_data.get('hiringOrganization'))
        salary_data = _as_dict(_as_dict(job_data.get('baseSalary')).get('value'))
        address = _as_dict(_as_dict(job_data.get('jobLocation')).get('address'))

        return {
            'company_name': organization.get('name', 'N/A'),
            'job_title': job_data.get('title', 'N/A'),
            'job_description': job_data.get('description', 'N/A'),
            'salary_min': salary_data.get('minValue', 'N/A'),
            'salary_max': salary_data.get('maxValue', 'N/A'),
            'location': address.get('addressRegion', 'N/A'),
        }

    except requests.RequestException as e:
        return {'error': f"Request failed: {e}"}

    except Exception as e:
        # Boundary handler: callers rely on getting an error dict back
        # rather than an exception (e.g. malformed JSON-LD)
        return {'error': f"An unexpected error occurred: {e}"}

In [14]:
# extract job info from all job urls
job_info_list = []
raw_JSON_list = []

for job_url in job_url_list:
    # extract job info
    job_info = extract_job_info(job_url)

    # store results in list if no errors occurred; extract_job_info reports
    # failures as a dict with an 'error' key instead of raising
    if "error" in job_info:
        print(f"Could not extract info from: {job_url}.")
    else:
        print(job_info["job_title"])
        job_info_list.append(job_info)

    # save raw JSON-LD
    # NOTE: this downloads the page a second time, because extract_job_info
    # does not expose the soup it fetched
    try:
        soup = get_page_soup(job_url)
        script_tag = soup.find('script', type='application/ld+json')
    except Exception as e:
        # keep the loop going: record the URL without a tag so that one bad
        # page does not discard everything collected so far
        print(f"Could not fetch raw JSON-LD from: {job_url}. ({e})")
        script_tag = None

    raw_JSON_list.append({"job_url": job_url, "JSON": script_tag})

Could not extract info from: https://aijobs.ai/job/ai-agent-engineer-13.
Full-Time AI Developer – Stealth SaaS Project (AI-First, High Growth)
Could not extract info from: https://aijobs.ai/job/full-time-ai-engineer-for-long-term-role-in-self-funded-company.
Could not extract info from: https://aijobs.ai/job/online-data-research.
Could not extract info from: https://aijobs.ai/job/full-stack-software-engineer-13.
AI Prompt Engineer - PART TIME, FLEXIBLE SCHEDULE
Could not extract info from: https://aijobs.ai/job/senior-software-engineer-agents.
Could not extract info from: https://aijobs.ai/job/lead-software-engineer-ml-backend.
Could not extract info from: https://aijobs.ai/job/staff-engineer-software-autonomy-applications-r3166-3.
Could not extract info from: https://aijobs.ai/job/ml-application-security-engineer.
Could not extract info from: https://aijobs.ai/job/solutions-engineering-senior-manager.
Could not extract info from: https://aijobs.ai/job/ml-engineer-llm-evaluation.
Could

### 4) Store data in Pandas dataframe

In [15]:
# one row per successfully scraped listing; preview the first five rows
df = pd.DataFrame(data=job_info_list)
df.head(n=5)

Unnamed: 0,company_name,job_title,job_description,salary_min,salary_max,location
0,Tyron Humphris,Full-Time AI Developer – Stealth SaaS Project ...,<p>We’re building a cutting-edge B2B SaaS plat...,250001,250001,Florida
1,Meir Levy,"AI Prompt Engineer - PART TIME, FLEXIBLE SCHEDULE",<p>Looking for an AI Prompt Engineer OR a very...,50,50,California
2,ByteDance,Machine Learning Engineer Graduate (E-Commerce...,ResponsibilitiesTeam Introduction:Join the E-c...,237500,237500,
3,ByteDance,Research Scientist in Quantum Chemistry and Ma...,ResponsibilitiesAbout the team:Our team at Byt...,355000,355000,
4,ByteDance,Machine Learning Engineer Graduate (TikTok E-C...,ResponsibilitiesTeam Introduction:Our team is ...,237500,237500,


In [17]:
# (number of rows, number of columns) of the scraped data
df.shape

(72, 6)

In [18]:
# save to file
# create the output folder first; to_csv does not create missing directories
Path("data").mkdir(parents=True, exist_ok=True)
df.to_csv("data/ai_job_data.csv", index=False)

In [19]:
# save raw JSON
# write one JSON object per line: the job URL plus its JSON-LD payload
with open("data/raw_json_list.jsonl", "w", encoding="utf-8") as out_file:
    for entry in raw_JSON_list:
        ld_tag = entry.get("JSON")

        # text of the <script> tag, or None when no tag was stored for this URL
        raw_text = (ld_tag.string or ld_tag.get_text(strip=True)) if ld_tag else None

        payload = None
        if raw_text:
            try:
                payload = json5.loads(raw_text)  # parsed dict
            except Exception:
                payload = raw_text  # fallback to raw string

        line = json.dumps(
            {"job_url": entry.get("job_url"), "jsonld": payload},
            ensure_ascii=False,
        )
        out_file.write(line + "\n")