# Example 1: Scraping AI Job Board
## ABB #7 - Session 1

Code authored by: Shaw Talebi

### imports

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd

### 1) get list of (unique) job urls

In [2]:
# Collect the URLs of individual job postings from the first few listing pages.
# A posting can show up on more than one listing page, so URLs are
# de-duplicated across ALL pages (not just within a page) while keeping the
# order in which they were first seen.
NUM_LISTING_PAGES = 5
LISTING_TIMEOUT = 10  # seconds; without a timeout a stalled server blocks forever

job_url_list = []
seen_job_urls = set()

for page_number in range(1, NUM_LISTING_PAGES + 1):
    # construct url
    url = f"https://aijobs.ai/engineer?location=United%20States&page={page_number}"

    # perform get request; skip the page (instead of parsing an error page or
    # aborting the whole crawl) when the request fails
    try:
        response = requests.get(url, timeout=LISTING_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping listing page {url}: {exc}")
        continue

    # parse html
    soup = BeautifulSoup(response.text, "html.parser")

    # grab all job urls on this page (sorted so the per-page order is stable)
    job_cards = soup.find_all("a", class_="jobcardStyle1")
    job_urls_temp = sorted({a["href"] for a in job_cards if a.get("href")})

    # add only urls that have not been collected from an earlier page
    for candidate_url in job_urls_temp:
        if candidate_url not in seen_job_urls:
            seen_job_urls.add(candidate_url)
            job_url_list.append(candidate_url)

### 2) scrape job data from url

Data extracted:
- Job Title
- Org
- Salary (minimum and maximum)
- Location
- Job Description
- Job Type

In [27]:
# A dollar amount such as "$150,000", "$150,000.50" or "$100k". The k/K suffix
# is captured as part of the amount so that "$100k" is not truncated to "$100".
_SALARY_AMOUNT = r"\$([\d,]+(?:\.\d+)?[kK]?)"
# Two amounts separated by a dash (hyphen, en dash, em dash) or the word "to".
_SALARY_RANGE_RE = re.compile(
    _SALARY_AMOUNT + r"\s*(?:[-–—]+|to)\s*" + _SALARY_AMOUNT, re.I
)
_SALARY_SINGLE_RE = re.compile(_SALARY_AMOUNT)
_JOB_TYPE_LABEL_RE = re.compile(r"Job Type", re.I)
_LOCATION_LABEL_RE = re.compile(r"Location", re.I)


def _first_nonempty(*vals):
    """Return the first truthy value (strings are stripped first), or ""."""
    for v in vals:
        if isinstance(v, str):
            v = v.strip()
        if v:
            return v
    return ""


def _text_or_none(el):
    """Return the whitespace-normalised text of an element, or None if absent."""
    return el.get_text(" ", strip=True) if el is not None else None


def _clean(s):
    """Collapse every run of whitespace in *s* to a single space and trim it."""
    return re.sub(r"\s+", " ", s).strip()


def _as_dict(node):
    """Return *node* if it is a dict, the first dict of a list, else {}."""
    if isinstance(node, dict):
        return node
    if isinstance(node, list):
        for item in node:
            if isinstance(item, dict):
                return item
    return {}


def _text(value):
    """Coerce a JSON-LD value to text: str as-is, dict -> its "name", number -> str."""
    if isinstance(value, str):
        return value
    if isinstance(value, dict):
        return _text(value.get("name"))
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value)
    return ""


def _is_job_posting(node):
    """Tell whether a JSON-LD node declares the JobPosting type."""
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "JobPosting" in node_type
    return node_type == "JobPosting"


def _find_job_posting(soup):
    """Return the first JobPosting object found in the page's JSON-LD, or None."""
    for tag in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(tag.string or tag.text or "")
        except json.JSONDecodeError:
            continue

        # Normalize to a list of top-level objects.
        candidates = data if isinstance(data, list) else [data]
        for obj in candidates:
            if not isinstance(obj, dict):
                continue
            # Objects may also be nested under an "@graph" array.
            graph = obj.get("@graph")
            nodes = [obj] + (graph if isinstance(graph, list) else [])
            for node in nodes:
                if isinstance(node, dict) and _is_job_posting(node):
                    return node
    return None


def _format_location(location_node):
    """Build "locality region country" text from one JSON-LD jobLocation entry."""
    address = location_node.get("address") if isinstance(location_node, dict) else None
    if isinstance(address, str):
        return address.strip()
    addr = _as_dict(address)
    parts = (
        _text(addr.get(key)).strip()
        for key in ("addressLocality", "addressRegion", "addressCountry")
    )
    return " ".join(part for part in parts if part)


def extract_job_data(soup: BeautifulSoup) -> dict:
    """
    Extracts job data from an AI job posting page.

    Structured JSON-LD (a schema.org JobPosting object) is preferred; any field
    it does not provide is filled in from the visible page markup.

    Args:
        soup: parsed HTML of a single job posting page.

    Returns:
        dict with string values under the keys: job_title, organization,
        min_salary, max_salary, location, job_description, job_type.
        A field that cannot be found is an empty string.
    """
    title = org = location = description = job_type = ""
    min_salary = max_salary = ""

    # -------- 1) Try JSON-LD (JobPosting) --------
    jp = _find_job_posting(soup)
    if jp:
        title = _text(jp.get("title"))

        # hiringOrganization may be an object or just the organisation's name.
        hiring = jp.get("hiringOrganization")
        org = (
            _text(hiring if isinstance(hiring, str) else _as_dict(hiring))
            or _text(_as_dict(jp.get("identifier")).get("name"))
        )

        jt = jp.get("employmentType")
        if isinstance(jt, (list, tuple)):
            job_type = ", ".join(t for t in map(_text, jt) if t)
        else:
            job_type = _text(jt)

        # jobLocation may be a single object or a list of them.
        raw_locations = jp.get("jobLocation")
        if not isinstance(raw_locations, list):
            raw_locations = [raw_locations]
        formatted = (_format_location(node) for node in raw_locations)
        location = "; ".join(loc for loc in formatted if loc)

        base_salary = _as_dict(jp.get("baseSalary"))
        currency = _text(base_salary.get("currency"))
        value = base_salary.get("value")
        min_v = max_v = None
        unit = ""
        if isinstance(value, dict):
            min_v = value.get("minValue")
            max_v = value.get("maxValue")
            unit = _text(value.get("unitText"))
        elif isinstance(value, (int, float, str)) and not isinstance(value, bool):
            # A bare amount instead of a range object: report it as the minimum,
            # mirroring what the single-amount text fallback below does.
            min_v = value
        if min_v:
            min_salary = f"{currency} {min_v} {unit}".strip()
        if max_v:
            max_salary = f"{currency} {max_v} {unit}".strip()

        raw_desc = jp.get("description")
        if isinstance(raw_desc, str) and raw_desc:
            # The description is an HTML fragment; keep only its text.
            description = BeautifulSoup(raw_desc, "html.parser").get_text(" ", strip=True)

    # -------- 2) Fallbacks from visible DOM --------
    title = _first_nonempty(
        title,
        _text_or_none(soup.select_one(".post-main-title2")),
        _text_or_none(soup.find("h1")),
    )

    org = _first_nonempty(
        org,
        _text_or_none(soup.select_one(".post-info2 a p span:last-child")),
        _text_or_none(soup.select_one('[itemprop="hiringOrganization"]')),
    )

    # Value rendered after a "Job Type" label; the label is looked up only once.
    job_type_label = soup.find(string=_JOB_TYPE_LABEL_RE)
    job_type_from_label = None
    if job_type_label is not None:
        label_parent = job_type_label.find_parent()
        if label_parent is not None:
            job_type_from_label = _text_or_none(label_parent.find_next("span"))
    job_type = _first_nonempty(
        job_type,
        _text_or_none(soup.select_one(".job-type .tw-uppercase, .job-type .f-size-14")),
        job_type_from_label,
    )

    if not location:
        loc_element = soup.select_one(".job-type .remote p")
        if loc_element is not None:
            location = loc_element.get_text(" ", strip=True)
        else:
            # Only a "Location" label was found. The label is a text node, and
            # recent Beautiful Soup text nodes also have get_text(), so they
            # cannot be told apart from elements by that attribute; the value
            # is the <p> that follows the label's parent, not the label itself.
            loc_label = soup.find(string=_LOCATION_LABEL_RE)
            loc_parent = loc_label.find_parent() if loc_label is not None else None
            loc_value = loc_parent.find_next("p") if loc_parent is not None else None
            location = _text_or_none(loc_value) or ""

    # Salary fallback: look for a "$min - $max" range, else a single amount.
    if not (min_salary or max_salary):
        page_text = soup.get_text(" ", strip=True)
        range_match = _SALARY_RANGE_RE.search(page_text)
        if range_match:
            min_salary = f"${range_match.group(1)}"
            max_salary = f"${range_match.group(2)}"
        else:
            single_match = _SALARY_SINGLE_RE.search(page_text)
            if single_match:
                min_salary = f"${single_match.group(1)}"

    if not description:
        desc_container = soup.select_one(".job-description-container")
        description = _text_or_none(desc_container) or ""

    return {
        "job_title": _clean(title),
        "organization": _clean(org),
        "min_salary": _clean(min_salary),
        "max_salary": _clean(max_salary),
        "location": _clean(location),
        "job_description": _clean(description),
        "job_type": _clean(job_type),
    }


In [39]:
# Download every job posting page and extract its fields into one dict per job.
JOB_PAGE_TIMEOUT = 10  # seconds; without a timeout a stalled server blocks forever

job_data_list = []

# A single session reuses the underlying connection across the many requests
# made to the same host.
with requests.Session() as session:
    for job_url in job_url_list:
        # A posting that cannot be fetched is reported and skipped so that one
        # bad URL neither aborts the crawl nor adds a record parsed from an
        # error page.
        try:
            response = session.get(job_url, timeout=JOB_PAGE_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Skipping {job_url}: {exc}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        job_data = extract_job_data(soup)

        job_data_list.append(job_data)

In [40]:
# sanity check: number of job records collected by the scraping loop
len(job_data_list)

105

### 3) data cleaning

In [41]:
# one row per scraped posting; columns are the keys of the dicts returned by extract_job_data
df = pd.DataFrame(job_data_list)

In [42]:
# display the DataFrame to inspect the scraped fields
df

Unnamed: 0,job_title,organization,min_salary,max_salary,location,job_description,job_type
0,"Developer Experience Engineer, AI Compiler",Tenstorrent,$100,$500,"Austin, Texas, United States",Tenstorrent is leading the industry on cutting...,Full Time
1,Forward Deployed Engineer (AI Agent),Cresta,"$150,000","$250,000","United States, Remote",Cresta is on a mission to turn every customer ...,Full Time
2,Fullstack Engineer,Lightning AI,"$120,000","$250,000","San Francisco, California, United States",Who We Are Lightning AI is the company reimagi...,Full Time
3,Gameplay Engineer (AI),2K,"$170,000","$215,000","Sparks Glencoe, Maryland, United States",Firaxis Games is seeking a motivated gameplay ...,Full Time
4,GTM AI Engineer,Motive,"$107,000","$147,000",United States - Remote,Who we are: Motive empowers the people who run...,Full Time
...,...,...,...,...,...,...,...
100,Staff AI Engineer,WillowTree,"$150,000","$174,000","Boston, Massachusetts, United States; Charlott...",AI Engineer (Staff) Who We Are Welcome to TELU...,Full Time
101,Staff AI Engineer,Apollo.io,"$200,000","$280,000","Remote, United States",Apollo.io is the leading go-to-market solution...,Full Time
102,"Staff Computer Vision Engineer, Perception",Metropolis,"$220,000","$250,000","Seattle, Washington, United States",Who we are Metropolis is an artificial intelli...,Full Time
103,"Staff Engineer, AI/ML Software Compiler","Samsung Semiconductor, Inc.","$157,000","$243,000","San Jose, California, United States",Please Note: To provide the best candidate exp...,Full Time


### 4) save it as a .csv

In [43]:
from pathlib import Path

# to_csv does not create missing folders, so make sure the output directory
# exists before writing.
output_path = Path('data/job_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# The DataFrame index is written as the first (unnamed) CSV column, exactly as
# before; pass index_col=0 to pd.read_csv when loading the file back.
df.to_csv(output_path)