# Job Board Scraping Lab

In this lab you will first see a minimal but fully functional code snippet to scrape the LinkedIn Job Search webpage. You will then work on top of the example code and complete several chanllenges.

### Some Resources 

- [Requests library](http://docs.python-requests.org/en/master/#the-user-guide) documentation 
- [Beautiful Soup Doc](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
- [Urllib](https://docs.python.org/3/library/urllib.html#module-urllib)
- [re lib](https://docs.python.org/3/library/re.html)
- [Scrapy](https://scrapy.org/)
- [List of HTTP status codes](https://en.wikipedia.org/wiki/List_of_HTTP_status_codes)
- [HTML basics](http://www.simplehtmlguide.com/cheatsheet.php)
- [CSS basics](https://www.cssbasics.com/#page_start)

In [11]:
# Import the required libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests
from urllib.parse import quote_plus
from requests.exceptions import RequestException
import time
import re

def scrape_linkedin_job_search(
    keywords,
    num_pages=1,
    country=None,
    num_days=None,
    pause=1.0,
    timeout=10,
    fetch_seniority=False,
    per_job_pause=0.5,
):
    # Base URL (let requests handle query params)
    BASE_URL = "https://www.linkedin.com/jobs/search/"

    # Build base params. Let requests encode them; don't pre-encode spaces.
    params_base = {"keywords": keywords}
    if country:
        # LinkedIn accepts a plain location string; geoId is more reliable but not public.
        params_base["location"] = country
    if num_days is not None and isinstance(num_days, (int, float)) and num_days > 0:
        seconds = int(num_days * 86400)
        params_base["f_TPR"] = f"r{seconds}"

    # Headers to look like a browser
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Connection": "keep-alive",
    }

    def extract_job_id_and_url(card):
        job_id = None
        job_url = None
        link = card.select_one("a.base-card__full-link") or card.find("a")
        if link and link.get("href"):
            job_url = link.get("href")
            m = re.search(r"/jobs/view/(\d+)", job_url)
            if m:
                job_id = m.group(1)
            else:
                m = re.search(r"currentJobId=(\d+)", job_url)
                if m:
                    job_id = m.group(1)
        if not job_id:
            urn = card.get("data-entity-urn") or card.get("data-urn")
            if urn:
                m = re.search(r"urn:li:jobPosting:(\d+)", urn)
                if m:
                    job_id = m.group(1)
        if not job_id:
            # Other possible attributes
            for attr in ("data-id", "data-job-id", "data-job-id"):  # duplicates okay
                if card.get(attr) and str(card.get(attr)).isdigit():
                    job_id = str(card.get(attr))
                    break
        return job_id, job_url

    def fetch_job_seniority(session, job_id, job_url):
        # Prefer the discovered URL; otherwise, build from job_id
        url = job_url or (f"https://www.linkedin.com/jobs/view/{job_id}/" if job_id else None)
        if not url:
            return None
        try:
            resp = session.get(url, headers=headers, timeout=timeout)
        except RequestException:
            return None
        if resp.status_code != 200:
            return None
        soup = BeautifulSoup(resp.text, "html.parser")

        # Newer markup: description__job-criteria
        for li in soup.select("li.description__job-criteria-item"):
            header = li.select_one("h3")
            value = li.select_one("span.description__job-criteria-text, span.job-criteria__text")
            if header and "seniority" in header.get_text(strip=True).lower():
                return value.get_text(strip=True) if value else None

        # Older markup: job-criteria__item
        for li in soup.select("li.job-criteria__item"):
            header = li.select_one("h3.job-criteria__subheader") or li.find("h3")
            value = li.select_one("span.job-criteria__text")
            if header and "seniority" in header.get_text(strip=True).lower():
                return value.get_text(strip=True) if value else None

        # Fallback: search for any element mentioning Seniority
        text_hit = soup.find(string=lambda t: isinstance(t, str) and "seniority" in t.lower())
        if text_hit:
            # Try the parent block
            parent = text_hit.parent
            if parent:
                nxt = parent.find_next(string=True)
                if isinstance(nxt, str) and nxt.strip() and "seniority" not in nxt.lower():
                    return nxt.strip()
        return None

    rows = []
    session = requests.Session()

    for page in range(max(1, int(num_pages))):
        params = dict(params_base)
        # LinkedIn paginates with 'start' offset of 25 per page
        params["start"] = page * 25
        try:
            resp = session.get(BASE_URL, params=params, headers=headers, timeout=timeout)
        except RequestException as e:
            print(f"Request error on page {page+1}: {e}")
            break

        if resp.status_code != 200:
            # 999/403 are common when blocked
            print(f"HTTP {resp.status_code} for page {page+1}. Anti-bot likely. Stopping.")
            break

        soup = BeautifulSoup(resp.text, "html.parser")

        # Try current selectors (as of recent LinkedIn markup). Fallback to old selectors if needed
        cards = soup.select("div.base-card")
        if not cards:
            cards = soup.select("div.result-card__contents")

        if not cards:
            print(f"No job cards found on page {page+1}. HTML changed or blocked.")
            break

        for card in cards:
            # Extract fields
            title_el = card.select_one("h3.base-search-card__title") or card.find("h3")
            company_el = card.select_one("h4.base-search-card__subtitle") or card.find("h4")
            location_el = card.select_one("span.job-search-card__location") or card.find(
                "span", attrs={"class": "job-result-card__location"}
            )

            title = title_el.get_text(strip=True) if title_el else None
            company = company_el.get_text(strip=True) if company_el else None
            location = location_el.get_text(strip=True) if location_el else None

            job_id, job_url = extract_job_id_and_url(card)
            seniority = None
            if fetch_seniority:
                seniority = fetch_job_seniority(session, job_id, job_url)
                time.sleep(max(0.0, float(per_job_pause)))

            if any([title, company, location, job_id]):
                rows.append({
                    "Title": title,
                    "Company": company,
                    "Location": location,
                    "JobId": job_id,
                    "JobUrl": job_url,
                    "SeniorityLevel": seniority,
                })

        # Be polite; avoid hammering
        time.sleep(max(0.0, float(pause)))

    # Build DataFrame with optional columns
    columns = ["Title", "Company", "Location", "JobId", "JobUrl", "SeniorityLevel"]
    data = pd.DataFrame(rows, columns=columns) if rows else pd.DataFrame(columns=columns)
    return data

In [None]:
# Example to call the function
results = scrape_linkedin_job_search('data analysis', num_pages=1)
results.head()

Unnamed: 0,Title,Company,Location,JobId,JobUrl,SeniorityLevel
0,"Data Analyst, Disney Advertising Data and Meas...",The Walt Disney Company,"Santa Monica, CA",4294362961,https://www.linkedin.com/jobs/view/data-analys...,
1,Data Analyst,James Search Group,United States,4294394193,https://www.linkedin.com/jobs/view/data-analys...,
2,Business Data Analyst,"TechnoSphere, Inc.","Texas, United States",4292789745,https://www.linkedin.com/jobs/view/business-da...,
3,Junior Data Analyst,Tech Consulting,"Boston, MA",4294061386,https://www.linkedin.com/jobs/view/junior-data...,
4,Data & Analytics Intern,Lensa,"Plano, TX",4294308298,https://www.linkedin.com/jobs/view/data-analyt...,


## Challenge 1

The first challenge for you is to update the `scrape_linkedin_job_search` function by adding a new parameter called `num_pages`. This will allow you to search more than 25 jobs with this function. Suggested steps:

1. Go to https://www.linkedin.com/jobs/search/?keywords=data%20analysis in your browser.
1. Scroll down the left panel and click the page 2 link. Look at how the URL changes and identify the page offset parameter.
1. Add `num_pages` as a new param to the `scrape_linkedin_job_search` function. Update the function code so that it uses a "for" loop to retrieve several pages of search results.
1. Test your new function by scraping 5 pages of the search results.

Hint: Prepare for the case where there are less than 5 pages of search results. Your function should be robust enough to **not** trigger errors. Simply skip making additional searches and return all results if the search already reaches the end.

In [None]:
# your code here
keywords = 'data analysis'

# Increase pause a bit to be polite and reduce blocking risk
results_5 = scrape_linkedin_job_search(keywords, num_pages=5, pause=1.5, timeout=12)

print(f"Total rows returned: {len(results_5)}")
results_5.head(10)

Total rows returned: 294


Unnamed: 0,Title,Company,Location,JobId,JobUrl,SeniorityLevel
0,"Data Analyst, Disney Advertising Data and Meas...",The Walt Disney Company,"Santa Monica, CA",4294362961,https://www.linkedin.com/jobs/view/data-analys...,
1,Data Analyst,James Search Group,United States,4294394193,https://www.linkedin.com/jobs/view/data-analys...,
2,Business Data Analyst,"TechnoSphere, Inc.","Texas, United States",4292789745,https://www.linkedin.com/jobs/view/business-da...,
3,Junior Data Analyst,Tech Consulting,"Boston, MA",4294061386,https://www.linkedin.com/jobs/view/junior-data...,
4,Data & Analytics Intern,Lensa,"Plano, TX",4294308298,https://www.linkedin.com/jobs/view/data-analyt...,
5,Data Analyst,Pentangle Tech Services | P5 Group,United States,4294378520,https://www.linkedin.com/jobs/view/data-analys...,
6,Data Analyst,Intracruit Solutions,Dallas-Fort Worth Metroplex,4294392103,https://www.linkedin.com/jobs/view/data-analys...,
7,Data Analytics Intern,Lensa,"Indianapolis, IN",4294308348,https://www.linkedin.com/jobs/view/data-analyt...,
8,Data Engineer Intern,ProArch,"Atlanta, GA",4294342548,https://www.linkedin.com/jobs/view/data-engine...,
9,DATA SCIENCE INTERNSHIP,Lensa,"Tallahassee, FL",4294306797,https://www.linkedin.com/jobs/view/data-scienc...,


## Challenge 2

Further improve your function so that it can search jobs in a specific country. Add the 3rd param to your function called `country`. The steps are identical to those in Challenge 1.

In [14]:
# your code here
keywords = 'data analysis'
country = 'Portugal'

results_country = scrape_linkedin_job_search(
    keywords,
    num_pages=3,      # up to 3 pages
    country=country,  # country filter
    pause=1.5,
    timeout=12,
)

print(f"Country: {country} | Total rows: {len(results_country)}")
results_country.head(10)

Country: Portugal | Total rows: 180


Unnamed: 0,Title,Company,Location,JobId,JobUrl,SeniorityLevel
0,Junior consumer insight Analyst,NielsenIQ,Lisbon Metropolitan Area,4267762794,https://pt.linkedin.com/jobs/view/junior-consu...,
1,Data Scientist,Landskill,"Lisbon, Lisbon, Portugal",4190232149,https://pt.linkedin.com/jobs/view/data-scienti...,
2,Junior market research Consultant,NielsenIQ,Lisbon Metropolitan Area,4267770236,https://pt.linkedin.com/jobs/view/junior-marke...,
3,Data Analyst - Power BI/QlikSense,Fujitsu,Portugal,4292373541,https://pt.linkedin.com/jobs/view/data-analyst...,
4,Junior BI Analyst,TransPerfect,"Lisbon, Portugal",4268577855,https://pt.linkedin.com/jobs/view/junior-bi-an...,
5,Business Analyst,Landskill,"Lisbon, Lisbon, Portugal",4190233134,https://pt.linkedin.com/jobs/view/business-ana...,
6,Junior Healthcare Data Analyst,knokcare,"Porto, Portugal",4290916404,https://pt.linkedin.com/jobs/view/junior-healt...,
7,Data Analyst,Irium Portugal,"Alfragide, Lisbon, Portugal",4218725954,https://pt.linkedin.com/jobs/view/data-analyst...,
8,Data Analyst,Inetum,"Lisbon, Portugal",4289242209,https://pt.linkedin.com/jobs/view/data-analyst...,
9,Product Analyst,Paydock,Portugal,4281488535,https://pt.linkedin.com/jobs/view/product-anal...,


## Challenge 3

Add the 4th param called `num_days` to your function to allow it to search jobs posted in the past X days. Note that in the LinkedIn job search the searched timespan is specified with the following param:

```
f_TPR=r259200
```

The number part in the param value is the number of seconds. 259,200 seconds equal to 3 days. You need to convert `num_days` to number of seconds and supply that info to LinkedIn job search.

In [15]:
# your code here
keywords = 'data analysis'
country = 'Portugal'
num_days = 3          # last 3 days

results_recent = scrape_linkedin_job_search(
    keywords,
    num_pages=3,
    country=country,
    num_days=num_days,
    pause=1.5,
    timeout=12,
)

print(f"Last {num_days} days | Country: {country} | Total rows: {len(results_recent)}")
results_recent.head(10)

Last 3 days | Country: Portugal | Total rows: 177


Unnamed: 0,Title,Company,Location,JobId,JobUrl,SeniorityLevel
0,Junior BI Analyst,TransPerfect,"Lisbon, Portugal",4268577855,https://pt.linkedin.com/jobs/view/junior-bi-an...,
1,Analyst (Portugal),Cleerly,"Lisboa, Lisbon, Portugal",4138246067,https://pt.linkedin.com/jobs/view/analyst-port...,
2,Process & Business Improvement,Airbus,"Lisbon, Lisbon, Portugal",4268550490,https://pt.linkedin.com/jobs/view/process-busi...,
3,Senior Data & Business Analyst,LUZA Group,"Lisbon, Lisbon, Portugal",4294203406,https://pt.linkedin.com/jobs/view/senior-data-...,
4,"Finance and Business Analyst, Supply Chain",Bose Corporation,"Lisboa, Lisbon, Portugal",4281062468,https://pt.linkedin.com/jobs/view/finance-and-...,
5,Power BI Specialist,Adentis Portugal,"Lisbon, Portugal",4290869610,https://pt.linkedin.com/jobs/view/power-bi-spe...,
6,Business Analyst - Junior e Mid,Irium Portugal,"Lisbon, Lisbon, Portugal",4294148261,https://pt.linkedin.com/jobs/view/business-ana...,
7,Business Analyst,emagine - Portugal,Lisbon Metropolitan Area,4292787356,https://pt.linkedin.com/jobs/view/business-ana...,
8,Business Analyst,TENDAM,"Lisbon, Portugal",4291761439,https://pt.linkedin.com/jobs/view/business-ana...,
9,Data Analyst,Decskill,Lisbon Metropolitan Area,4291762839,https://pt.linkedin.com/jobs/view/data-analyst...,


## Bonus Challenge

Allow your function to also retrieve the "Seniority Level" of each job searched. Note that the Seniority Level info is not in the initial search results. You need to make a separate search request for each job card based on the `currentJobId` value which you can extract from the job card HTML.

After you obtain the Seniority Level info, update the function and add it to a new column of the returned dataframe.

In [None]:
# your code here
keywords = 'data analysis'
country = 'Portugal'

results_with_seniority = scrape_linkedin_job_search(
    keywords,
    num_pages=2,
    country=country,
    num_days=3,
    fetch_seniority=True,
    pause=1.5,
    per_job_pause=0.8,
    timeout=12,
)

print(f"Rows with attempted seniority: {len(results_with_seniority)}")
# Show only rows where we actually captured a SeniorityLevel (if any)
results_with_seniority[results_with_seniority['SeniorityLevel'].notna()].head(10)