In [1]:
# Single-letter codes that follow "Test Type:" in a product's text, mapped to
# the full category name each code is expanded to.
category_dict = {
    "A": "Ability & Aptitude",
    "B": "Biodata & Situational Judgement",
    "C": "Competencies",
    "D": "Development & 360",
    "E": "Assessment Exercises",
    "K": "Knowledge & Skills",
    "P": "Personality & Behavior",
    "S": "Simulations"
}


def convert_test_type_string(test_str):
    """Expand the one-letter codes of a "Test Type:" string into full names.

    Args:
        test_str: A piece of text. Only strings that start with
            "Test Type:" are rewritten.

    Returns:
        "Test Type: " followed by the comma-separated category names of the
        codes found after the label. A code that is not a key of
        ``category_dict`` is rendered as ``Unknown(<code>)``. Whitespace and
        commas between codes are treated as separators, not as codes.
        Any string that does not start with "Test Type:" is returned unchanged.
    """
    prefix = "Test Type:"
    if not test_str.startswith(prefix):
        return test_str

    codes = test_str.replace(prefix, "").strip()
    full_names = [
        category_dict.get(char, f"Unknown({char})")
        for char in codes
        # Separators such as "A K" or "A, K" must not become "Unknown( )".
        if not (char.isspace() or char == ",")
    ]
    return "Test Type: " + ", ".join(full_names)

In [11]:
import csv
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Scraper configuration


# Site root; it is prepended to the hrefs found in the catalog tables.
BASE_URL = "https://www.shl.com"
# Catalog listing URL; the start offset (and a type filter) is appended to it.
CATALOG_URL = f"{BASE_URL}/solutions/products/product-catalog/?start="

# Headers passed to every requests.get() call in this notebook.
headers = {"User-Agent": "Mozilla/5.0"}

def get_product_links(URL):
    """Collect product links and their Remote/Adaptive flags from catalog pages.

    Every page in ``URL`` is fetched and the rows of its first
    ``div.custom__table-responsive`` element are scanned. A row contributes
    one entry only when it has a title cell with a link (``<a href>``) and at
    least two ``custom__table-heading__general`` cells, so the three returned
    lists always have the same length and share indices.

    Pages that cannot be fetched, do not answer with HTTP 200, or contain no
    catalog table are reported on stdout and skipped.

    Args:
        URL: Iterable of catalog page URLs to fetch.

    Returns:
        A tuple ``(links, remote, adap)`` of parallel lists. ``links`` holds
        ``(title, full_url)`` pairs; ``remote`` and ``adap`` hold 1 when the
        first / second general cell of the same row contains a
        ``catalogue__circle -yes`` span, else 0.
    """
    remote = []
    adap = []
    links = []
    for page_url in URL:
        try:
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(page_url, headers=headers, timeout=30)
            fetched = response.status_code == 200
        except requests.RequestException:
            fetched = False
        time.sleep(4)  # pause between catalog page requests
        if not fetched:
            print(f"❌ Failed to fetch {page_url}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # NOTE(review): find() returns only the first matching element. If a
        # catalog page renders more than one such table, the later ones are
        # ignored — confirm against the live pages.
        table_div = soup.find("div", class_="custom__table-responsive")
        if table_div is None:
            print(f"❌ No catalog table found on {page_url}")
            continue

        for tr in table_div.find_all("tr"):
            title_td = tr.find("td", class_="custom__table-heading__title")
            if not title_td:
                continue
            general_tds = tr.find_all("td", class_="custom__table-heading__general")
            if len(general_tds) < 2:
                # Both the remote and the adaptive cell are needed.
                continue
            a_tag = title_td.find("a")
            href = a_tag.get("href") if a_tag else None
            if not href:
                # Without a link nothing is appended to `links`, so the flags
                # must be skipped too or the lists would go out of step.
                continue

            links.append((a_tag.text.strip(), urljoin(BASE_URL, href)))
            remote.append(_has_yes_marker(general_tds[0]))
            adap.append(_has_yes_marker(general_tds[1]))
    return links, remote, adap


def _has_yes_marker(cell):
    """Return 1 if ``cell`` contains a ``catalogue__circle -yes`` span, else 0."""
    return 1 if cell.find("span", class_="catalogue__circle -yes") else 0

def get_description(product_url):
    """Fetch one product page and return its descriptive text as one string.

    The text is taken from the ``<p>`` elements inside every ``div`` that has
    a CSS class starting with ``product-catalogue-training-calendar__row``.
    Sections without an ``<h4>`` heading, and the section whose heading is
    "Downloads", are ignored. "Test Type:" paragraphs are expanded with
    ``convert_test_type_string``, whitespace runs are collapsed to one space,
    and a paragraph that is exactly "Remote Testing:" is dropped.

    Args:
        product_url: Absolute URL of the product page.

    Returns:
        The collected paragraphs joined by single spaces;
        "Description not available." when the request fails or the response
        status is not 200; "Description not found." when no text was collected.
    """
    try:
        # A timeout keeps one stalled connection from hanging the run.
        response = requests.get(product_url, headers=headers, timeout=30)
    except requests.RequestException:
        # Treat connection errors like a bad status so a single failing page
        # does not abort the whole scrape.
        response = None
    if response is None or response.status_code != 200:
        print(f"❌ Failed to fetch {product_url}")
        return "Description not available."

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all description sections based on partial class match
    desc_sections = soup.find_all("div", class_=lambda c: c and c.startswith("product-catalogue-training-calendar__row"))

    description_parts = []
    for section in desc_sections:
        # The heading belongs to the section, so look it up once per section.
        heading = section.find("h4")
        if not heading or heading.get_text(strip=True) == "Downloads":
            continue
        for p in section.find_all("p"):
            text = convert_test_type_string(p.get_text(strip=True))
            text = re.sub(r'\s+', ' ', text).strip()
            if not text or text == "Remote Testing:":
                continue
            description_parts.append(text)

    # Joining avoids the trailing space that string accumulation left behind.
    return " ".join(description_parts) if description_parts else "Description not found."

def main():
    """Scrape the catalog and write one CSV row per product.

    Builds the catalog page URLs (offsets 0..372 in steps of 12 with
    ``type=1``, then offsets 0..132 in steps of 12 with ``type=2``), collects
    the product links and flags from them, fetches each product's description
    and writes everything to ``shl_products.csv``.
    """
    page_urls = [f"{CATALOG_URL}{start}&type=1" for start in range(0, 373, 12)]
    page_urls += [f"{CATALOG_URL}{start}&type=2" for start in range(0, 133, 12)]

    products, remote, adap = get_product_links(page_urls)

    rows = []
    # NOTE(review): product pages are requested back-to-back; there is no
    # delay between them — confirm that this is intended.
    for (title, url), remote_flag, adaptive_flag in zip(products, remote, adap):
        print(f"Scraping: {title}")
        description = get_description(url)

        remote_label = "No"
        if remote_flag == 1:
            remote_label = "Yes"
            description += " Remote Testing"

        adaptive_label = "No"
        if adaptive_flag == 1:
            adaptive_label = "Yes"
            description += " Adaptive/IRT"

        rows.append([title, url, remote_label, adaptive_label, description])

    # Save to CSV
    with open("shl_products.csv", "w", newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "URL","Remote Testing","ADAPTIVE/IRT","Description"])
        writer.writerows(rows)

    print("Scraping completed. Data saved to shl_products.csv")

# Start the scrape only when this code runs as the main module, not on import.
if __name__ == "__main__":
    main()


Scraping: Account Manager Solution
Scraping: Administrative Professional - Short Form
Scraping: Agency Manager Solution
Scraping: Apprentice + 8.0 Job Focused Assessment
Scraping: Apprentice 8.0 Job Focused Assessment
Scraping: Bank Administrative Assistant - Short Form
Scraping: Bank Collections Agent - Short Form
Scraping: Bank Operations Supervisor - Short Form
Scraping: Bilingual Spanish Reservation Agent Solution
Scraping: Bookkeeping, Accounting, Auditing Clerk Short Form
Scraping: Branch Manager - Short Form
Scraping: Cashier Solution
Scraping: Adobe Experience Manager (New)
Scraping: Adobe Photoshop CC
Scraping: Aeronautical Engineering (New)
Scraping: Aerospace Engineering (New)
Scraping: Agile Software Development
Scraping: Agile Testing (New)
Scraping: AI Skills
Scraping: Amazon Web Services (AWS) Development (New)
Scraping: Android Development (New)
Scraping: Angular 6 (New)
Scraping: AngularJS (New)
Scraping: Apache Hadoop (New)
Scraping: Apache Hadoop Extensions (New)
Scr