In [None]:
'''
function 1 to scrape the whole page for all the assessment links
2 for scraping a single assessment and extracting its details like url
'''

In [22]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import json
import regex as re

CHROMEDRIVER_PATH = r"C:\Users\Lenovo\Desktop\My_Notebooks\SHL AI Research Intern Assessment\chromedriver-win64\chromedriver.exe"
START_URL = "https://www.shl.com/solutions/products/product-catalog/"

options = webdriver.ChromeOptions()
# options.add_argument('--headless')     
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service, options=options)

# -------- Function to scrape a single assessment page ---------
def scrape_assessment(url, adaptive_support):
    print("Scraping a single assessment!")
    driver.get(url)
    time.sleep(3)

    #------ Name ------
    # name_heading = driver.find_element(By.TAG_NAME, "h1")
    # name = name_heading.text
    # print(f"name: {name}")

    # ---- Description ----
    try:
        # print("trying for desciption")
        description_heading = driver.find_element(By.XPATH, "//h4[text()='Description']")
        description = description_heading.find_element(By.XPATH, "following-sibling::p[1]").text

    except Exception as e:
        print(f"An error occurred: {e}")
        description = "N/A"

    # ---- Duration ----
    try:
        # details_heading = driver.find_element(By.XPATH, "//h4[text()='Assessment length']")
        details_heading=driver.find_element(By.XPATH, "//h4[contains(text(), 'Assessment length')]")
        details_text = details_heading.find_element(By.XPATH, "following-sibling::p[1]").text
        # print(f"details text: {details_text}")
        match = re.search(r'\d+', details_text)
        # print(f"match: {match}")
        duration = int(match.group()) if match else "N/A"
        
        # details_headings = driver.find_element(By.TAG_NAME, "h4").text
        # for heading in details_headings:
        #     if heading and heading.strip() in "Assessment length":
        #         print("found assessment length!")
        #         details_text = heading.find_element(By.XPATH, "following-sibling::p[1]").text
        #         match = re.search(r'\d+', details_text)
        #         duration = int(match.group()) if match else "N/A"
        #         if duration:
        #             print("found duration!")
        #     else:
        #         print("couldn't find duration")
    except Exception as e:
        print(f"An error occurred: {e}")
        duration = "N/A"

    # ---- Remote & Test Type ----
    try:
        container = driver.find_elements(By.CSS_SELECTOR, "p.product-catalogue__small-text")

        remote_support = "N/A"
        test_type = []
        test_type_map = {
            "A": "Ability & Aptitude",
            "B": "Biodata & Situational Judgement",
            "C": "Competencies",
            "D": "Development & 360",
            "E": "Assessment Exercises",
            "K": "Knowledge & Skills",
            "P": "Personality & Behavior",
            "S": "Simulations"
        }

        for p in container:
            p_text = p.text.strip()

            # -------- Remote Testing --------
            # if p_text.startswith("Remote Testing:"):
            if "Remote Testing:" in p_text:
                # print("remote tesing section found")
                try:
                    span = p.find_element(By.TAG_NAME, "span")
                    span_class = span.get_attribute("class")
                    remote_support = "Yes" if "-yes" in span_class else "No"
                except Exception as e:
                    print(f"An error occurred: {e}")
                    remote_support = "N/A"

            # -------- Test Type --------
            # elif p_text.startswith("Test Type:"):
            elif "Test Type:" in p_text:
                # print("test type section found")
                try:
                    keys = p.find_elements(By.CLASS_NAME, "product-catalogue__key")
                    test_type_letters = [key.text.strip() for key in keys if key.text.strip()]
                    test_type = [test_type_map.get(letter, f"Unknown ({letter})") for letter in test_type_letters]
                except Exception as e:
                    print(f"An error occurred: {e}")
                    test_type = []
            # else:
            #     print("no remote or test type section found!")
    except Exception as e:
        print(f"An error occurred: {e}")
        remote_support = "N/A"
        test_type = []


    return {
        "url": url,
        "adaptive_support": adaptive_support,
        "description": description,
        "duration": duration,
        "remote_support": remote_support,
        "test_type": test_type
    }


# -------- Function to scrape a catalog page --------
def scrape_catalog_page(url):
    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table tbody tr"))
    )

    assessments_data = []
    rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
    current_section = None
    links_to_scrape = []

    for row in rows:
        try:
            # Section headers
            header = row.find_element(By.CLASS_NAME, "custom__table-heading__title").text
            if header in "Individual Test Solutions":
                current_section = header
                continue
        except Exception as e:
            print(f"An error occurred: {e}")
            pass
##########################################################

        if current_section and current_section.strip() == "Individual Test Solutions":
            # print("Individual test solutions section found!")
            try:
                link_element = row.find_element(By.CSS_SELECTOR, "td.custom__table-heading__title > a")
                assessment_url = link_element.get_attribute("href")

                # Get adaptive support from the 2nd <td> (index 1)
                td_elements = row.find_elements(By.CLASS_NAME, "custom__table-heading__general")
                # print(f"first adaptive element: {td_elements}")
                adaptive_td = td_elements[1]

                # Default is No
                adaptive_support = "No"

                try:
                    # Check if a <span> with '-yes' exists inside the td
                    span = adaptive_td.find_element(By.TAG_NAME, "span")
                    adaptive_class = span.get_attribute("class")
                    if "-yes" in adaptive_class:
                        adaptive_support = "Yes"
                        # print(f"adaptive support: {adaptive_support}")
                except NoSuchElementException:
                    # No span means not adaptive, leave it as "No"
                    pass
                except Exception as e:
                    print(f"Unexpected error while checking adaptive: {e}")

                links_to_scrape.append((assessment_url, adaptive_support))
            except Exception as e:
                print("Error parsing row:", e)

##################################################
        # if current_section and current_section.strip() == "Individual Test Solutions":
        #     print("Individual text solutions section found!")
        #     try:
        #         link_element = row.find_element(By.CSS_SELECTOR, "td.custom__table-heading__title > a")
        #         assessment_url = link_element.get_attribute("href")
        #         # full_url = "https://www.shl.com" + assessment_url
        #         # print(f"full url: {full_}")

        #         td_elements = row.find_elements(By.CLASS_NAME, "custom__table-heading__general")
        #         adaptive_td = td_elements[1]
        #         try:
        #             # span = adaptive_td.find_element(By.CLASS_NAME, "catalogue__circle -yes")
        #             # adaptive_class = span.get_attribute("class")
        #             # adaptive_support = "Yes" if "-yes" in adaptive_class else "No"
        #             adaptive_td=adaptive_td.get_attribute("class")
        #         except Exception as e:
        #             print(f"An error occurred: {e}")
        #             adaptive_support = "No"

        #         # Save data for later use (avoid stale refs)
        #         links_to_scrape.append((assessment_url, adaptive_support))
        #     except Exception as e:
        #         print("Error parsing row:", e)

    # Now go scrape detail pages (after page structure is done)
    for url, adaptive_support in links_to_scrape:
        try:
            data = scrape_assessment(url, adaptive_support)
            assessments_data.append(data)
        except Exception as e:
            print(f"Error scraping assessment detail page: {url}\n{e}")

    return assessments_data

#Finding next page url    
def get_next_page_url(url):
    try:
        driver.get(url)
        print("Looking for next page link...")

        # Wait for the pagination section to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul.pagination"))
        )

        # Scroll to bottom so all dynamic content is rendered
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)  # short sleep for UI updates

        # Wait for ANY next button to appear
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "li.pagination__item.-arrow.-next a"))
        )

        # Find all next page links
        next_buttons = driver.find_elements(By.CSS_SELECTOR, "li.pagination__item.-arrow.-next a")

        print(f"Found {len(next_buttons)} next button(s)")

        if next_buttons:
            # Choose last one to be safe (since usually it's duplicated top/bottom)
            next_href = next_buttons[-1].get_attribute("href")

            # Construct full link if necessary
            full_url = "https://www.shl.com" + next_href if next_href.startswith("/") else next_href
            print(f"Full next page URL: {full_url}")
            return full_url
        else:
            print("No next buttons found.")
            return None

    except Exception as e:
        print("❌ Error getting next page:", e)
        return None

# def get_next_page_url():

#     try:
#         print("trying to find next page link...")
#         WebDriverWait(driver, 15).until(
#             EC.presence_of_element_located((By.CSS_SELECTOR, "ul.pagination"))
#         )

#         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(1)

#         WebDriverWait(driver, 10).until(
#             EC.presence_of_element_located((By.CSS_SELECTOR, "li.pagination__item.-arrow.-next"))
#         )

#         print("after wait")
#         next_buttons = driver.find_elements(By.XPATH, "//li[contains(@class, '-next') and not(contains(@class, 'disabled'))]/a")
#         if next_buttons:
#             print("Next buttons found")
#             if len(next_buttons) >= 2:
#                 next_href = next_buttons[1].get_attribute("href")
#                 full_url = "https://www.shl.com" + next_href if next_href.startswith("/") else next_href
#                 print(f"Full next link: {full_url}")
#                 return full_url
#             else:
#                 print("Only one next button found.")
            
#         else:
#             print("No next button found")
#             return None
#     except Exception as e:
#         print("Error getting next page:", e)
#         return None


# -------- Main loop --------
def main(url):
    # driver.get(url)
    all_data = []
    page_count = 1
    current_url=url

    while True:
        print(f"Scraping Page {page_count}")
        page_data = scrape_catalog_page(current_url)
        all_data.extend(page_data)
       
        # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.5)
        next_url = get_next_page_url(current_url)
        if not next_url:
            break

        current_url = next_url
    #     driver.get(current_url)
    #     WebDriverWait(driver, 10).until(
    #     EC.presence_of_element_located((By.CSS_SELECTOR, "table tbody tr"))
    # )
        page_count += 1

    # Save results
    with open("4shl_individual_assessments.json", "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=2, ensure_ascii=False)

    print(f"Scraped {len(all_data)} individual assessments.")


# Run the script
if __name__ == "__main__":
    main(START_URL)
    driver.quit()


Scraping Page 1
Scraping a single assessment!
An error occurred: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//h4[contains(text(), 'Assessment length')]"}
  (Session info: chrome=135.0.7049.96); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF6F7375355+78597]
	GetHandleVerifier [0x00007FF6F73753B0+78688]
	(No symbol) [0x00007FF6F71291AA]
	(No symbol) [0x00007FF6F717F149]
	(No symbol) [0x00007FF6F717F3FC]
	(No symbol) [0x00007FF6F71D2467]
	(No symbol) [0x00007FF6F71A712F]
	(No symbol) [0x00007FF6F71CF2BB]
	(No symbol) [0x00007FF6F71A6EC3]
	(No symbol) [0x00007FF6F71703F8]
	(No symbol) [0x00007FF6F7171163]
	GetHandleVerifier [0x00007FF6F761EF0D+2870973]
	GetHandleVerifier [0x00007FF6F76196B8+2848360]
	GetHandleVerifier [0x00007FF6F7636993+2967875]
	GetHandleVerifier [0x00007FF6F739019A+188746]
	GetHandleVerifier [0

In [1]:
import pprint
pprint.pprint([{'url': 'https://www.shl.com/solutions/products/product-catalog/view/opq-universal-competency-report/', 'adaptive_support': 'No', 'description': "This OPQ (Occupational Personality Questionnaire) report is based on the Universal Competency framework. It graphically outlines how an individual's typical way of behaving is likely to impact on competencies. It provides a graphical scale for each competency and summarises aspects of personality which contribute (positively or negatively) to each competency.\n\nPlease note this report is currently being updated on Talent Central as part of a reporting refresh project, and there may be visual differences when comparing Talent Central reports to ones generated on other assessment platforms. Differences are regarding the layout and display of the reports only and do not affect the underlying scoring. When viewing sample reports, please ensure you select the appropriate language and platform example.\n\nWhere a sample report has no platform indicated, it is available on both the Talent Central and SODA platforms.", 'duration': 'N/A', 'remote_support': 'Yes', 'test_type': ['Personality & Behavior']}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/opq-universal-competency-report-2-0/', 'adaptive_support': 'No', 'description': "This OPQ (Occupational Personality Questionnaire) report is based on the Universal Competency framework. It graphically outlines how an individual's typical way of behaving is likely to impact on competencies. It provides a graphical scale for each competency and summarises aspects of personality which contribute (positively or negatively) to each competency.", 'duration': 'N/A', 'remote_support': 'Yes', 'test_type': ['Personality & Behavior']}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/opq-ucf-development-action-planner-report/', 'adaptive_support': 'No', 'description': "This OPQ (Occupational Personality Questionnaire) report provides clear and relevant information about an individual's strengths and areas for development, for use in staff development and on-boarding situations.\n\nPlease note this report is currently being updated on Talent Central as part of a reporting refresh project, and there may be visual differences when comparing Talent Central reports to ones generated on other assessment platforms. Differences are regarding the layout and display of the reports only and do not affect the underlying scoring. When viewing sample reports, please ensure you select the appropriate language and platform example.\n\nWhere a sample report has no platform indicated, it is available on both the Talent Central and SODA platforms.", 'duration': 'N/A', 'remote_support': 'Yes', 'test_type': ['Personality & Behavior']}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/opq-manager-plus-report/', 'adaptive_support': 'No', 'description': 'This concise OPQ (Occupational Personality Questionnaire) report is designed for use with and by managers. It uses clear succinct bullets and tables for ease of interpretation. It provide simple comments on each of the personality traits.\n\nPlease note this report is currently being updated on Talent Central as part of a reporting refresh project, and there may be visual differences when comparing Talent Central reports to ones generated on other assessment platforms. Differences are regarding the layout and display of the reports only and do not affect the underlying scoring. When viewing sample reports, please ensure you select the appropriate language and platform example.\n\nWhere a sample report has no platform indicated, it is available on both the Talent Central and SODA platforms.', 'duration': 'N/A', 'remote_support': 'Yes', 'test_type': ['Personality & Behavior']}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/occupational-personality-questionnaire-opq32r/', 'adaptive_support': 'No', 'description': 'The SHL Occupational Personality Questionnaire, the OPQ32, is one of the most widely used and respected measures of workplace behavioural style in the world. It sets a high standard of measurement excellence, providing HR professionals and business managers with relevant and accurate information to make fast and well-informed people decisions. The OPQ32 provides a clear framework for understanding the impact of personality on job performance. It is internationally recognised for its accuracy of assessment. Over 90 independent validation studies have been conducted on the OPQ over a period of 25 years, across 20 countries and 40 industries, providing concrete evidence of its power to predict performance in the workplace.', 'duration': 25, 'remote_support': 'Yes', 'test_type': ['Personality & Behavior']}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/opq-candidate-report-2-0/', 'adaptive_support': 'No', 'description': 'A brief narrative OPQ (Occupational Personality Questionnaire) report structure around 3 OPQ profile sections which can be given to candidates to keep after a feedback session', 'duration': 'N/A', 'remote_support': 'Yes', 'test_type': ['Personality & Behavior']}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/opq-maximising-your-learning-report/', 'adaptive_support': 'No', 'description': 'This OPQ (Occupational Personality Questionnaire) report is designed to help people get the most from their development. It summarises the preferred approach to learning across four dimensions.', 'duration': 'N/A', 'remote_support': 'Yes', 'test_type': ['Personality & Behavior']}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/opq-ucf-development-action-planner-report-2-0/', 'adaptive_support': 'No', 'description': "This OPQ (Occupational Personality Questionnaire) report provides clear and relevant information about an individual's strengths and areas for development, for use in staff development and on-boarding situations.", 'duration': 'N/A', 'remote_support': 'Yes', 'test_type': ['Personality & Behavior']}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/ai-skills/', 'adaptive_support': 'No', 'description': 'The AI Skills assessment measures the skills that help candidates successfully leverage AI in their work.', 'duration': 16, 'remote_support': 'Yes', 'test_type': ['Personality & Behavior']}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/opq-leadership-report/', 'adaptive_support': 'No', 'description': "This OPQ (Occupational Personality Questionnaire) report provides a detailed analysis of an individual's leadership potential. It is based on SHL's leading edge Leadership Model, providing a competency based approach to leadership.", 'duration': 'N/A', 'remote_support': 'Yes', 'test_type': ['Personality & Behavior']}])

[{'adaptive_support': 'No',
  'description': 'This OPQ (Occupational Personality Questionnaire) report is '
                 'based on the Universal Competency framework. It graphically '
                 "outlines how an individual's typical way of behaving is "
                 'likely to impact on competencies. It provides a graphical '
                 'scale for each competency and summarises aspects of '
                 'personality which contribute (positively or negatively) to '
                 'each competency.\n'
                 '\n'
                 'Please note this report is currently being updated on Talent '
                 'Central as part of a reporting refresh project, and there '
                 'may be visual differences when comparing Talent Central '
                 'reports to ones generated on other assessment platforms. '
                 'Differences are regarding the layout and display of the '
                 'reports only and do not affect the underl