In [None]:
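'''
Function 1 (scrape_catalog_page): scrape a whole catalog page for all the assessment links.
Function 2 (scrape_assessment): scrape a single assessment page and extract its details
(URL, description, duration, remote-testing support and test types).
'''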

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import json
import regex as re

# Entry URL of the catalog and location of the chromedriver binary to launch.
START_URL = "https://www.shl.com/solutions/products/product-catalog/"
CHROMEDRIVER_PATH = r"C:\Users\Lenovo\Desktop\My_Notebooks\SHL AI Research Intern Assessment\chromedriver-win64\chromedriver.exe"

# Chrome is started headless; the flags are applied in the order listed.
options = webdriver.ChromeOptions()
for _chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(_chrome_flag)

# Single module-level browser session used by the scraping functions.
service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(options=options, service=service)

def scrape_assessment(url, adaptive_support):
    """Open one assessment detail page and collect its fields.

    Args:
        url: Address of the assessment detail page to load.
        adaptive_support: Value obtained by the caller; it is copied into
            the result unchanged.

    Returns:
        A dict with the keys ``url``, ``adaptive_support``, ``description``,
        ``duration``, ``remote_support`` and ``test_type``. A field that
        cannot be read falls back to ``"N/A"`` (``[]`` for ``test_type``).
    """
    print("Scraping a single assessment!")
    driver.get(url)
    time.sleep(3)  # fixed pause after navigation before querying the DOM

    # Description: text of the first <p> after the "Description" <h4>.
    try:
        desc_title = driver.find_element(By.XPATH, "//h4[text()='Description']")
        desc_paragraph = desc_title.find_element(By.XPATH, "following-sibling::p[1]")
        description = desc_paragraph.text
    except Exception as e:
        print(f"An error occurred: {e}")
        description = "N/A"

    # Duration: first run of digits in the <p> after "Assessment length".
    try:
        length_title = driver.find_element(By.XPATH, "//h4[contains(text(), 'Assessment length')]")
        length_text = length_title.find_element(By.XPATH, "following-sibling::p[1]").text
        digits = re.search(r'\d+', length_text)
        if digits:
            duration = int(digits.group())
        else:
            duration = "N/A"
    except Exception as e:
        print(f"An error occurred: {e}")
        duration = "N/A"

    # Single-letter keys shown on the page, expanded to their full names.
    letter_to_type = {
        "A": "Ability & Aptitude",
        "B": "Biodata & Situational Judgement",
        "C": "Competencies",
        "D": "Development & 360",
        "E": "Assessment Exercises",
        "K": "Knowledge & Skills",
        "P": "Personality & Behavior",
        "S": "Simulations"
    }

    try:
        paragraphs = driver.find_elements(By.CSS_SELECTOR, "p.product-catalogue__small-text")
        remote_support = "N/A"
        test_type = []

        for para in paragraphs:
            para_text = para.text.strip()

            if "Remote Testing:" in para_text:
                # The span's class carries a "-yes" marker when supported.
                try:
                    indicator = para.find_element(By.TAG_NAME, "span")
                    indicator_class = indicator.get_attribute("class")
                    if "-yes" in indicator_class:
                        remote_support = "Yes"
                    else:
                        remote_support = "No"
                except Exception as e:
                    print(f"An error occurred: {e}")
                    remote_support = "N/A"
                continue

            if "Test Type:" not in para_text:
                continue

            try:
                key_elements = para.find_elements(By.CLASS_NAME, "product-catalogue__key")
                letters = [el.text.strip() for el in key_elements if el.text.strip()]
                expanded = []
                for letter in letters:
                    expanded.append(letter_to_type.get(letter, f"Unknown ({letter})"))
                test_type = expanded
            except Exception as e:
                print(f"An error occurred: {e}")
                test_type = []
    except Exception as e:
        # Any failure while walking the paragraphs discards partial results.
        print(f"An error occurred: {e}")
        remote_support = "N/A"
        test_type = []

    result = {
        "url": url,
        "adaptive_support": adaptive_support,
        "description": description,
        "duration": duration,
        "remote_support": remote_support,
        "test_type": test_type
    }
    return result


def scrape_catalog_page(url):
    """Scrape every "Individual Test Solutions" assessment listed on a catalog page.

    The page's table rows are walked in order. Rows are ignored until a row
    whose title cell reads exactly "Individual Test Solutions" is seen; for
    every row after it, the assessment link and the adaptive-support flag are
    collected. Each collected link is then opened with ``scrape_assessment``.

    Args:
        url: Address of the catalog page to load.

    Returns:
        A list of the dicts returned by ``scrape_assessment``, one per
        assessment that could be scraped. Rows or detail pages that fail are
        reported with ``print`` and skipped.

    Raises:
        Whatever ``driver.get`` or the initial ``WebDriverWait`` raises when
        the table never appears; that first wait is not guarded here.
    """
    target_section = "Individual Test Solutions"

    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table tbody tr"))
    )

    assessments_data = []
    rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
    current_section = None
    links_to_scrape = []

    for row in rows:
        try:
            header = row.find_element(By.CLASS_NAME, "custom__table-heading__title").text
            # Compare for equality. The previous `header in "<section>"` test
            # was a reversed substring check: an empty title (or any fragment
            # such as "Test") matched, which skipped the row and clobbered
            # current_section so every following row was dropped too.
            if header.strip() == target_section:
                current_section = target_section
                continue
        except NoSuchElementException:
            # A row without a title cell is simply not a section header.
            pass
        except Exception as e:
            print(f"An error occurred: {e}")

        if current_section != target_section:
            continue

        try:
            link_element = row.find_element(By.CSS_SELECTOR, "td.custom__table-heading__title > a")
            assessment_url = link_element.get_attribute("href")

            # The adaptive flag is read from the second "general" cell; an
            # IndexError for a shorter row is reported by the handler below.
            td_elements = row.find_elements(By.CLASS_NAME, "custom__table-heading__general")
            adaptive_td = td_elements[1]

            adaptive_support = "No"
            try:
                span = adaptive_td.find_element(By.TAG_NAME, "span")
                # get_attribute may return None when the span has no class
                # attribute; treat that as "no marker" instead of raising.
                adaptive_class = span.get_attribute("class") or ""
                if "-yes" in adaptive_class:
                    adaptive_support = "Yes"
            except NoSuchElementException:
                pass
            except Exception as e:
                print(f"Unexpected error while checking adaptive: {e}")

            links_to_scrape.append((assessment_url, adaptive_support))
        except Exception as e:
            print("Error parsing row:", e)

    # Detail pages are visited only after all rows were read, because
    # navigating away would invalidate the row elements of this page.
    for assessment_url, adaptive_support in links_to_scrape:
        try:
            data = scrape_assessment(assessment_url, adaptive_support)
            assessments_data.append(data)
        except Exception as e:
            print(f"Error scraping assessment detail page: {assessment_url}\n{e}")

    return assessments_data

def get_next_page_url(url):
    """Load a catalog page and return the URL behind its "next" pagination arrow.

    Args:
        url: Catalog page whose pagination control should be inspected.

    Returns:
        The next page's URL as a string, or ``None`` when no next link is
        found or any exception is raised along the way (the exception is
        printed, not propagated).
    """
    next_link_css = "li.pagination__item.-arrow.-next a"

    try:
        driver.get(url)
        print("Looking for next page link...")

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul.pagination"))
        )

        # Scroll to the bottom, then pause briefly for UI updates.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, next_link_css))
        )

        candidates = driver.find_elements(By.CSS_SELECTOR, next_link_css)
        print(f"Found {len(candidates)} next button(s)")

        if not candidates:
            print("No next buttons found.")
            return None

        # Use the last matching link; site-relative hrefs get the host added.
        href = candidates[-1].get_attribute("href")
        if href.startswith("/"):
            resolved = "https://www.shl.com" + href
        else:
            resolved = href
        print(f"Full next page URL: {resolved}")
        return resolved

    except Exception as e:
        print("Error getting next page:", e)
        return None




def main(url, output_path="4shl_individual_assessments.json"):
    """Crawl the catalog page by page and save all scraped assessments as JSON.

    Starting at ``url``, each page is scraped with ``scrape_catalog_page`` and
    the following page is located with ``get_next_page_url`` until no next
    page is reported.

    Args:
        url: Address of the first catalog page.
        output_path: File the collected records are written to. Defaults to
            the file name this script has always used.

    Raises:
        Any exception raised while scraping propagates to the caller, but
        only after the records collected so far have been written.
    """
    all_data = []
    page_count = 1
    current_url = url
    visited = {url}

    try:
        while True:
            print(f"Scraping Page {page_count}")
            all_data.extend(scrape_catalog_page(current_url))

            time.sleep(0.5)
            next_url = get_next_page_url(current_url)
            # Stop at the end of the pagination, and also when the "next"
            # link leads back to a page already scraped (would loop forever).
            if not next_url or next_url in visited:
                break

            visited.add(next_url)
            current_url = next_url
            page_count += 1
    finally:
        # Persist in `finally` so a failure on a later page does not throw
        # away the pages that were already scraped.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(all_data, f, indent=2, ensure_ascii=False)

        print(f"Scraped {len(all_data)} individual assessments.")


if __name__ == "__main__":
    # Quit the browser in `finally` so the Chrome/chromedriver processes are
    # shut down even when the scrape aborts with an exception.
    try:
        main(START_URL)
    finally:
        driver.quit()
