### Import Libraries

In [16]:
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse

import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

### Setup and Configuration

In [17]:
# Directory that receives every downloaded PDF; created if it does not exist yet.
download_folder = "downloaded_pdfs"
os.makedirs(download_folder, exist_ok=True)

# URLs that have already been handed to the downloader (used to skip duplicates).
downloaded_pdfs = set()

# Chrome runs headless with a fixed window size.
chrome_options = Options()
for chrome_flag in ('--headless', '--disable-gpu', '--no-sandbox', "--window-size=1920,1080"):
    chrome_options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=chrome_options)
driver.set_page_load_timeout(100)

# Open the repository landing page; the main loop starts from here.
driver.get("https://pi-pil-repository.sahpra.org.za/")

# A single requests session is shared by all PDF downloads.
session = requests.Session()

### Helper Functions

In [18]:
def wait_for_page_load(driver, timeout=60):
    """
    Block until the browser reports ``document.readyState == "complete"``.

    If the state is not reached within ``timeout`` (passed to WebDriverWait),
    a warning is printed and the function returns normally; no exception
    is propagated to the caller.
    """
    def _document_ready(browser):
        # Ask the page itself for its load state.
        return browser.execute_script("return document.readyState") == "complete"

    waiter = WebDriverWait(driver, timeout)
    try:
        waiter.until(_document_ready)
    except TimeoutException:
        print("⚠️ Timeout waiting for the page to fully load.")

def wait_for_pdf_links(driver, timeout=60):
    """
    Poll the page until the table contains at least one link whose href ends in '.pdf'.

    The DOM is queried about once per second. The matching elements are
    returned as soon as any exist; if none appear before ``timeout`` seconds
    have elapsed, TimeoutException is raised.
    """
    pdf_link_selector = "table tr td a[href$='.pdf']"
    deadline = time.time() + timeout
    while time.time() < deadline:
        found = driver.find_elements(By.CSS_SELECTOR, pdf_link_selector)
        if not found:
            # Nothing rendered yet: pause briefly, then look again.
            time.sleep(1)
            continue
        return found
    raise TimeoutException("Timeout waiting for PDF links.")

def sanitize_filename(text):
    """
    Turn arbitrary text into a token that is safe to use inside a file name.

    Steps, applied in order:
      1. Strip leading and trailing whitespace.
      2. Collapse each run of whitespace into one underscore.
      3. Drop every character that is not a word character or a hyphen.
      4. Collapse each run of underscores into one underscore.
    """
    # (pattern, replacement) pairs; the order matters.
    substitutions = (
        (r'\s+', '_'),
        (r'[^\w\-]', ''),
        (r'_+', '_'),
    )
    cleaned = text.strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def determine_file_type(link):
    """
    Classify a PDF link as "pi", "pil" or "unknown".

    Parameters:
        link: an element-like object exposing ``.text`` and
              ``.get_attribute("href")``.

    Returns:
        "pil" or "pi" when a marker is found, otherwise "unknown".

    The checks run from most to least specific:
      1. The link's visible text.
      2. The last segment of the URL (the file name), for "epil"/"-pil"
         and then "epi"/"-pi".
      3. The rest of the URL *after the host*.

    The host is deliberately left out of step 3: a host name that itself
    contains "pi" or "pil" would otherwise classify every link on that
    site and make "unknown" unreachable.
    """
    link_text = (link.text or "").strip().lower()
    # get_attribute may return None when the attribute is absent.
    href = (link.get_attribute("href") or "").lower()

    # "pil" is tested before "pi" everywhere because "pi" is a substring of it.
    if "pil" in link_text:
        return "pil"
    if "pi" in link_text:
        return "pi"

    if not href:
        return "unknown"

    # Keep only the part of the URL that follows scheme and host.
    parsed = urlparse(href)
    location = parsed.path
    if parsed.query:
        location = f"{location}?{parsed.query}"

    # Fallback: look at the file name (text after the last slash).
    filename = location.rsplit("/", 1)[-1]
    if "epil" in filename or "-pil" in filename:
        return "pil"
    if "epi" in filename or "-pi" in filename:
        return "pi"

    # Last resort: any mention in the path/query.
    if "pil" in location:
        return "pil"
    if "pi" in location:
        return "pi"

    return "unknown"

def download_pdf(pdf_url, pdf_filename):
    """
    Download a PDF from ``pdf_url`` and store it at ``pdf_filename``.

    Returns:
        ``pdf_filename`` on success; ``None`` when the URL was already
        handled or when the download failed (the error is printed).

    Behaviour notes:
      - The URL is recorded in ``downloaded_pdfs`` up front so the same URL
        is not fetched again, and removed again if the download fails so a
        later attempt can retry it.
      - Data is streamed into a temporary ``.part`` file that is renamed to
        the final name only after the whole body was written, so a failed
        transfer never leaves a truncated PDF under the final name.
    """
    if pdf_url in downloaded_pdfs:
        return None
    downloaded_pdfs.add(pdf_url)

    temp_filename = f"{pdf_filename}.part"
    try:
        # The context manager releases the streamed connection even on error.
        with session.get(pdf_url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(temp_filename, "wb") as pdf_file:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        pdf_file.write(chunk)
        os.replace(temp_filename, pdf_filename)
        return pdf_filename
    except Exception as e:
        print(f"Error downloading {pdf_url}: {e}")
        # Allow this URL to be attempted again later.
        downloaded_pdfs.discard(pdf_url)
        try:
            os.remove(temp_filename)
        except OSError:
            # The partial file may never have been created; nothing to clean up.
            pass
        return None

### Main Loop: Iterate Through Pages

In [19]:
current_page = 1
max_retries = 2  # Number of refresh attempts per page


def _wait_for_links_with_refresh(driver, page_number, max_retries):
    """Wait for PDF links, refreshing the page after each timeout; return True if links appeared."""
    for attempt in range(1, max_retries + 1):
        try:
            wait_for_pdf_links(driver, timeout=60)
            return True
        except TimeoutException:
            print(f"⚠️ Timeout waiting for PDF links on page {page_number}. Refreshing (attempt {attempt})...")
            driver.refresh()
            wait_for_page_load(driver, timeout=60)
    return False


def _collect_row_pdfs(driver, row_index, stale_retries=3):
    """Return a list of (pdf_url, target_path) tuples for the table row at ``row_index``."""
    for attempt in range(stale_retries):
        try:
            # Re-fetch the rows on every attempt so a stale reference is never reused.
            rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
            if row_index >= len(rows):
                print(f"⚠️ Row {row_index} no longer exists in the table. Skipping row.")
                return []
            row = rows[row_index]

            try:
                product_name = row.find_element(By.CSS_SELECTOR, "td:nth-child(1)").text.strip()
            except NoSuchElementException:
                # Rows without a <td> (e.g. header rows) have no product name.
                product_name = ""
            # Sanitising can strip every character, so apply the fallback afterwards.
            product_name = sanitize_filename(product_name) or "unknown"

            pdf_links = row.find_elements(By.CSS_SELECTOR, "td a[href$='.pdf']")

            # Results are gathered locally and only returned once the whole row
            # succeeded, so a retry cannot contribute the same link twice.
            row_info = []
            if len(pdf_links) == 2:
                # With exactly two links, the first is taken as PI and the second as PIL.
                for link, file_type in zip(pdf_links, ("pi", "pil")):
                    pdf_url = link.get_attribute("href")
                    target = os.path.join(download_folder, f"{product_name}-{file_type}.pdf")
                    row_info.append((pdf_url, target))
            else:
                for link in pdf_links:
                    pdf_url = link.get_attribute("href")
                    if not pdf_url or pdf_url in downloaded_pdfs:
                        continue

                    file_type = determine_file_type(link)
                    if file_type not in ("pi", "pil"):
                        print(f"⚠️ Unable to determine file type for URL: {pdf_url}")
                        continue

                    target = os.path.join(download_folder, f"{product_name}-{file_type}.pdf")
                    row_info.append((pdf_url, target))
            return row_info
        except StaleElementReferenceException:
            if attempt < stale_retries - 1:
                print(f"⚠️ Stale element caught for row {row_index}. Retrying (attempt {attempt + 1})...")
                time.sleep(1)
            else:
                print(f"⚠️ Skipping row {row_index} after repeated stale element errors.")
    return []


def _go_to_next_page(driver):
    """Click the pagination 'next' control; return True if a click was issued, False to stop."""
    try:
        pagination = driver.find_element(By.CSS_SELECTOR, "ul.pagination")
        next_button = pagination.find_element(By.XPATH, ".//li/a[contains(text(),'›')]")

        # The 'disabled' marker may sit on the link itself or on its parent <li>;
        # get_attribute can return None, hence the "or ''" guards.
        own_class = next_button.get_attribute("class") or ""
        parent_class = next_button.find_element(By.XPATH, "..").get_attribute("class") or ""
        if "disabled" in own_class or "disabled" in parent_class:
            print("No more pages to process.")
            return False

        # Click the next button using JavaScript
        driver.execute_script("arguments[0].click();", next_button)
        wait_for_page_load(driver, timeout=60)
        time.sleep(2)  # Give a moment for asynchronous content to load
        return True
    except (NoSuchElementException, TimeoutException):
        print("'Next' button not found or page did not load in time, stopping...")
        return False


try:
    with ThreadPoolExecutor(max_workers=5) as executor:
        while True:
            print(f"\n📄 Processing Page {current_page}...")

            # Ensure the page is fully loaded before proceeding
            wait_for_page_load(driver, timeout=60)

            # -------------------------------
            # Collect (url, path) pairs from every table row
            # -------------------------------
            pdf_info = []
            if _wait_for_links_with_refresh(driver, current_page, max_retries):
                row_count = len(driver.find_elements(By.CSS_SELECTOR, "table tr"))
                for i in range(row_count):
                    pdf_info.extend(_collect_row_pdfs(driver, i))
            else:
                # Do not 'continue' here: the browser must still advance to the
                # next page below, otherwise the same page is processed forever.
                print(f"⚠️ Failed to load PDF links on page {current_page} after {max_retries} attempts. Skipping page...")

            # -------------------------------
            # Download PDFs concurrently
            # -------------------------------
            # Submit each URL at most once per page (first target path wins) so
            # two workers never fetch the same URL at the same time.
            unique_downloads = {}
            for url, filename in pdf_info:
                unique_downloads.setdefault(url, filename)

            if unique_downloads:
                futures = [
                    executor.submit(download_pdf, url, filename)
                    for url, filename in unique_downloads.items()
                ]
                for future in as_completed(futures):
                    result = future.result()
                    if result:
                        print(f"Downloaded: {result}")

            # -------------------------------
            # Navigate to the Next Page
            # -------------------------------
            if not _go_to_next_page(driver):
                break
            current_page += 1
finally:
    # Always close the browser, even if the loop aborts with an exception.
    driver.quit()
