In [202]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager

In [203]:
def init_driver():
    try:
        service = Service(ChromeDriverManager().install())
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.110 Safari/537.36")
        driver = webdriver.Chrome(service=service, options=options)
        return driver
    except Exception as e:
        print(f"Error initializing the WebDriver: {e}")
        return None

In [204]:
def handle_popups(driver):
    try:
        # Handle cookie consent popup
        accept_cookies_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Accept all cookies")]'))
        )
        print("Found 'Accept all cookies' button.")
        accept_cookies_button.click()

        time.sleep(2)  # Give some time for the popup to close

        # Try closing the organizational login popup
        close_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, '//button[@aria-label="Close"]'))
        )
        print("Found 'Close' button for organizational login.")
        close_button.click()

        # Ensure the popup is hidden
        time.sleep(2)
        popup_hidden = driver.execute_script("""
            return document.querySelector('div[role="dialog"]') === null;
        """)
        
        if not popup_hidden:
            print("Organizational login popup is still present.")
            driver.save_screenshot('popup_not_hidden.png')  # Save screenshot for debugging
            raise Exception("Failed to hide the organizational login popup.")
        else:
            print("Organizational login popup successfully hidden.")
    except Exception as e:
        print(f"Error handling popups: {e}")
        driver.save_screenshot('popup_error_screenshot.png')  # Save screenshot for debugging
        raise

In [205]:
def fetch_html(driver, url):
    retries = 3
    while retries > 0:
        try:
            driver.get(url)
            driver.save_screenshot('initial_page_load.png')  # Save screenshot for debugging initial page load
            handle_popups(driver)  # Handle any popups
            driver.save_screenshot('after_handling_popups.png')  # Save screenshot after handling popups
            
            # Scroll to the bottom to ensure all content is loaded
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Give some time for the content to load

            # Wait for all JavaScript to finish executing
            WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'ResultItem'))
            )
            
            # Explicitly wait for the results to be visible
            WebDriverWait(driver, 20).until(
                EC.visibility_of_element_located((By.CLASS_NAME, 'ResultItem'))
            )
            
            time.sleep(5)  # Ensure the page is fully loaded
            return driver.page_source
        except Exception as e:
            print(f"Error fetching HTML content: {e}")
            driver.save_screenshot(f'error_screenshot_{3-retries+1}.png')  # Save screenshot for debugging
            retries -= 1
            print(f"Retrying... ({3-retries}/3)")
            time.sleep(5)  # Wait before retrying
    return None

In [206]:
def parse_articles(html):
    soup = BeautifulSoup(html, 'html.parser')
    articles = soup.find_all('li', class_='ResultItem')
    return articles

In [207]:
def extract_pdf_links(articles):
    pdf_links = []
    for article in articles:
        pdf_tag = article.find('a', class_='download-link')
        if pdf_tag and 'href' in pdf_tag.attrs:
            pdf_links.append('https://www.sciencedirect.com' + pdf_tag['href'])
    return pdf_links

In [208]:
def download_pdfs(pdf_links, download_dir='pdfs'):
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    for link in pdf_links:
        response = requests.get(link)
        response.raise_for_status()
        filename = link.split('/')[-1].split('?')[0]
        filepath = os.path.join(download_dir, filename)
        with open(filepath, 'wb') as f:
            f.write(response.content)
        print(f'Downloaded: {filepath}')

In [209]:
def main():
    driver = init_driver()
    if driver is None:
        print("WebDriver initialization failed. Exiting.")
        return
    
    base_url = "https://www.sciencedirect.com/search?qs=nuclear&langs=en&lastSelectedFacet=accessTypes&accessTypes=openaccess&show=25&offset={}"
    page_offset = 0
    try:
        while True:
            url = base_url.format(page_offset)
            print(f"Fetching URL: {url}")
            html = fetch_html(driver, url)
            if html is None:
                print("Failed to fetch HTML content. Exiting.")
                break
            articles = parse_articles(html)
            if not articles:
                break
            pdf_links = extract_pdf_links(articles)
            download_pdfs(pdf_links)
            page_offset += 25
            time.sleep(2)  # Adding a delay to be respectful to the server
    finally:
        if driver:
            driver.quit()

if __name__ == '__main__':
    main()

Fetching URL: https://www.sciencedirect.com/search?qs=nuclear&langs=en&lastSelectedFacet=accessTypes&accessTypes=openaccess&show=25&offset=0
Found 'Accept all cookies' button.
Error handling popups: Message: 
Stacktrace:
	GetHandleVerifier [0x00CDB8E3+45827]
	(No symbol) [0x00C6DCC4]
	(No symbol) [0x00B6150F]
	(No symbol) [0x00BA20BC]
	(No symbol) [0x00BA216B]
	(No symbol) [0x00BDE0F2]
	(No symbol) [0x00BC2E44]
	(No symbol) [0x00BDC034]
	(No symbol) [0x00BC2B96]
	(No symbol) [0x00B96998]
	(No symbol) [0x00B9751D]
	GetHandleVerifier [0x00F94513+2899763]
	GetHandleVerifier [0x00FE793D+3240797]
	GetHandleVerifier [0x00D613B4+593364]
	GetHandleVerifier [0x00D682DC+621820]
	(No symbol) [0x00C770A4]
	(No symbol) [0x00C737A8]
	(No symbol) [0x00C73947]
	(No symbol) [0x00C659FE]
	BaseThreadInitThunk [0x768E7BA9+25]
	RtlInitializeExceptionChain [0x777DC10B+107]
	RtlClearBits [0x777DC08F+191]

Error fetching HTML content: Message: 
Stacktrace:
	GetHandleVerifier [0x00CDB8E3+45827]
	(No symbol) [0