In [59]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [65]:
# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [66]:
def download_file(url, file_name):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            with open(file_name, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded {file_name}")
        else:
            print(f"Failed to download file from {url}. Status code: {response.status_code}")
    except Exception as e:
        print(f"Error downloading {file_name} from {url}. Error: {e}")

In [67]:
def download_pdfs(url, download_folder):
    driver.get(url)
    time.sleep(5)  # Wait for the page to load completely

    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    # Find all the 'Get PDF' links using the text inside the span element
    pdf_links = driver.find_elements(By.XPATH, "//a[contains(@aria-label, 'Link to PDF')]")

    for link in pdf_links:
        pdf_url = link.get_attribute('href')
        if pdf_url:
            pdf_name = os.path.join(download_folder, os.path.basename(pdf_url))
            print(f"Downloading {pdf_url}")
            download_file(pdf_url, pdf_name)

In [68]:
def handle_pagination(base_url, total_pages, download_folder):
    for page in range(total_pages):
        offset = page * 10
        url = f"{base_url}&offset={offset}"
        print(f"Processing page {page + 1}...")
        download_pdfs(url, download_folder)

In [69]:
# Main function
def main():
    base_url = 'https://mit.primo.exlibrisgroup.com/discovery/search?query=any,contains,nuclear&tab=all&search_scope=all&vid=01MIT_INST:MIT&facet=tlevel,include,open_access&lang=en&mode=advanced'
    total_pages = 5  # Adjust the number of pages as necessary
    download_folder = 'pdfs'

    # Handle pagination and download PDFs from all pages
    handle_pagination(base_url, total_pages, download_folder)

    driver.quit()

if __name__ == "__main__":
    main()

Processing page 1...
Downloading https://www.aanda.org/articles/aa/pdf/2013/01/aa20537-12.pdf
Failed to download file from https://www.aanda.org/articles/aa/pdf/2013/01/aa20537-12.pdf. Status code: 403
Processing page 2...
Downloading https://link.aps.org/accepted/10.1103/RevModPhys.88.035004
Downloaded pdfs\RevModPhys.88.035004
Downloading https://europepmc.org/articles/pmc6601390?pdf=render
Error downloading pdfs\pmc6601390?pdf=render from https://europepmc.org/articles/pmc6601390?pdf=render. Error: [Errno 22] Invalid argument: 'pdfs\\pmc6601390?pdf=render'
Processing page 3...
Downloading https://www.cambridge.org/core/services/aop-cambridge-core/content/view/7AF6E4965361B7E65EC536301C4B45CD/9781009401876AR.pdf/Nuclear_Superfluidity.pdf?event-type=FTLA
Error downloading pdfs\Nuclear_Superfluidity.pdf?event-type=FTLA from https://www.cambridge.org/core/services/aop-cambridge-core/content/view/7AF6E4965361B7E65EC536301C4B45CD/9781009401876AR.pdf/Nuclear_Superfluidity.pdf?event-type=FT