In [4]:
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import urllib.parse

In [5]:
# Start one shared, headless Chrome session used by every scraping function below.
# ChromeDriverManager().install() provides the chromedriver path handed to Service.
chrome_service = Service(ChromeDriverManager().install())
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # no visible browser window
driver = webdriver.Chrome(service=chrome_service, options=options)

In [6]:
# Request headers that make direct downloads look like they come from a desktop browser.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
headers = {'User-Agent': USER_AGENT}

In [7]:
def sanitize_filename(filename):
    """Return ``filename`` with every character outside the allowed set removed.

    Alphanumeric characters, spaces, dots and underscores are kept; everything
    else (hyphens, slashes, query punctuation, ...) is dropped. Whitespace left
    at the end of the filtered name is stripped.
    """
    allowed_extras = (' ', '.', '_')
    kept_chars = []
    for ch in filename:
        if ch.isalnum() or ch in allowed_extras:
            kept_chars.append(ch)
    return "".join(kept_chars).rstrip()

In [8]:
def download_file(url, file_name, timeout=30):
    """Fetch ``url`` and write the response body to ``file_name``.

    The request is sent with the module-level ``headers``. Problems are
    reported with ``print`` and never raised, so a batch of downloads keeps
    going after one bad link.

    Args:
        url: Address of the file to fetch.
        file_name: Path the response body is written to, in binary mode.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        True when the server answered with status 200 and the file was
        written, False otherwise.
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled server,
        # which would hang the whole scraping run on a single link.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to download file from {url}. Status code: {response.status_code}")
            return False
        with open(file_name, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded {file_name}")
        return True
    except Exception as e:
        # Deliberately broad: network errors and file-system errors are both
        # reported and skipped so the remaining links are still attempted.
        print(f"Error downloading {file_name} from {url}. Error: {e}")
        return False

In [9]:
def download_pdfs(url, download_folder, wait_seconds=5):
    """Open ``url`` in the shared browser and download every linked PDF.

    Links are the ``<a>`` elements whose ``aria-label`` contains
    'Link to PDF'. Each target is saved in ``download_folder`` under the
    sanitized last path segment of its URL, or 'downloaded_file.pdf' when
    that segment sanitizes to an empty string.

    Args:
        url: Page to load with the module-level ``driver``.
        download_folder: Directory for the downloaded files; created if missing.
        wait_seconds: Fixed pause after loading the page, giving it time to
            render its links before they are looked up.
    """
    driver.get(url)
    time.sleep(wait_seconds)  # Wait for the page to load completely

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(download_folder, exist_ok=True)

    # Find all the 'Get PDF' links using the aria-label attribute that contains 'Link to PDF'
    link_elements = driver.find_elements(By.XPATH, "//a[contains(@aria-label, 'Link to PDF')]")

    # Read every href up front: the downloads below are slow, and element
    # handles can go stale if the page re-renders in the meantime.
    hrefs = [element.get_attribute('href') for element in link_elements]
    # Drop empty hrefs and repeated URLs while keeping page order.
    pdf_urls = list(dict.fromkeys(href for href in hrefs if href))

    # File names already handed out during this call. Compared through
    # normcase so names differing only by case also count as a clash on
    # case-insensitive platforms.
    used_names = set()

    for pdf_url in pdf_urls:
        parsed_url = urllib.parse.urlparse(pdf_url)
        sanitized_filename = sanitize_filename(os.path.basename(parsed_url.path))
        if not sanitized_filename:
            sanitized_filename = "downloaded_file.pdf"

        # Two different URLs on the same page may map to one name (most
        # often the fallback name); prefix a counter instead of overwriting.
        unique_filename = sanitized_filename
        counter = 1
        while os.path.normcase(unique_filename) in used_names:
            unique_filename = f"{counter}_{sanitized_filename}"
            counter += 1
        used_names.add(os.path.normcase(unique_filename))

        pdf_name = os.path.join(download_folder, unique_filename)
        print(f"Downloading {pdf_url}")
        download_file(pdf_url, pdf_name)

In [10]:
def handle_pagination(base_url, total_pages, download_folder, page_size=10):
    """Walk the result pages of a search and download the PDFs on each one.

    Page ``k`` (0-based) is requested by adding ``offset=k * page_size`` to
    ``base_url`` and passing the resulting URL to ``download_pdfs``.

    Args:
        base_url: Search URL without an ``offset`` parameter.
        total_pages: Number of consecutive pages to process, starting at the first.
        download_folder: Directory passed through to ``download_pdfs``.
        page_size: Offset step between consecutive pages.
    """
    # Append with '&' when base_url already has a query string, otherwise
    # start one with '?' so the offset is not glued onto the path.
    separator = '&' if '?' in base_url else '?'
    for page in range(total_pages):
        offset = page * page_size
        url = f"{base_url}{separator}offset={offset}"
        print(f"Processing page {page + 1}...")
        download_pdfs(url, download_folder)

In [11]:
# Main function
def main():
    """Download the PDFs from the first pages of a fixed library search.

    The search URL, the number of pages and the output folder are set here.
    The shared ``driver`` is always shut down at the end, including when
    scraping stops with an exception.
    """
    base_url = 'https://mit.primo.exlibrisgroup.com/discovery/search?query=any,contains,nuclear&tab=all&search_scope=all&vid=01MIT_INST:MIT&facet=tlevel,include,open_access&lang=en&mode=advanced'
    total_pages = 5  # Adjust the number of pages as necessary
    download_folder = 'pdfs'

    try:
        # Handle pagination and download PDFs from all pages
        handle_pagination(base_url, total_pages, download_folder)
    finally:
        # Quit even on failure so no headless Chrome process is left behind.
        driver.quit()

# Start the scraper only when this code runs as the top-level program,
# not when it is imported from another module.
if __name__ == "__main__":
    main()

Processing page 1...
Downloading https://www.aanda.org/articles/aa/pdf/2013/01/aa20537-12.pdf
Downloaded pdfs\aa2053712.pdf
Processing page 2...
Downloading https://link.aps.org/accepted/10.1103/RevModPhys.88.035004
Downloaded pdfs\RevModPhys.88.035004
Downloading https://europepmc.org/articles/pmc6601390?pdf=render
Downloaded pdfs\pmc6601390
Processing page 3...
Downloading https://www.cambridge.org/core/services/aop-cambridge-core/content/view/7AF6E4965361B7E65EC536301C4B45CD/9781009401876AR.pdf/Nuclear_Superfluidity.pdf?event-type=FTLA
Downloaded pdfs\Nuclear_Superfluidity.pdf
Downloading https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/bjd.18098
Failed to download file from https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/bjd.18098. Status code: 403
Processing page 4...
Downloading https://link.aps.org/accepted/10.1103/PhysRevC.95.055804
Downloaded pdfs\PhysRevC.95.055804
Downloading https://pubs.aip.org/aip/jap/article-pdf/95/11/5949/10632063/5949_1_online.pdf
Failed to 