In [35]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import os

In [37]:
# URL base de la página de búsqueda
base_url = "https://www.sciencedirect.com/search"

# Parámetros de la búsqueda
params = {
    'qs': 'nuclear',
    'langs': 'en',
    'accessTypes': 'openaccess',
    'show': '25'
}

# Carpeta donde se guardarán los PDFs
pdf_folder = "pdfs"
os.makedirs(pdf_folder, exist_ok=True)

# Encabezados para hacer la solicitud parecer una solicitud del navegador
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

In [38]:
# Función para descargar un PDF desde una URL
def download_pdf(pdf_url, pdf_path):
    response = requests.get(pdf_url, headers=headers)
    if response.status_code == 200:
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded: {pdf_path}")
    else:
        print(f"Failed to download: {pdf_url} with status code {response.status_code}")

# Realizar la solicitud a la página de búsqueda
response = requests.get(base_url, headers=headers, params=params)
if response.status_code != 200:
    print(f"Failed to retrieve search page with status code {response.status_code}")
else:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Encontrar todos los enlaces a los artículos en la página
    article_links = soup.find_all('a', class_='result-list-title-link')

    # Descargar cada PDF encontrado
    for link in article_links[:25]:  # Limitar a los primeros 25 enlaces
        article_url = urllib.parse.urljoin("https://www.sciencedirect.com", link['href'])
        
        # Extraer el pii del enlace del artículo
        pii = link['href'].split('/')[-1]
        
        # Construir la URL del PDF
        base_pdf_url = f"https://www.sciencedirect.com/science/article/pii/{pii}/pdfft"
        
        # Obtener la página del artículo para extraer el md5 y pid
        article_response = requests.get(article_url, headers=headers)
        if article_response.status_code != 200:
            print(f"Failed to retrieve article page: {article_url} with status code {article_response.status_code}")
            continue

        article_soup = BeautifulSoup(article_response.content, 'html.parser')
        pdf_link = article_soup.find('a', class_='anchor download-link anchor-default anchor-icon-left')
        
        if pdf_link:
            pdf_href = pdf_link['href']
            md5_pid = pdf_href.split('?')[1]
            full_pdf_url = f"{base_pdf_url}?{md5_pid}"
            
            pdf_name = f"{pii}.pdf"
            pdf_path = os.path.join(pdf_folder, pdf_name)
            
            download_pdf(full_pdf_url, pdf_path)
        else:
            print(f"No PDF link found for article: {article_url}")

print("All PDFs from the first page have been processed.")



  

Failed to retrieve search page with status code 400
All PDFs from the first page have been processed.
