In [2]:
!pip install requests beautifulsoup4



In [7]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import os
import time
import logging
import random

# Emit timestamped, level-tagged log lines for INFO and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Pool of desktop browser User-Agent strings (Chrome on Windows, Safari on
# macOS, Chrome on Linux). Each literal is split across lines for readability;
# adjacent string literals are concatenated into a single value.
USER_AGENTS = [
    (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
    (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/14.0 Safari/605.1.15'
    ),
    (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/92.0.4515.107 Safari/537.36'
    ),
]

def search_google_scholar(query, num_results=10):
    """Search Google Scholar and collect PDF links from the results page.

    Args:
        query: Search keywords, sent as the ``q`` query parameter.
        num_results: Maximum number of PDF links to return; also sent as the
            ``num`` query parameter.

    Returns:
        A list of absolute PDF URLs in page order, without duplicates and
        with at most ``num_results`` entries. An empty list is returned when
        the HTTP request fails or no PDF link is found.
    """
    base_url = "https://scholar.google.com/scholar"
    params = {
        'q': query,
        'hl': 'en',
        # NOTE(review): Scholar's URL parameters are not officially
        # documented. as_vis=1 is commonly reported to *exclude*
        # citation-only entries (an earlier comment here claimed the
        # opposite) and as_sdt='0,5' to select the regular article search --
        # confirm before relying on either.
        'as_sdt': '0,5',
        'as_vis': '1',
        'num': num_results
    }
    headers = {'User-Agent': random.choice(USER_AGENTS)}

    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"Error during search: {str(e)}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    results = []
    seen = set()
    # A CSS selector matches the three classes regardless of their order or
    # of any extra classes; class_='gs_r gs_or gs_scl' would only match that
    # exact attribute string.
    for div in soup.select('div.gs_r.gs_or.gs_scl'):
        if len(results) >= num_results:
            break
        for link in div.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            try:
                # Look at the path as well so "paper.pdf?download=1" counts.
                path = urllib.parse.urlparse(href).path
                # Resolve relative links against the page that was fetched.
                absolute = urllib.parse.urljoin(response.url, href)
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): skip the link.
                continue
            is_pdf = (
                href.lower().endswith('.pdf')
                or path.lower().endswith('.pdf')
                or '[PDF]' in link.text
            )
            if is_pdf:
                if absolute not in seen:
                    seen.add(absolute)
                    results.append(absolute)
                break  # Only consider the first PDF link from each result

    logging.info(f"Found {len(results)} PDF results for query: {query}")
    return results

def _pdf_filename_from_response(url, content_disposition):
    """Derive a safe local ``.pdf`` filename from the header or the URL."""
    filename = ''
    if content_disposition:
        # Header shape: attachment; filename="paper.pdf"; size=123
        for part in content_disposition.split(';'):
            key, sep, value = part.strip().partition('=')
            if sep and key.strip().lower() == 'filename':
                filename = value.strip().strip('"\'')
                break

    if not filename:
        # Use only the URL path so query strings and fragments stay out of
        # the filename.
        path = urllib.parse.urlparse(url).path
        filename = urllib.parse.unquote(path.rstrip('/').split('/')[-1])

    # Both sources are untrusted: drop directory components so the file can
    # never be written outside the target folder.
    filename = os.path.basename(filename.replace('\\', '/'))
    if filename in ('', '.', '..'):
        filename = 'download'

    if not filename.lower().endswith('.pdf'):
        filename += '.pdf'
    return filename


def download_pdf(url, folder):
    """Download ``url`` into ``folder`` as a PDF file.

    The filename comes from the ``Content-Disposition`` header when it
    carries a ``filename`` parameter, otherwise from the last path segment of
    the URL. It is reduced to a bare file name and given a ``.pdf`` suffix if
    it lacks one.

    Args:
        url: Address of the file to fetch.
        folder: Existing directory in which the file is written.

    Returns:
        True when the file was written, False on any error. Errors are logged
        and never raised.
    """
    try:
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        # The context manager releases the streamed connection on every path.
        with requests.get(url, headers=headers, stream=True, timeout=10) as response:
            response.raise_for_status()

            filename = _pdf_filename_from_response(
                url, response.headers.get('Content-Disposition')
            )
            filepath = os.path.join(folder, filename)
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        logging.info(f"Downloaded: {filepath}")
        return True
    except requests.RequestException as e:
        logging.error(f"Failed to download {url}: {str(e)}")
    except Exception as e:
        logging.error(f"Unexpected error downloading {url}: {str(e)}")
    return False

def main():
    """Prompt for a query and a count, then search for and download PDFs.

    Reads the search keywords and the desired number of PDFs from standard
    input, creates the ``research_pdfs`` folder if needed, downloads every
    PDF link found with a random pause between downloads, and prints a
    summary. Invalid input is reported and the function returns without
    searching.
    """
    keywords = input("Enter search keywords for your research: ")
    if not keywords.strip():
        print("No search keywords entered.")
        return

    raw_count = input("Enter the number of PDFs to search for: ")
    try:
        num_results = int(raw_count)
    except ValueError:
        # Previously a non-numeric answer crashed with a traceback.
        print(f"Invalid number of PDFs: {raw_count!r}. Please enter a whole number.")
        return
    if num_results <= 0:
        print("The number of PDFs must be a positive whole number.")
        return

    download_folder = "research_pdfs"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(download_folder, exist_ok=True)

    pdf_urls = search_google_scholar(keywords, num_results)

    if not pdf_urls:
        print(f"No PDF results found for the query: {keywords}")
        return

    successful_downloads = 0
    for index, url in enumerate(pdf_urls):
        if index:
            # Be polite: wait between downloads, but not after the last one.
            time.sleep(2 + random.random() * 3)
        if download_pdf(url, download_folder):
            successful_downloads += 1

    print(f"\nDownload complete. {successful_downloads} out of {len(pdf_urls)} PDFs downloaded to {download_folder}")

    if successful_downloads == 0:
        print("Troubleshooting tips:")
        print("1. Check your internet connection.")
        print("2. Try different keywords or increase the number of results.")
        print("3. Some PDFs might be inaccessible due to permissions or paywalls.")
        print("4. Check the log for more detailed error messages.")

# Start the interactive downloader only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Enter search keywords for your research: gilgit
Enter the number of PDFs to search for: 5

Download complete. 5 out of 5 PDFs downloaded to research_pdfs
