In [3]:
import pandas as pd
import requests
import os

def download_pdfs_from_excel(excel_path, output_dir):
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Read Excel into a DataFrame
    df = pd.read_excel(excel_path)
    
    for index, row in df.iterrows():
        doc_id = row['document_id']
        doc_title = row['document_title']
        source_url = row['source_url']
        
        # Create a filename based on document_id (or doc_title if you prefer)
        # Here, we'll just use the document_id:
        filename = f"{doc_id}.pdf"
        file_path = os.path.join(output_dir, filename)

        # Download the PDF
        try:
            response = requests.get(source_url, timeout=10)
            response.raise_for_status()  # This will raise an HTTPError if the request was not successful
            
            # Write the PDF content to a file
            with open(file_path, 'wb') as f:
                f.write(response.content)
            
            print(f"Downloaded: {file_path}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {source_url}: {e}")


download_pdfs_from_excel('pdf_test.xlsx', '/Users/arankhaira/Documents/pdf_store')

Failed to download https://www.bankofengland.co.uk/paper/2024/sop/the-bank-of-englands-approach-to-cost-benefit-analysis: 403 Client Error: Forbidden for url: https://www.bankofengland.co.uk/paper/2024/sop/the-bank-of-englands-approach-to-cost-benefit-analysis
Downloaded: /Users/arankhaira/Documents/pdf_store/2.pdf
Failed to download https://www.bankofengland.co.uk/prudential-regulation/publication/2024/december/pra-approach-to-cost-benefit-analysis-statement-of-policy: 403 Client Error: Forbidden for url: https://www.bankofengland.co.uk/prudential-regulation/publication/2024/december/pra-approach-to-cost-benefit-analysis-statement-of-policy
Downloaded: /Users/arankhaira/Documents/pdf_store/4.pdf
Downloaded: /Users/arankhaira/Documents/pdf_store/5.pdf
Downloaded: /Users/arankhaira/Documents/pdf_store/6.pdf


In [6]:
!pip show wkhtmltopdf 

Name: wkhtmltopdf
Version: 0.2
Summary: Simple python wrapper for wkhtmltopdf
Home-page: http://github.com/qoda/python-wkhtmltopdf
Author: Qoda
Author-email: jpbydendyk@gmail.com
License: BSD
Location: /Users/arankhaira/anaconda3/lib/python3.10/site-packages
Requires: 
Required-by: 


In [2]:
import pandas as pd
import requests
import os
import pdfkit

def download_pdfs_from_excel(excel_path, output_dir):
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Read Excel into a DataFrame
    df = pd.read_excel(excel_path)
    
    for index, row in df.iterrows():
        doc_id = row['document_id']
        doc_title = row['document_title']
        source_url = row['source_url']
        
        filename = f"{doc_id}.pdf"
        file_path = os.path.join(output_dir, filename)

        try:
            response = requests.get(source_url, timeout=10)
            response.raise_for_status()
            
            # Check the content type
            content_type = response.headers.get('Content-Type', '').lower()
            
            if 'application/pdf' in content_type:
                # If it's a PDF, write the binary content directly
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded PDF: {file_path}")
            else:
                # Assume it's HTML and convert to PDF
                html_content = response.text
                # Convert HTML to PDF
                pdfkit.from_string(html_content, file_path)
                print(f"Converted webpage to PDF: {file_path}")
        
        except requests.exceptions.RequestException as e:
            print(f"Failed to process {source_url}: {e}")

download_pdfs_from_excel('pdf_test.xlsx', '/Users/arankhaira/Documents/pdf_store')

Failed to process https://www.bankofengland.co.uk/paper/2024/sop/the-bank-of-englands-approach-to-cost-benefit-analysis: 403 Client Error: Forbidden for url: https://www.bankofengland.co.uk/paper/2024/sop/the-bank-of-englands-approach-to-cost-benefit-analysis
Downloaded PDF: /Users/arankhaira/Documents/pdf_store/2.pdf
Failed to process https://www.bankofengland.co.uk/prudential-regulation/publication/2024/december/pra-approach-to-cost-benefit-analysis-statement-of-policy: 403 Client Error: Forbidden for url: https://www.bankofengland.co.uk/prudential-regulation/publication/2024/december/pra-approach-to-cost-benefit-analysis-statement-of-policy
Downloaded PDF: /Users/arankhaira/Documents/pdf_store/4.pdf


OSError: No wkhtmltopdf executable found: "b''"
If this file exists please check that this process can read it or you can pass path to it manually in method call, check README. Otherwise please install wkhtmltopdf - https://github.com/JazzCore/python-pdfkit/wiki/Installing-wkhtmltopdf

In [8]:
import pandas as pd
import requests
import os
import pdfkit

def download_pdfs_from_excel(excel_path, output_dir):
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Read Excel into a DataFrame
    df = pd.read_excel(excel_path)
    
    # A common User-Agent for Chrome. You can try different ones if needed.
    headers = {
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/90.0.4430.93 Safari/537.36")
    }
    
    for index, row in df.iterrows():
        doc_id = row['document_id']
        doc_title = row['document_title']
        source_url = row['source_url']
        
        filename = f"{doc_id}.pdf"
        file_path = os.path.join(output_dir, filename)

        try:
            # Pass the headers to requests.get
            response = requests.get(source_url, timeout=10, headers=headers)
            response.raise_for_status()
            
            # Check the content type
            content_type = response.headers.get('Content-Type', '').lower()
            
            if 'application/pdf' in content_type:
                # If it's a PDF, write the binary content directly
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded PDF: {file_path}")
            else:
                # Assume it's HTML and convert to PDF
                html_content = response.text
                pdfkit.from_string(html_content, file_path)
                print(f"Converted webpage to PDF: {file_path}")
        
        except requests.exceptions.RequestException as e:
            print(f"Failed to process {source_url}: {e}")

download_pdfs_from_excel('pdf_test.xlsx', '/Users/arankhaira/Documents/pdf_store')

OSError: No wkhtmltopdf executable found: "b''"
If this file exists please check that this process can read it or you can pass path to it manually in method call, check README. Otherwise please install wkhtmltopdf - https://github.com/JazzCore/python-pdfkit/wiki/Installing-wkhtmltopdf

In [12]:
!pip install wkhtmltopdf 



In [2]:
import pandas as pd
df = pd.read_excel('pdf_test.xlsx')
df.head()

Unnamed: 0,document_id,document_title,source_url
0,1,Bank of england 1,https://www.bankofengland.co.uk/paper/2024/sop...
1,2,ESMA,https://www.esma.europa.eu/sites/default/files...
2,3,Bank of england,https://www.bankofengland.co.uk/prudential-reg...
3,4,ESMA,https://www.esma.europa.eu/sites/default/files...
