# In [1]:
import requests
from PyPDF2 import PdfMerger
import os
from datetime import datetime
import concurrent.futures
import subprocess

def download_pdf(url, filename, timeout=30):
    """Download ``url`` and store the response body in ``filename``.

    Args:
        url: Address to fetch.
        filename: Local path the downloaded bytes are written to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        ``filename`` when the server answered with status 200 and a body that
        looks like a PDF; ``None`` otherwise (nothing is written in that case).

    Raises:
        requests.RequestException: Network errors and timeouts are propagated
            to the caller.
    """
    response = requests.get(url, timeout=timeout)
    body = response.content
    if response.status_code != 200 or not body:
        return None

    # A 200 response can still be an HTML error page. Only keep bodies that
    # carry the PDF header marker near the start, so that a non-PDF payload is
    # never handed to the merger under a ".pdf" name.
    if b"%PDF-" not in body[:1024]:
        return None

    with open(filename, 'wb') as f:
        f.write(body)
    return filename

def merge_pdfs(pdf_files, output_filename):
    """Merge page PDFs into a single file, ordered by page number.

    Args:
        pdf_files: Iterable of paths named like ``page_<n>.pdf``. Falsy
            entries (``None``, ``""``) are ignored.
        output_filename: Path the merged PDF is written to.

    Raises:
        ValueError, IndexError: If a file name does not follow the
            ``<prefix>_<number>.<ext>`` pattern.
    """
    def page_number(path):
        # Look only at the file name so underscores in a directory component
        # cannot be mistaken for the page separator.
        return int(os.path.basename(path).split('_')[1].split('.')[0])

    # Drop falsy entries *before* sorting: the sort key would otherwise be
    # evaluated on them and raise.
    ordered_pages = sorted((pdf for pdf in pdf_files if pdf), key=page_number)

    merger = PdfMerger()
    try:
        for pdf in ordered_pages:
            merger.append(pdf)
        merger.write(output_filename)
    finally:
        # Release the file handles held by the merger even if a page is
        # unreadable or the output cannot be written.
        merger.close()

def compress_pdf(input_pdf, output_pdf):
    """Compress ``input_pdf`` into ``output_pdf`` using Ghostscript.

    The Ghostscript executable is looked up on PATH under its usual names:
    ``gswin64c`` (64-bit Windows), ``gswin32c`` (32-bit Windows) and ``gs``
    (Linux/macOS). Failures are reported on stdout rather than raised, so
    compression stays a best-effort step.

    Args:
        input_pdf: Path of the PDF to compress.
        output_pdf: Path the compressed PDF is written to.

    Returns:
        None.
    """
    import shutil

    # Keep "gswin64c" first so existing 64-bit Windows setups behave as before.
    candidates = ("gswin64c", "gswin32c", "gs")
    gs_executable = next(
        (found for found in map(shutil.which, candidates) if found), None
    )
    if gs_executable is None:
        print("Ghostscript not found. Please ensure Ghostscript is installed and added to your PATH.")
        return

    # Ghostscript command to compress PDF
    gs_command = [
        gs_executable,
        "-sDEVICE=pdfwrite",
        "-dCompatibilityLevel=1.3",
        "-dPDFSETTINGS=/printer",  # Use /screen for smallest file size, /ebook for better quality
        "-dNOPAUSE",
        "-dBATCH",
        "-dDownsampleColorImages=true",
        "-dColorImageResolution=180",
        f"-sOutputFile={output_pdf}",
        input_pdf,
    ]
    try:
        subprocess.run(gs_command, check=True)
    except FileNotFoundError:
        # The executable vanished between the lookup and the launch.
        print("Ghostscript not found. Please ensure Ghostscript is installed and added to your PATH.")
    except subprocess.CalledProcessError as e:
        print(f"Ghostscript error: {e}")

def download_and_merge_newspaper(date_str):
    """Download every page of one edition, merge the pages and compress the result.

    Pages are requested in batches of five concurrent downloads, numbered
    from 1 upwards. Downloading stops after the first batch in which at least
    one page is reported missing (``download_pdf`` returned ``None``), or in
    which no page could be fetched at all.

    Args:
        date_str: Edition date as ``YYYYMMDD``; it is sliced into year, month
            and day to build the page URLs.

    Side effects:
        Writes temporary ``page_<n>.pdf`` files to the current directory and
        removes them after merging, writes ``SAMANA_PUN_<date>.pdf`` and
        attempts to write ``COMPRESSED_SAMANA_PUN_<date>.pdf``. Progress and
        errors are printed to stdout.
    """
    base_url = "https://epaper.saamana.com/download.php?file=https://enewspapr.com/News/SAMANA/PUN/{year}/{month}/{day}/{date_str}_{page}.PDF&pageno={page}"

    year, month, day = date_str[:4], date_str[4:6], date_str[6:]
    # Fill in the date parts now and keep "{page}" as a placeholder for later.
    formatted_url = base_url.format(year=year, month=month, day=day, date_str=date_str, page="{page}")

    batch_size = 5
    pdf_files = []
    next_page = 1
    reached_end = False

    with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
        while not reached_end:
            future_to_page = {
                executor.submit(
                    download_pdf, formatted_url.format(page=page), f"page_{page}.pdf"
                ): page
                for page in range(next_page, next_page + batch_size)
            }
            next_page += batch_size

            # Drain the whole batch before deciding whether to stop: bailing
            # out on the first missing page would drop pages of the same batch
            # that finish later and leave their files behind on disk.
            batch_files = []
            for future in concurrent.futures.as_completed(future_to_page):
                page_num = future_to_page[future]
                try:
                    filename = future.result()
                except Exception as e:
                    print(f"Error downloading page {page_num}: {e}")
                    continue
                if filename:
                    batch_files.append(filename)
                else:
                    reached_end = True

            pdf_files.extend(batch_files)
            if not batch_files:
                # Nothing usable came back (all missing or all failed); stop
                # instead of requesting further pages forever.
                reached_end = True

    if not pdf_files:
        print("No pages were downloaded. Please check the date and try again.")
        return

    output_filename = f"SAMANA_PUN_{date_str}.pdf"
    try:
        merge_pdfs(pdf_files, output_filename)
    finally:
        # Clean up individual page PDFs, even when merging failed.
        for pdf in pdf_files:
            os.remove(pdf)

    # Compress the merged PDF
    compressed_output_filename = f"COMPRESSED_SAMANA_PUN_{date_str}.pdf"
    compress_pdf(output_filename, compressed_output_filename)

    # compress_pdf reports failures by printing only, so check the result on
    # disk: the compressed file must exist, be non-empty and be at least as
    # new as the merged file (an older one is a leftover from a previous run).
    compressed_ok = (
        os.path.isfile(compressed_output_filename)
        and os.path.getsize(compressed_output_filename) > 0
        and os.path.getmtime(compressed_output_filename) >= os.path.getmtime(output_filename)
    )
    if compressed_ok:
        print(f"Downloaded, merged, and compressed {len(pdf_files)} pages into {compressed_output_filename}")
    else:
        print(f"Downloaded and merged {len(pdf_files)} pages into {output_filename}, but compression did not produce {compressed_output_filename}")

if __name__ == "__main__":
    # Script entry point: fetch the edition for the current local date,
    # rendered as YYYYMMDD.
    download_and_merge_newspaper(datetime.now().strftime("%Y%m%d"))

# Output: Downloaded, merged, and compressed 10 pages into COMPRESSED_SAMANA_PUN_20241001.pdf
