In [None]:
import html
import os

import pdfkit
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Site origin; prepended to the site-relative chapter hrefs to build full URLs.
BASE_URL = "https://wol.jw.org"
# Index page that is scanned for chapter links.
START_URL = "https://wol.jw.org/da/wol/library/r9/lp-d/alle-publikationer/b%C3%B8ger/hele-skriften-si"

def get_chapter_links(start_url, timeout=30):
    """Collect the unique chapter hrefs found on the index page.

    Args:
        start_url: URL of the page listing the chapters.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A list of site-relative hrefs (each starting with
        ``/da/wol/d/r9/lp-d/``) in first-seen order, without duplicates.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(start_url, timeout=timeout)
    # Fail loudly instead of parsing an error page and reporting 0 chapters.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.select('a[href^="/da/wol/d/r9/lp-d/"]')
    # dict.fromkeys drops duplicates while keeping document order.
    return list(dict.fromkeys(a['href'] for a in anchors))

def get_article_content(url, timeout=30):
    """Fetch one chapter page and return its article as an HTML fragment.

    Args:
        url: Absolute URL of the chapter page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``<h3>`` containing the page's first ``<h1>`` text followed by the
        serialized ``<article>`` element, or ``''`` when the page has no
        ``<article>``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    article = soup.find('article')
    if not article:
        return ''

    # Add the chapter heading as an H3.
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else ''
    # get_text() returns decoded text, so characters such as '&' or '<' must
    # be re-escaped before being placed back into markup.
    return f"<h3>{html.escape(title)}</h3>\n{article}"

def save_pdf(html_content, output_path):
    """Render an HTML string to a PDF file using pdfkit.

    Args:
        html_content: HTML markup to render.
        output_path: Path of the PDF file to write.
    """
    # The assembled markup is a fragment without a <meta charset>, so the
    # encoding is passed explicitly; otherwise non-ASCII characters may be
    # rendered incorrectly.
    pdfkit.from_string(html_content, output_path, options={"encoding": "UTF-8"})

def main():
    """Download every chapter listed on START_URL and save them as one PDF.

    Chapters whose download fails are reported and skipped so that a single
    network error does not discard the chapters already fetched. Chapters
    that yield no content are left out, and page breaks are inserted only
    between chapters. No PDF is written when nothing could be retrieved.
    """
    print("Henter kapitellinks...")
    chapter_links = get_chapter_links(START_URL)
    print(f"Fundet {len(chapter_links)} kapitler.")

    sections = []
    failed_urls = []
    for link in tqdm(chapter_links, desc="Henter kapitler"):
        full_url = BASE_URL + link
        try:
            content = get_article_content(full_url)
        except requests.RequestException as exc:
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(f"Kunne ikke hente {full_url}: {exc}")
            failed_urls.append(full_url)
            continue
        if content:
            sections.append(content)

    if failed_urls:
        print(f"{len(failed_urls)} kapitler kunne ikke hentes.")

    if not sections:
        print("Intet indhold hentet; ingen PDF oprettet.")
        return

    # Joining (instead of appending after every chapter) avoids a trailing
    # page break after the last chapter.
    page_break = '<div style="page-break-after: always;"></div>'
    all_content = page_break.join(sections)

    output_dir = "pdf_output"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "Hele_Skriften_er_inspireret_af_Gud_og_gavnlig.pdf")
    print("Gemmer PDF...")
    save_pdf(all_content, output_path)
    print(f"PDF gemt som {output_path}")

# Run the scraper only when executed directly, not when imported.
if __name__ == "__main__":
    main()

Henter kapitellinks...
Fundet 82 kapitler.


Henter kapitler:  33%|███▎      | 27/82 [04:35<09:41, 10.57s/it]