In [13]:
from pathlib import Path
import json
import logging

from selectolax.parser import HTMLParser
import httpx

# Raise the httpx logger's threshold to CRITICAL so lower-severity records
# are dropped instead of cluttering the notebook output.
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.CRITICAL)

# Directory the scraped JSON payloads are written to; created if absent.
out_dir = Path("../working")
out_dir.mkdir(parents=False, exist_ok=True)


def _extract_show_metadata(raw_json):
    """Return the show-header fields found in the decoded page payload.

    The payload is expected to be a list whose first element holds
    ``data.shelves``; the shelf with ``contentType == "showHeaderRegular"``
    carries an ``items`` list whose first entry has a ``metadata`` list of
    small dicts. Any level that is missing or has an unexpected type is
    treated as empty rather than raising.

    Returns:
        A ``(subcategory, update_frequency, rating_average, total_ratings)``
        tuple; each element is ``None`` when the page does not provide it.
    """
    page = raw_json[0] if isinstance(raw_json, list) and raw_json else None
    data = page.get("data") if isinstance(page, dict) else None
    shelves = data.get("shelves") if isinstance(data, dict) else None

    header = next(
        (
            shelf
            for shelf in shelves or []
            if isinstance(shelf, dict)
            and shelf.get("contentType") == "showHeaderRegular"
        ),
        {},
    )
    header_items = header.get("items") or []
    first_item = header_items[0] if header_items else None
    entries = first_item.get("metadata") if isinstance(first_item, dict) else None

    # Flatten the list of single-purpose dicts into one lookup table.
    merged = {
        key: value
        for entry in entries or []
        if isinstance(entry, dict)
        for key, value in entry.items()
    }

    # The category is either a plain string or a dict carrying a "title".
    subcategory = merged.get("category")
    if isinstance(subcategory, dict):
        subcategory = subcategory.get("title")
    elif not isinstance(subcategory, str):
        subcategory = None

    ratings = merged.get("ratings")
    if not isinstance(ratings, dict):
        ratings = {}

    return (
        subcategory,
        merged.get("updateFrequency"),
        ratings.get("ratingAverage"),
        ratings.get("totalNumberOfRatings"),
    )


def get_show_page(url: str) -> None:
    """Fetch a podcast show page, save its embedded JSON, and print a summary.

    The show name and podcast id are taken from the 6th and 7th
    ``/``-separated pieces of ``url`` (any ``?query`` suffix on the id is
    dropped). The JSON found in the ``<script id="serialized-server-data">``
    element is written to ``out_dir / "<show_name>-<podcast_id>.json"`` and a
    tuple ``(subcategory, update_frequency, rating_average,
    total_number_of_ratings, podcast_id)`` is printed. Fields the page does
    not provide are printed as ``None`` instead of raising.

    Args:
        url: Show page URL with the show name and id as described above.

    Returns:
        None. An HTTP failure is reported by printing the error; a missing
        script element or undecodable JSON ends the call silently.

    Raises:
        IndexError: If ``url`` has fewer than seven ``/``-separated pieces.
    """
    parts = url.split("/")
    show_name = parts[5]
    podcast_id = parts[6].split("?")[0]
    out_path = out_dir / f"{show_name}-{podcast_id}.json"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 15.4.1) AppleWebKit/537.36 (KHTML, like Gecko) Safari/18.4"
    }

    with httpx.Client(headers=headers, timeout=10.0, follow_redirects=True) as client:
        # Only the request itself can raise httpx.HTTPError, so keep the
        # try body limited to it.
        try:
            response = client.get(url)
            response.raise_for_status()
        except httpx.HTTPError as error:
            print(str(error))
            return
        html = response.text

    script = HTMLParser(html).css_first("script#serialized-server-data")
    if script is None:
        return

    try:
        raw_json = json.loads(script.text())
    except json.JSONDecodeError:
        return

    # Keep the raw payload on disk, written only after a successful decode.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(raw_json, f)

    print((*_extract_show_metadata(raw_json), podcast_id))


# Example run: fetch one show page, which saves its JSON payload under
# out_dir and prints the extracted summary tuple.
url = "https://podcasts.apple.com/us/podcast/the-unspeakable-podcast/id1524832743"
get_show_page(url)

('Society & Culture', 'Updated Weekly', 4.7, 777, 'id1524832743')
