In [8]:
import json
import sqlite3
from pathlib import Path

from selectolax.parser import HTMLParser

# Root directory of the saved show pages; get_podcast_stats_by_name builds
# per-show paths from it as <shows_dir>/<category>/<show_id>.html.
shows_dir = Path("../data/shows")


def get_podcast_stats_by_name(partial_name):
    """Print stats for every podcast whose name contains *partial_name*.

    Queries ``../data/podcasts.db`` for matching podcasts together with their
    ranking and score rows, prints a summary per podcast name, and returns the
    paths of the corresponding saved show pages.

    Args:
        partial_name: Substring to look for in the podcast name (wrapped in
            ``%`` wildcards and matched with SQL ``LIKE``).

    Returns:
        A list of file path strings (``<shows_dir>/<category>/<show_id>.html``),
        one per distinct podcast name, in the order the names first appear in
        the result set. An empty list when nothing matches.

    Raises:
        sqlite3.Error: If the database cannot be opened or the query fails.
            (Previously a ``return`` inside ``finally`` silently discarded
            these errors.)
    """
    query = """
        SELECT
            p.name,
            p.show_id,
            p.category,
            r.country,
            r.rank,
            s.rank_score
        FROM podcast p
        LEFT JOIN ranking r ON p.id = r.podcast_id
        LEFT JOIN score s ON p.id = s.podcast_id
        WHERE p.name LIKE ?
        ORDER BY r.chart_date DESC, s.date DESC;
    """

    conn = sqlite3.connect("../data/podcasts.db")
    try:
        rows = conn.execute(query, (f"%{partial_name}%",)).fetchall()
    finally:
        # Only clean up here; returning from `finally` would swallow errors.
        conn.close()

    if not rows:
        print("No podcast found matching:", partial_name)
        return []

    podcast_data = {}
    for name, show_id, category, country, rank, rank_score in rows:
        entry = podcast_data.get(name)
        if entry is None:
            entry = podcast_data[name] = {
                "show_id": show_id,
                "file": f"{shows_dir / category}/{show_id}.html",
                "category": category,
                "rankings": {},
                "latest_score": None,
            }

        # Rows arrive newest-first (see ORDER BY), so the first value seen for
        # a country / for the score is the most recent one; later rows must
        # not overwrite it.
        if country and rank is not None:
            entry["rankings"].setdefault(country, rank)

        if rank_score is not None and entry["latest_score"] is None:
            entry["latest_score"] = rank_score

    for name, data in podcast_data.items():
        print(f"\nPodcast: {name}")
        print(f"Show ID: {data['show_id']}")
        print(f"Category: {data['category']}")
        print("Country Rankings:")
        for country, rank in data["rankings"].items():
            print(f"  {country}: #{rank}")
        print(f"Overall Score: {data['latest_score']}")
        print(f"File: {data['file']}")

    # Dicts preserve insertion order, so this matches first-appearance order.
    return [data["file"] for data in podcast_data.values()]


def find_and_copy_to_working_folder(search_term):
    """Copy the first matching show page into ``../working`` and summarize it.

    Looks up podcasts via ``get_podcast_stats_by_name(search_term)`` and, for
    the first returned file only:

    * writes a copy of the HTML with ``<p ...>``/``</p>`` tags turned into
      ``<div ...>``/``</div>`` to ``../working/<show_id>.html``;
    * extracts the JSON embedded in the ``<script id="serialized-server-data">``
      element and writes its ``shelves`` list to ``../working/<show_id>.json``;
    * prints subcategory, update frequency, average rating and number of
      ratings taken from the ``showHeaderRegular`` shelf.

    Missing pieces of the page (no embedded script, no header shelf, no
    ratings) are reported or printed as ``None`` instead of raising.

    Args:
        search_term: Partial podcast name passed to the database lookup.

    Returns:
        None.
    """
    file_paths = get_podcast_stats_by_name(search_term)
    out_dir = Path("../working")
    out_dir.mkdir(parents=True, exist_ok=True)
    if not file_paths:
        return

    file_path = Path(file_paths[0])
    out_path_html = out_dir / f"{file_path.stem}.html"
    out_path_json = out_dir / f"{file_path.stem}.json"

    html = file_path.read_text(encoding="utf-8")
    # NOTE(review): the reason for the <p> -> <div> swap is not visible in this
    # file; it is preserved as-is.
    clean_html = html.replace("<p ", "<div ").replace("</p>", "</div>")
    out_path_html.write_text(clean_html, encoding="utf-8")

    # The metadata is parsed from the original HTML, not the cleaned copy.
    tree = HTMLParser(html)
    script = next(
        (
            s
            for s in tree.css("script")
            if s.attributes.get("id") == "serialized-server-data"
        ),
        None,
    )
    if script is None:
        print("No serialized-server-data script found in:", file_path)
        return

    raw_json = json.loads(script.text())
    items = raw_json[0].get("data", {}).get("shelves", [])

    # Persist the shelves first so the JSON is kept even when the header
    # metadata below turns out to be missing.
    out_path_json.write_text(json.dumps(items), encoding="utf-8")

    header_shelf = next(
        (shelf for shelf in items if shelf.get("contentType") == "showHeaderRegular"),
        {},
    )
    metadata_items = header_shelf.get("items", [])
    if not metadata_items:
        print("No showHeaderRegular metadata found in:", file_path)
        return

    # `metadata` is a list of small dicts; flatten it into one lookup table.
    metadata = metadata_items[0].get("metadata") or []
    metadata_dict = {k: v for d in metadata for k, v in d.items()}

    subcategory = metadata_dict.get("category")
    if isinstance(subcategory, dict):
        # The category is either a plain string or an object with a title.
        subcategory = subcategory.get("title")
    update_frequency = metadata_dict.get("updateFrequency")
    ratings = metadata_dict.get("ratings") or {}
    rating_average = ratings.get("ratingAverage")
    total_number_of_ratings = ratings.get("totalNumberOfRatings")

    print("Subcategory:", subcategory)
    print(update_frequency)
    print(rating_average, "stars")
    print(total_number_of_ratings, "reviews")


# Example run: print the stats for podcasts whose name contains "The Duran"
# and copy the first match's page into ../working.
find_and_copy_to_working_folder("The Duran")


Podcast: The Duran Podcast
Show ID: 1442883993
Category: news
Country Rankings:
  /home/ansel/dev/okapi-rank/data/charts/gb: #164
  /home/ansel/dev/okapi-rank/data/charts/au: #117
  /home/ansel/dev/okapi-rank/data/charts/ie: #188
  /home/ansel/dev/okapi-rank/data/charts/nz: #146
Overall Score: 4.2138
File: ../data/shows/news/1442883993.html
Subcategory: News
Updated Daily
4.5 stars
90 reviews
