In [None]:
# |default_exp rss

# RSS variable

In [None]:
# |export

from dotenv import load_dotenv, find_dotenv
import os


def load_env():
    """Load environment variables from a `.env` file.

    The path is obtained from `find_dotenv()` and handed to `load_dotenv()`;
    whatever `load_dotenv()` returns is discarded.
    """
    dotenv_path = find_dotenv()
    load_dotenv(dotenv_path)


def get_RSS_URL():
    """Return the URL of the podcast RSS feed.

    `load_env()` is called first so that a `.env` file can provide
    `RSS_LMELP_URL`. When that variable is unset, empty or whitespace-only,
    the built-in default feed URL is returned instead.

    Returns:
      The feed URL as a string.
    """
    load_env()
    default_url = "https://radiofrance-podcast.net/podcast09/rss_14007.xml"
    configured_url = os.getenv("RSS_LMELP_URL")
    # A blank value (e.g. `RSS_LMELP_URL=` in a .env file) is treated like an
    # unset variable: handing an empty URL to the feed parser would silently
    # produce a feed with no entries.
    if configured_url is None or not configured_url.strip():
        return default_url
    return configured_url

In [None]:
# Show the feed URL currently in effect (environment override or default).
get_RSS_URL()

'https://radiofrance-podcast.net/podcast09/rss_14007.xml'

# extraire_urls

In [None]:
# |export

import feedparser
import re


def extraire_dureesummary(summary):
    """Extract an episode's duration from its summary text.

    The summary is searched for a fragment such as ``durée : 00:47:54``
    (hours:minutes:seconds, two digits each). The word "durée" is matched
    case-insensitively.

    Args:
      summary: the episode summary text; may be None or empty.

    Returns:
      The duration in seconds.
      -1 when no duration is found, including when `summary` is not a
      non-empty string.
    """
    # A missing summary means "duration not found"; without this guard
    # re.search would raise TypeError on None.
    if not isinstance(summary, str) or not summary:
        return -1

    match = re.search(r"durée\s*:\s*(\d{2}:\d{2}:\d{2})", summary, re.IGNORECASE)
    if match is None:
        return -1

    heures, minutes, secondes = map(int, match.group(1).split(":"))
    return heures * 3600 + minutes * 60 + secondes


def extraire_urls_rss(duree_mini_minutes=15):
    """Collect the audio URLs of the feed's sufficiently long episodes.

    The feed located at `get_RSS_URL()` is parsed with feedparser. For every
    entry whose summary announces a duration strictly greater than
    `duree_mini_minutes` minutes, the `href` of each link whose type is
    "audio/mpeg" is collected.

    Args:
      duree_mini_minutes: minimum duration in minutes; an entry lasting
        exactly this long is excluded.

    Returns:
      A list of URL strings, in feed order.
    """
    duree_mini_secondes = duree_mini_minutes * 60
    flux = feedparser.parse(get_RSS_URL())

    urls = []
    for entree in flux.entries:
        # The duration belongs to the entry, so it is computed once per entry
        # rather than once per link. An entry without a summary yields -1.
        duree = extraire_dureesummary(entree.get("summary") or "")
        if duree <= duree_mini_secondes:
            continue
        for link in entree.get("links", []):
            # .get() instead of attribute access: a link lacking `type` or
            # `href` must be skipped, not raise AttributeError.
            if link.get("type") == "audio/mpeg" and link.get("href"):
                urls.append(link["href"])
    return urls

In [None]:
# Parse the feed and display the audio URLs of the episodes longer than the
# default minimum duration (15 minutes).
urls = extraire_urls_rss()
urls

['https://rf.proxycast.org/ad97aa2e-ebfc-4d00-8739-4ca72192e726/14007-12.01.2025-ITEMA_23993269-2025F4007S0012-22.mp3',
 'https://rf.proxycast.org/a3082fcd-8ed2-49eb-883c-29b32a1f7b2b/14007-05.01.2025-ITEMA_23985183-2025F4007S0005-22.mp3',
 'https://rf.proxycast.org/a9ea02c4-09a0-4a95-a86b-269c571baf8a/14007-29.12.2024-ITEMA_23978947-2024F4007S0364-22.mp3',
 'https://rf.proxycast.org/7e653bf4-87a5-42f4-864b-9208e206a295/14007-22.12.2024-ITEMA_23973143-2024F4007S0357-22.mp3',
 'https://rf.proxycast.org/7f3818f7-cc2e-44f5-a5f0-69dde19127b7/14007-15.12.2024-ITEMA_23965603-2024F4007S0350-22.mp3',
 'https://rf.proxycast.org/aeffdd33-8ec9-43f0-8168-09cd6d2ed539/14007-08.12.2024-ITEMA_23954487-2024F4007S0343-22.mp3',
 'https://rf.proxycast.org/7502dbc2-c937-4a70-b60d-c5a89f5b5da4/14007-01.12.2024-ITEMA_23942372-2024F4007S0336-22.mp3',
 'https://rf.proxycast.org/92889ad9-ba53-4510-9b2d-659d5f62b5e9/14007-24.11.2024-ITEMA_23935099-2024F4007S0329-22.mp3',
 'https://rf.proxycast.org/d0d34b20-d05d

# podcast

In [None]:
# |export

from datetime import datetime
from typing import List, Optional

import feedparser
import pytz
from feedparser.util import FeedParserDict

from mongo import get_collection, get_DB_VARS
from mongo_episode import RSS_episode

# strptime format used to parse `entry.published`; a sample value is shown at
# the end of the line.
RSS_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %z"  # "Sun, 29 Dec 2024 10:59:39 +0100"


class Podcast:
    """Compare the podcast RSS feed with the stored "episodes" collection.

    On construction the RSS feed is parsed once and a handle on the
    "episodes" collection is obtained; the methods then compare the parsed
    feed entries with what is already stored.
    """

    def __init__(self):
        """Parse the RSS feed and open the "episodes" collection."""
        self.parsed_flow = feedparser.parse(get_RSS_URL())
        DB_HOST, DB_NAME, _ = get_DB_VARS()
        self.collection = get_collection(
            target_db=DB_HOST, client_name=DB_NAME, collection_name="episodes"
        )

    def get_most_recent_episode_from_DB(self) -> Optional[datetime]:
        """Return the date of the most recent stored episode.

        Returns:
          A timezone-aware datetime expressed in the Europe/Paris zone, or
          None when the collection holds no document.
        """
        paris = pytz.timezone("Europe/Paris")
        most_recent_document = self.collection.find().sort("date", -1).limit(1)
        most_recent_date = None
        for doc in most_recent_document:
            stored = doc["date"]
            if stored.tzinfo is None:
                # NOTE(review): a naive value is assumed to be UTC, which is
                # what PyMongo returns by default - confirm get_collection
                # does not configure the client otherwise.
                stored = pytz.utc.localize(stored)
            # Convert instead of `replace(tzinfo=paris)`: with pytz, replace()
            # attaches the zone's historical LMT offset (+0:09), which shifts
            # the instant and makes the stored episode look older than its
            # own RSS entry.
            most_recent_date = stored.astimezone(paris)
        return most_recent_date

    def list_last_large_episodes(
        self, duree_mini_minutes: int = 15
    ) -> List[FeedParserDict]:
        """List the feed entries that still have to be stored.

        An entry is kept when it is
        - published after the date given by get_most_recent_episode_from_DB()
          (every entry qualifies when nothing is stored yet), and
        - strictly longer than `duree_mini_minutes` minutes.

        Args:
          duree_mini_minutes: minimum duration in minutes.

        Returns:
          The matching feed entries, in feed order.
        """
        # The reference date does not change while iterating: query it once
        # instead of once per feed entry.
        date_db = self.get_most_recent_episode_from_DB()
        duree_mini_secondes = duree_mini_minutes * 60

        last_large_episodes = []
        for entry in self.parsed_flow.entries:
            date_rss = datetime.strptime(entry.published, RSS_DATE_FORMAT)
            # date_db is None on an empty collection; comparing a datetime
            # with None would raise TypeError.
            if date_db is not None and date_rss <= date_db:
                continue
            duree = RSS_episode.get_duree_in_seconds(entry.itunes_duration)
            if duree > duree_mini_secondes:
                last_large_episodes.append(entry)
        return last_large_episodes

    def store_last_large_episodes(self, duree_mini_minutes: int = 15):
        """Store the entries returned by list_last_large_episodes().

        Each entry is turned into an RSS_episode with
        RSS_episode.from_feed_entry() and its keep() method is called; the
        values returned by keep() are summed and the total is printed.

        Args:
          duree_mini_minutes: minimum duration in minutes, forwarded to
            list_last_large_episodes().
        """
        updates = 0
        last_large_episodes = self.list_last_large_episodes(duree_mini_minutes)
        for entry in last_large_episodes:
            rss_entry = RSS_episode.from_feed_entry(entry)
            # presumably keep() returns the number of documents written -
            # TODO confirm against mongo_episode.
            updates += rss_entry.keep()
        print(f"Updated episodes: {updates}")

In [None]:
# Build a Podcast (parses the feed, opens the collection) and show the date of
# the most recent episode already stored.
podcast = Podcast()
podcast.get_most_recent_episode_from_DB()

datetime.datetime(2025, 1, 12, 9, 59, 39, tzinfo=<DstTzInfo 'Europe/Paris' LMT+0:09:00 STD>)

In [None]:
# Number of feed entries newer than the stored date and longer than 15 minutes.
len(podcast.list_last_large_episodes())

1

In [None]:
# Store the episodes listed above; the number of updates is printed at the end.
podcast.store_last_large_episodes()

Device set to use cpu


Le fichier /home/guillaume/git/lmelp/audios/2025/14007-12.01.2025-ITEMA_23993269-2025F4007S0012-22.mp3 existe déjà. Ignoré.


Device set to use cpu


Episode du 05 Jan 2025 10:59 ignored: Duree: 2898, Type: films


Device set to use cpu


Episode du 29 Dec 2024 10:59 ignored: Duree: 2988, Type: films


Device set to use cpu


Updated episodes: 1


# extract py

In [None]:
from nbdev.export import nb_export

# Export this notebook's `# |export` cells with nbdev, writing into the
# current directory (module name comes from the `# |default_exp` directive).
nb_export("py rss helper.ipynb", ".")