# Load libraries

In [None]:
# base
import re
import os
import pandas as pd

# webscraper
from selenium import webdriver
from selenium.webdriver.common.by import By
import urllib.request

# multiprocessing
import multiprocessing as mp

# slow the scraper down a little
import time


# setup selenium scraper

We use a Selenium scraper to retrieve the audio sources from a website. An interactive Selenium scraper is used because it allows us to interact with the webpage: the audio source is only included in the HTML page once we activate the play button for an episode. There might be an easier way to extract the audio source without needing interactivity, but for now this was simple enough to implement.

In [None]:
# Start a Chrome browser session using the chromedriver binary at the given
# relative path.
# NOTE(review): the `executable_path` keyword was deprecated in Selenium 4 and
# removed in later 4.x releases in favour of a `Service` object -- confirm
# against the installed Selenium version.
driver = webdriver.Chrome(executable_path="../dependencies/chromedriver")


In [None]:
# Page of the podcast that is scraped below; load it in the browser session.
url = "https://podcastluisteren.nl/pod/Maarten-van-Rossem-De-Podcast"
driver.get(url)


# Find podcast titles and download links

We loop through the various html-elements containing episodes in order to retrieve information such as the titles, release date, duration, etc.

In [None]:
# All <h4> elements whose class attribute is exactly 'mt-1 text-left'; their
# text is used as the episode titles in the next cell.
elements = driver.find_elements(By.XPATH, "//h4[@class='mt-1 text-left']")


In [None]:
# Episode titles, with "/" swapped for "-" because the titles are later used
# as file names.
titles = [element.text.replace("/", "-") for element in elements]


In [None]:
# Table that collects one row per episode; start with the titles column.
data = pd.DataFrame()
data["titles"] = titles


In [None]:
# For every title element, look up the button via an XPath relative to that
# element: up to its parent, then into a <div> child, then the <button>.
buttons = [element.find_element(By.XPATH, "../div/button") for element in elements]


In [None]:
# Text of every <h4 class='text-left mb-4'> element; each string holds the
# release date and the duration, which are split apart further below.
date_duration = [
    heading.text
    for heading in driver.find_elements(By.XPATH, "//h4[@class='text-left mb-4']")
]


In [None]:
# NOTE(review): assumes the page yields exactly one date/duration heading per
# title, in the same order -- otherwise the rows will not line up.
data["date_and_duration"] = date_duration


Now we will interact with the webpage in order to retrieve the audio source links, which we can then use in order to download the audio files.

In [None]:
def find_audio_path(button, audio_element):
    """
    Load an episode's audio stream and return its source link.

    The ``src`` attribute of the audio element is read after the play button
    has been clicked twice: once to start the stream and once more to pause
    it again, since the audio only needs to be loaded, not played.

    Parameters
    ----------
    button: selenium.element
        The play button belonging to an episode.
    audio_element: selenium.element
        The element whose ``src`` attribute holds the audio link.

    Returns
    -------
    src: str
        The link to the audio file.
    """
    # Two clicks: the first starts the stream, the second pauses it again
    for _ in range(2):
        button.click()
    # Brief pause before the attribute is read
    time.sleep(0.01)

    return audio_element.get_attribute("src")


In [None]:
# A single <audio> element is looked up once and reused for every episode;
# its src attribute is read after each episode's button has been clicked.
audio = driver.find_element(By.XPATH, "//audio")
# Buttons are processed strictly in order, one at a time.
sources = [find_audio_path(button, audio) for button in buttons]


In [None]:
# Store the audio links next to the titles (same order as `buttons`).
data["sources"] = sources


Here we clean and transform some of the data stored in our DataFrame.

In [None]:
# Split "date|duration" on the first "|" only: column 0 is the date part,
# column 1 the duration part. The combined column is dropped afterwards.
temp = data["date_and_duration"].str.split("|", n=1, expand=True)
data = data.assign(date=temp[0], duration=temp[1]).drop(
    columns="date_and_duration"
)


In [None]:
# Convert the date strings to datetimes.
# NOTE(review): no explicit `format` is passed, so pandas infers it -- confirm
# the site's date strings are parsed as intended (e.g. day/month order).
data["date"] = pd.to_datetime(data["date"])


In [None]:
# Episode number: the first run of digits directly following a "#" in the
# title. Titles without such a marker end up as missing values, which are
# filled in a later cell.
# The pattern is a raw string so that "\d" reaches the regex engine as-is
# instead of being an invalid string escape (a warning on recent Pythons).
episode = data["titles"].str.findall(r"(?:#)(\d+)").str[0]
data["episode"] = episode


In [None]:
# Titles without an episode number get the sentinel -9999, after which the
# whole column is cast to integers.
data["episode"] = data["episode"].fillna(-9999).astype(int)


In [None]:
# Path of each episode's audio file, built from the title; this is the same
# pattern that `download_mp3` uses when saving the files.
data["mp3_path"] = data["titles"].transform(lambda title: f"../data/audio/{title}.mp3")


In [None]:
# Path of each episode's text file, built from the title.
# NOTE(review): the literal "file:" prefix and ".mp3.txt" suffix are part of
# the file name -- presumably they mirror how a transcription step outside
# this notebook names its output; verify before changing.
data["txt_path"] = data["titles"].transform(
    lambda title: f"../data/text/file:{title}.mp3.txt"
)


In [None]:
# Display the assembled table for a visual check.
data


### Write data to file

In [None]:
# Persist the table as a pickle in the current working directory.
# NOTE: pickle files should only ever be loaded from a trusted source.
data.to_pickle("data.pickle")


# Download the mp3 files

In [None]:
# Number of CPUs reported by the system; used as the size of the worker pool
# for the downloads.
use_cores = mp.cpu_count()


In [None]:
def download_mp3(source, title, *, directory="../data/audio", delay=2):
    """
    Download the audio file from the source.

    The episode title is used for naming the file. A file that already
    exists is not downloaded again. The data is first written to a temporary
    ``.part`` file and only moved to its final name once the transfer has
    finished, so an interrupted download never leaves a truncated ``.mp3``
    behind that a later run would mistake for a complete one.

    Parameters
    ----------
    source : str
        Link to the audio file.
    title : str
        Title of the episode; the file is saved as ``<title>.mp3``.
    directory : str, optional
        Directory the file is saved in. It is created if it does not exist.
    delay : float, optional
        Seconds to wait after a completed download, to slow the scraper
        down a little.

    Raises
    ------
    urllib.error.URLError
        If the source cannot be retrieved. No partial file is left behind.
    """
    path = os.path.join(directory, f"{title}.mp3")
    if os.path.exists(path):
        # if the path already exists, it is not downloaded again.
        return

    if directory:
        os.makedirs(directory, exist_ok=True)

    partial_path = f"{path}.part"
    try:
        urllib.request.urlretrieve(source, partial_path)
        # Rename only after the transfer has completed.
        os.replace(partial_path, path)
    finally:
        # After a successful rename the partial file is gone; anything still
        # there is the remainder of a failed transfer.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    time.sleep(delay)


In [None]:
# Download all episodes in parallel, one (source, title) pair per task.
# The pool is used as a context manager so that its worker processes are shut
# down once `starmap` has returned instead of being left running.
with mp.Pool(use_cores) as pool:
    result = pool.starmap(download_mp3, zip(sources, titles))
