# Update Dataset

With this notebook, we check whether new episodes have come out, and make sure to add them to the dataset as needed. To accomplish this, we perform the following steps:
* Check for new episodes on website
* Download mp3 files and scrape episode information
* Process mp3 with whisper
* Add all relevant updates to the main dataframe

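In addition, we check whether the titles of episodes that are already in the dataset have been changed on the website, because a renamed episode would otherwise be mistaken for a new one.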

In [None]:
#base
import re
import os
import pandas as pd

#webscraper
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import urllib.request

#multiprocessing
import multiprocessing as mp

#slow the scraper down a little
import time

# Load data file

In [None]:
# Load the existing episode dataset; the same pickle file is overwritten at the end of this notebook.
# NOTE(review): unpickling can execute arbitrary code - only load a file produced by this project.
data = pd.read_pickle("../extract_data/data.pickle")

In [None]:
# Display the stored dataset ordered by date, newest first.
data.sort_values(by="date", ascending=False)

# Check for new episodes

In [None]:
# Start a Chrome session with the chromedriver binary shipped in ../dependencies
# and open the podcast page that is scraped in the cells below.
# The driver path is handed over through a Service object: the `executable_path`
# keyword of webdriver.Chrome was deprecated in Selenium 4 and later removed.
url = "https://podcastluisteren.nl/pod/Maarten-van-Rossem-De-Podcast"
driver = webdriver.Chrome(service=Service("../dependencies/chromedriver"))
driver.get(url)

In [None]:
# Collect every <h4> with this exact class attribute; their text is used as the episode titles below.
elements = driver.find_elements(By.XPATH, "//h4[@class='mt-1 text-left']")

In [None]:
# "/" is replaced by "-" because the title is used as part of file paths further down.
titles = [element.text.replace("/", "-") for element in elements]
# Start the frame of scraped episodes with one row per title.
data_web = pd.DataFrame({"titles": titles})

In [None]:
# A second <h4> per episode holds the date and the duration, separated by "|".
date_duration = [
    element.text
    for element in driver.find_elements(By.XPATH, "//h4[@class='text-left mb-4']")
]
data_web = data_web.assign(date_and_duration=date_duration)
# Split on the first "|" only: column 0 is the date, column 1 the duration.
temp = data_web["date_and_duration"].str.split("|", n=1, expand=True)
# Store the parsed date and the raw duration text, then drop the combined helper column.
data_web = data_web.assign(
    date=pd.to_datetime(temp[0]),
    duration=temp[1],
).drop(columns="date_and_duration")

In [None]:
# Episode number: the digits following the first "#" in the title.
# The pattern is a raw string so that "\d" is not read as an (invalid) string escape sequence.
episode = data_web["titles"].str.findall(r"(?:#)(\d+)").str[0]
# Titles without a "#<number>" get the sentinel -9999 so the column can be stored as int.
data_web["episode"] = episode.fillna(-9999).astype(int)
# Paths of the audio file and of its transcript, both derived from the title.
data_web["mp3_path"] = data_web["titles"].transform(lambda title: f"../data/audio/{title}.mp3")
# NOTE(review): the "file:" prefix presumably mirrors how the transcription script names its output - confirm.
data_web["txt_path"] = data_web["titles"].transform(lambda title: f"../data/text/file:{title}.mp3.txt")

In [None]:
# Keep only rows whose title is not stored yet and whose date is not older than
# the newest date in the stored dataset.
data_web = data_web.loc[
    ~data_web["titles"].isin(data["titles"])
    & (data_web["date"] >= data["date"].max())
]

In [None]:
# Candidates for "same episode, new title": the title is unknown, but date,
# duration and episode number each already occur somewhere in the stored data.
# The three columns are checked independently of each other, not as one combined key.
updated_titles = data_web.loc[
    data_web["date"].isin(data["date"])
    & data_web["duration"].isin(data["duration"])
    & data_web["episode"].isin(data["episode"])
]
updated_titles

In [None]:
# Renamed episodes are not new episodes: remove them from the rows to be added.
data_web = data_web.drop(index=updated_titles.index)

# Download mp3 files and scrape episode information

In [None]:
# Keep only the title elements of the episodes that still have to be added.
# The same "/" -> "-" replacement as for `titles` is applied so the texts are comparable.
elements = [element for element in elements if element.text.replace("/", "-") in data_web.titles.values]
# The play buttons are deliberately not looked up here: the cell that collects the
# audio sources builds `buttons` itself right before using them, so doing it here
# as well only cost an extra WebDriver round trip per element.

In [None]:
def find_audio_path(button, audio_element):
    """Return the ``src`` attribute of ``audio_element`` after toggling ``button``.

    The button is clicked twice: the first click starts the episode stream and
    the second click pauses it again, since the stream only has to be loaded.
    After a 0.01 second pause the ``src`` attribute is read and returned.

    Args:
        button: Object with a ``click()`` method (the play button of an episode).
        audio_element: Object with a ``get_attribute(name)`` method.

    Returns:
        Whatever ``audio_element.get_attribute("src")`` returns.
    """
    for _ in range(2):  # 1st click: start the stream, 2nd click: pause it
        button.click()
    time.sleep(0.01)

    return audio_element.get_attribute("src")

In [None]:
# The first <audio> element found on the page; find_audio_path reads its `src` after each button toggle.
# NOTE(review): presumably the page has a single shared player whose src changes per episode - confirm.
audio = driver.find_element(By.XPATH, "//audio")
# For every remaining title element, locate the button below the sibling <div> of that element.
buttons = [element.find_element(By.XPATH, "../div/button") for element in elements]
# One audio URL per button, in the same order as `elements`.
sources = [find_audio_path(button, audio) for button in buttons]

In [None]:
# Scraping is finished: end the whole WebDriver session.
# quit() is used instead of close() because close() only closes the current
# window and leaves the chromedriver process running in the background.
driver.quit()

In [None]:
# Number of worker processes for the download pool: one per CPU reported by the system.
use_cores = mp.cpu_count()

In [None]:
def download_mp3(source, title):
    """Download the audio at ``source`` to ``../data/audio/{title}.mp3``.

    Nothing happens when the target file already exists. Otherwise the audio is
    first written to a ``.part`` file next to the target and renamed only after
    the download completed, so an interrupted download is never mistaken for a
    finished episode on the next run. The target directory is created if needed.

    Args:
        source: URL of the mp3 file.
        title: Episode title, used as the file name without extension.

    Returns:
        None.

    Raises:
        Any exception raised by ``urllib.request.urlretrieve``; the partial
        file is removed before the exception propagates.
    """
    path = f"../data/audio/{title}.mp3"
    if os.path.exists(path):
        return

    os.makedirs(os.path.dirname(path), exist_ok=True)
    partial_path = f"{path}.part"
    try:
        urllib.request.urlretrieve(source, partial_path)
        os.replace(partial_path, path)
    finally:
        # Still present only when the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    # Slow the scraper down a little before this worker takes its next download.
    time.sleep(2)

In [None]:
# Download all new episodes in parallel.
# Each source is paired with the title of the new episode it belongs to:
# `sources` was collected only for the rows kept in `data_web`, whereas `titles`
# holds every title found on the page, so zipping with `titles` could store an
# episode under the wrong file name.
# The `with` block releases the worker processes once all downloads have finished.
with mp.Pool(use_cores) as pool:
    result = pool.starmap(download_mp3, list(zip(sources, data_web["titles"])))

# Process mp3 with whisper


with open("update_mp3.txt","w") as f:
    for mp3 in data_incomplete["mp3_path"]:
        path = mp3.split("audio/")[1]
        f.write(f"{path}\n")

In [None]:
# Display the new episodes that are about to be transcribed and added to the dataset.
data_web

In [None]:
# Write one line per new episode to update_mp3.txt: the part of its mp3 path
# that follows "audio/" (i.e. the bare file name).
with open("update_mp3.txt", "w") as f:
    f.writelines(
        f"{mp3.split('audio/')[1]}\n"
        for mp3 in data_web["mp3_path"]
    )

In [None]:
# Shell escape: run the project's transcription script.
# NOTE(review): presumably it reads the update_mp3.txt written in the previous cell - confirm.
! ../speech_to_text/transcribe_update.sh

# Add all relevant updates to the main dataframe

In [None]:
# Attach the scraped audio URLs to the new episodes, then append them to the stored dataset.
data_web = data_web.assign(sources=sources)
data = pd.concat([data, data_web])

In [None]:
# Highest episode number first, with a fresh 0..n-1 index.
data = (
    data
    .sort_values(by="episode", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Persist the extended dataset, overwriting the pickle that was loaded at the top of the notebook.
data.to_pickle("../extract_data/data.pickle")