In [2]:
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse
import tqdm
import json
import os

def get_html(url: str):
    # Returns bs4.BeautifulSoup object from url using Requests
    req = requests.get(url)
    soup = BeautifulSoup(req.content, 'html.parser')
    return soup

def get_mix_title(soup):
    # Returns title of show from bs4 object containing NTS 'shows' html
    return soup.find("h1").text

def format_name(name: str):
    # format name - for now just removed commas if last character
    if ',' in name:
        name = name.replace(',','')
    return name

def get_tracklist(soup):
    # Returns list containing basic track metadata (artist, track, url and episode number from bs4 object containing NTS 'episodes' html
    tracklist = []
    for element in soup.find_all("li"):
        artist = element.find("span",{"class":"track__artist"}).text
        track = element.find("span",{"class":"track__title"}).text

        tracklist.append({"artist": format_name(artist),
                          "track" : track})
    return tracklist

def is_valid_nts_url(url):
    """
    Validate if URL is a valid NTS URL
    """
    try:
        parsed = urllib.parse.urlparse(url)
        # check if the URL is well-formed
        if not all([parsed.scheme, parsed.netloc, parsed.path]):
            return False
        # check if the URL is https and domain is 'www.nts.live'
        if parsed.scheme != 'https' or parsed.netloc != 'www.nts.live':
            return False
        # check if URL path follows the expected structure
        if not re.match(r'^/shows/.+/episodes/.+$', parsed.path):
            return False
        return True
    except ValueError:
        return False

def get_mix_title_from_url(url: str):
    """
    Extract the show title from a NTS URL
    """
    # split the url by slashes and get the last part as the title
    title = url.split("/")[-1]

    return title

# Check if the program was interrupted previously by checking if the 'last_processed_line.txt' file exists
if os.path.exists('data/last_processed_line.txt'):
    with open('data/last_processed_line.txt', 'r') as f:
        start_line = int(f.read())
else:
    start_line = 0

with open('data/links.txt', 'r') as f:
    lines = f.readlines()

valid_urls = [line.strip() for line in lines if is_valid_nts_url(line.strip())]

with open('data/tracklists.txt', 'a') as f:  # use append mode in case of interruption
    for i, url in enumerate(tqdm.tqdm(valid_urls[start_line:])):
        try:
            html = get_html(url)
            title = get_mix_title_from_url(url)
            pretty_title = get_mix_title(html)
            tracklist = get_tracklist(html)
            f.write(json.dumps({"title": title, "pretty-title": pretty_title, "tracklist": tracklist}) + '\n')  

            # Write the last processed line number to a file
            with open('data/last_processed_line.txt', 'w') as lpl:
                lpl.write(str(start_line + i))
        except Exception as e:
            print(f'Error processing URL {url}: {e}')
            continue


 12%|█▏        | 3410/28919 [13:32<1:33:19,  4.56it/s]

Error processing URL https://www.nts.live/shows/babastiltz/episodes/babastiltz-invalid%20date: 'NoneType' object has no attribute 'text'


 19%|█▊        | 5355/28919 [21:05<1:26:16,  4.55it/s]

Error processing URL Https://www.nts.live/shows/guests/episodes/kẹo-keo-7th-november-2022: 'NoneType' object has no attribute 'text'


 19%|█▊        | 5394/28919 [21:13<1:24:41,  4.63it/s]

Error processing URL https://www.nts.live/shows/onotesla/episodes/onotesla-5th-november-2022: 'NoneType' object has no attribute 'text'


 20%|██        | 5786/28919 [22:50<1:27:40,  4.40it/s]

Error processing URL https://www.nts.live/shows/soup-to-nuts-ruf-dug/episodes/soup-to-nuts-ruf-dug-17th-october-2022: 'NoneType' object has no attribute 'text'


 22%|██▏       | 6293/28919 [24:49<1:33:47,  4.02it/s]

Error processing URL https://www.nts.live/shows/guests/episodes/a%C2%B2z-21st-september-2022: 'NoneType' object has no attribute 'text'


 24%|██▍       | 6973/28919 [27:29<1:17:03,  4.75it/s]

Error processing URL https://www.nts.live/shows/soup-to-nuts-ruf-dug/episodes/soup-to-nuts-ruf-dug-22nd-august-2022: 'NoneType' object has no attribute 'text'


 32%|███▏      | 9184/28919 [36:03<1:15:56,  4.33it/s]

Error processing URL https://www.nts.live/shows/guests/episodes/civilist-j%C3%A4vel-6th-may-2022: 'NoneType' object has no attribute 'text'


 37%|███▋      | 10633/28919 [41:41<1:02:27,  4.88it/s]

Error processing URL https://www.nts.live/shows/guests/episodes/nyokabi-kari%C5%A9ki-25th-february-2022: 'NoneType' object has no attribute 'text'


 39%|███▊      | 11137/28919 [43:39<1:04:34,  4.59it/s]

Error processing URL https://www.nts.live/shows/guests/episodes/indr%C4%97-jurgelevi%C4%8Di%C5%ABt%C4%97-2nd-february-2022: 'NoneType' object has no attribute 'text'


 62%|██████▏   | 18019/28919 [1:09:49<39:49,  4.56it/s]  

Error processing URL https://www.nts.live/shows//andy-butler/episodes/andy-butler-29th-january-2021: 'NoneType' object has no attribute 'text'


 73%|███████▎  | 21005/28919 [1:20:34<29:14,  4.51it/s]  

Error processing URL https://www.nts.live/shows/jen-monroe/episodes/getting-warmer-w-jen-monroe-5th-august-2020: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 73%|███████▎  | 21031/28919 [1:20:39<30:34,  4.30it/s]

Error processing URL Https://www.nts.live/shows/afterworld/episodes/spacebrat-4th-august-2020: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 73%|███████▎  | 21035/28919 [1:20:40<27:59,  4.69it/s]

Error processing URL https://www.nts.live/shows/alien-jams/episodes/alien-jams-4th-tueday-2020: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 73%|███████▎  | 21038/28919 [1:20:41<24:56,  5.27it/s]

Error processing URL https://www.nts.live/shows/channeling/episodes/channeling-4th-august-2020: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 73%|███████▎  | 21042/28919 [1:20:41<18:20,  7.16it/s]

Error processing URL https://www.nts.live/shows/prosumer/episodes/prosumer-4th-august-2020: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
Error processing URL https://www.nts.live/shows/angel-bat-dawid/episodes/angel-bat-dawid-4th-august-2020: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 73%|███████▎  | 21044/28919 [1:20:41<21:23,  6.13it/s]

Error processing URL https://www.nts.live/shows/ZULI/episodes/zuli-4th-august-2020: ('Connection aborted.', OSError(22, 'Invalid argument'))


 73%|███████▎  | 21048/28919 [1:20:42<25:56,  5.06it/s]

Error processing URL Https://www.nts.live/shows/rare-species/episodes/rare-species-4th-august-2020: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 73%|███████▎  | 21050/28919 [1:20:43<27:22,  4.79it/s]

Error processing URL Https://www.nts.live/shows/jazmin-garcia/episodes/como-la-flor-w-jazmin-3rd-august-2020: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 73%|███████▎  | 21062/28919 [1:20:45<28:02,  4.67it/s]

Error processing URL https://www.nts.live/shows/leila-samir/episodes/leila-samir-3rd-august-2020: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 82%|████████▏ | 23670/28919 [1:30:08<22:18,  3.92it/s]

Error processing URL https://www.nts.live/shows/questing-w-zakia/episodes/questing-w-zakia-29th-february-2020: 'NoneType' object has no attribute 'text'


 83%|████████▎ | 23944/28919 [1:31:09<18:44,  4.42it/s]

Error processing URL Https://www.nts.live/shows/slap-hour/episodes/the-slap-pack-w-dj-kita-indigo-sevyn-12th-february-2020: 'NoneType' object has no attribute 'text'


 85%|████████▍ | 24443/28919 [1:32:58<16:28,  4.53it/s]

Error processing URL https://www.nts.live/shows/questing-w-zakia/episodes/questing-w-zakia-18th-january-2020: 'NoneType' object has no attribute 'text'


 85%|████████▍ | 24468/28919 [1:33:03<15:36,  4.76it/s]

Error processing URL https://www.nts.live/shows/guests/episodes/na-o-mi-16th-january-2020: 'NoneType' object has no attribute 'text'


 86%|████████▋ | 24966/28919 [1:34:50<13:49,  4.77it/s]

Error processing URL https://www.nts.live/shows/coucou-chlo%C3%A9/episodes/coucou-chloe-5th-december-2019: 'NoneType' object has no attribute 'text'


 87%|████████▋ | 25089/28919 [1:35:17<15:00,  4.25it/s]

Error processing URL Https://www.nts.live/shows/kelsey-lu/episodes/pteropods-w-kelsey-lu-26th-november-2019: 'NoneType' object has no attribute 'text'


 87%|████████▋ | 25135/28919 [1:35:27<13:00,  4.85it/s]

Error processing URL https://www.nts.live/shows/questing-w-zakia/episodes/questing-w-zakia-23rd-november-2019: 'NoneType' object has no attribute 'text'


 87%|████████▋ | 25207/28919 [1:35:42<13:00,  4.75it/s]

Error processing URL Https://www.nts.live/shows/andre-power/episodes/getting-lost-in-foreign-places-w-andre-power-18th-novemer-2019: 'NoneType' object has no attribute 'text'


 87%|████████▋ | 25262/28919 [1:35:54<13:29,  4.52it/s]

Error processing URL Https://www.nts.live/shows/guests/episodes/moniquerene-16th-november-2019: 'NoneType' object has no attribute 'text'


 87%|████████▋ | 25282/28919 [1:35:59<14:20,  4.23it/s]

Error processing URL Https://www.nts.live/shows/ana-roxanne/episodes/spirit-people-w-angela-lin-14th-november-2019: 'NoneType' object has no attribute 'text'


 88%|████████▊ | 25564/28919 [1:37:04<17:26,  3.21it/s]

Error processing URL Https://www.nts.live/shows/dina-j/episodes/high-noon-w-dina-j-25th-october-2019: 'NoneType' object has no attribute 'text'


 88%|████████▊ | 25583/28919 [1:37:09<13:44,  4.04it/s]

Error processing URL Https://www.nts.live/shows/sweet-boys/episodes/sweetboy-25th-october-2019: 'NoneType' object has no attribute 'text'


 89%|████████▉ | 25848/28919 [1:38:05<17:24,  2.94it/s]

Error processing URL Https://www.nts.live/shows/guests/episodes/me-gusta-5th-october-2019: 'NoneType' object has no attribute 'text'


 89%|████████▉ | 25865/28919 [1:38:09<12:58,  3.92it/s]

Error processing URL Https://www.nts.live/shows/siren/episodes/siren-3rd-october-2019: 'NoneType' object has no attribute 'text'


 91%|█████████ | 26348/28919 [1:39:59<09:36,  4.46it/s]

Error processing URL Https://www.nts.live/shows/questing-w-zakia/episodes/questing-w-zakia-14th-june-2019: 'NoneType' object has no attribute 'text'


 91%|█████████▏| 26405/28919 [1:40:13<09:28,  4.42it/s]

Error processing URL Https://www.nts.live/shows/palmwineclub/episodes/palm-wine-club-8th-june-2019: 'NoneType' object has no attribute 'text'


 91%|█████████▏| 26406/28919 [1:40:13<10:13,  4.10it/s]

Error processing URL Https://www.nts.live/shows/the-new-funk-breakfast-show/episodes/the-poaetry-show-8th-june-2019: 'NoneType' object has no attribute 'text'


 93%|█████████▎| 26876/28919 [1:42:02<07:23,  4.61it/s]

Error processing URL Https://www.nts.live/shows/swing-ting/episodes/swing-ting-w-joeyb-and-dj-tappa-benz-13th-april-2019: 'NoneType' object has no attribute 'text'


 94%|█████████▍| 27237/28919 [1:43:28<05:38,  4.97it/s]

Error processing URL Https://www.nts.live/shows/circadian-rhythms/episodes/circadian-rhythms-14th-march-2019: 'NoneType' object has no attribute 'text'


 95%|█████████▌| 27572/28919 [1:44:47<05:20,  4.20it/s]

Error processing URL Https://www.nts.live/shows/guests/episodes/haleek-maul-12th-june-2018: 'NoneType' object has no attribute 'text'


 96%|█████████▋| 27855/28919 [1:45:53<03:56,  4.50it/s]

Error processing URL https://www.nts.live/shows/body-motion/episodes/body-motion-27th-april-2018: 'NoneType' object has no attribute 'text'


 96%|█████████▋| 27862/28919 [1:45:54<04:08,  4.26it/s]

Error processing URL https://www.nts.live/shows/young-turks/episodes/young-turks-27th-april-2018: 'NoneType' object has no attribute 'text'


 97%|█████████▋| 28024/28919 [1:46:34<03:31,  4.24it/s]

Error processing URL https://www.nts.live/shows/guests/episodes/bamao-yenda-9th-april-2018: 'NoneType' object has no attribute 'text'


 99%|█████████▉| 28610/28919 [1:48:52<01:13,  4.20it/s]

Error processing URL https://www.nts.live/shows/naomi/episodes/naomi-hello-dj-13th-december-2017: 'NoneType' object has no attribute 'text'


100%|██████████| 28919/28919 [1:50:07<00:00,  4.38it/s]
