## Alternative version with direct JSON scraping

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse
import tqdm
import json
import os

def get_html(url: str, timeout: float = 30):
    """Fetch *url* with Requests and return it parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Requests
            waits indefinitely when no timeout is given, which would stall
            a long scraping run on a single unresponsive connection.

    Returns:
        A ``bs4.BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx response status.
    """
    req = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as content.
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'html.parser')
    return soup

def get_json(soup):
    """Extract the JSON state embedded in the page's ``react-state`` script.

    The script body is expected to look like
    ``window._REACT_STATE_ = {...};`` -- a JavaScript assignment wrapping a
    JSON document.

    Args:
        soup: Parsed page; must support ``soup.find('script', id=...)``.

    Returns:
        The decoded JSON value, or ``None`` when the script tag is missing
        or has no text content.

    Raises:
        json.JSONDecodeError: If the script text is not valid JSON once the
            assignment prefix and trailing semicolon are removed.
    """
    # Find script tag with the ID 'react-state'
    script_tag = soup.find('script', id='react-state')
    if script_tag is None:
        return None

    # `.string` is None when the tag is empty or has several children.
    json_text = script_tag.string
    if not json_text or not json_text.strip():
        return None
    json_text = json_text.strip()

    # Drop the leading JavaScript assignment. Anchoring at the start keeps
    # the payload itself untouched and tolerates any spacing around '='.
    json_text = re.sub(r'^window\._REACT_STATE_\s*=\s*', '', json_text)

    # Drop the statement terminator (and whitespace around it), if present.
    json_text = json_text.rstrip(';').rstrip()

    return json.loads(json_text)

def is_valid_nts_url(url):
    """
    Tell whether `url` points at an NTS episode page.

    A URL qualifies only if it parses cleanly, has a scheme, host and path,
    is served over https from 'www.nts.live', and its path has the form
    '/shows/<show>/episodes/<episode>'. Anything else -- including input
    that urlparse rejects with ValueError -- yields False.
    """
    episode_path = r'^/shows/.+/episodes/.+$'
    try:
        parts = urllib.parse.urlparse(url)
        # All three components must be non-empty for a well-formed URL.
        well_formed = bool(parts.scheme and parts.netloc and parts.path)
        return (
            well_formed
            and parts.scheme == 'https'
            and parts.netloc == 'www.nts.live'
            and re.match(episode_path, parts.path) is not None
        )
    except ValueError:
        return False

# Resume support: the checkpoint file holds the number of validated URLs that
# have already been written out, i.e. the index of the next URL to process.
# The same path is used for the existence check, the read and the write so
# that an interrupted run can actually be picked up again.
checkpoint_path = '../data/last_processed_line.txt'

if os.path.exists(checkpoint_path):
    with open(checkpoint_path, 'r') as f:
        # An empty checkpoint (e.g. interrupted mid-write) means "start over".
        start_line = int(f.read().strip() or 0)
else:
    start_line = 0

with open('../data/links.txt', 'r') as f:
    lines = f.readlines()

# Keep only well-formed NTS episode URLs, with surrounding whitespace removed.
valid_urls = [line.strip() for line in lines if is_valid_nts_url(line.strip())]

with open('../data/tracklists_json.txt', 'a') as f:  # use append mode in case of interruption
    for i, url in enumerate(tqdm.tqdm(valid_urls[start_line:])):
        try:
            html = get_html(url)
            json_data = get_json(html)
            # One JSON document per line; `null` is written when the page
            # carried no react-state payload.
            f.write(json.dumps(json_data) + '\n')
            # Push the record to disk before advancing the checkpoint so the
            # checkpoint never gets ahead of the saved data.
            f.flush()

            # Store the index of the NEXT URL to process; storing the current
            # index would make a resumed run scrape this URL a second time.
            with open(checkpoint_path, 'w') as lpl:
                lpl.write(str(start_line + i + 1))
        except Exception as e:
            # Best-effort scrape: report the failure and move on to the next URL.
            print(f'Error processing URL {url}: {e}')
            continue


 83%|████████▎ | 24049/28919 [1:35:59<16:38,  4.88it/s]  

Error processing URL https://www.nts.live/shows/aldous-rh/episodes/aldous-rh-8th-february-2020: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


100%|██████████| 28919/28919 [1:56:25<00:00,  4.14it/s]
