In [1]:
from bs4 import BeautifulSoup
import re
import json
from pathlib import Path

In [2]:
# Patterns and markers shared by the helpers below (compiled once, not per element).
_FOOTNOTES_CLASS = re.compile(r"Footnotes_base")
_SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.!?;:])')
_CONTENT_TAGS = ['h2', 'h3', 'blockquote', 'p', 'ol']
_SIGNUP_PREFIX = "If you'd like to get Money Stuff in handy email form"


def _tidy_text(elem):
    """Return elem's text joined with single spaces, with whitespace before punctuation removed."""
    raw_text = elem.get_text(separator=' ', strip=True)
    return _SPACE_BEFORE_PUNCT.sub(r'\1', raw_text)


def _collect_footnotes(footnote_ol):
    """Map "1", "2", ... to the text of each <li> of the footnotes list (empty dict if no list)."""
    footnotes = {}
    if footnote_ol is None:
        return footnotes
    for i, li in enumerate(footnote_ol.find_all("li"), start=1):
        # Drop the "View in article" back-links so they do not end up in the footnote text.
        for a in li.find_all("a"):
            if "View in article" in a.get_text(strip=True):
                a.decompose()
        footnotes[str(i)] = li.get_text(" ", strip=True)
    return footnotes


def _is_footnote_list(elem):
    """True when elem is itself an <ol> carrying a Footnotes_base class."""
    return elem.name == "ol" and any(
        _FOOTNOTES_CLASS.search(c) for c in elem.get("class", [])
    )


def html_to_json(html):
    """Convert an article's HTML into a JSON-serialisable dict.

    Args:
        html: The HTML markup of the page, as a string.

    Returns:
        A tuple ``(article, title)`` where ``article`` is
        ``{"title": str, "subtitle": str, "sections": [{"title": str, "content": [str, ...]}]}``
        and ``title`` is the same string as ``article["title"]``.
        Block quotes are wrapped in ``<start quote>`` / ``<end quote>`` markers,
        ordered-list items are emitted as ``"1. ..."`` strings, and footnote
        links are replaced inline by ``[Footnote N: text]``.
    """
    soup = BeautifulSoup(html, "lxml")

    # Get h1 title; an <h1> with no text falls back to "Untitled" as well.
    h1 = soup.find("h1")
    title = (h1.get_text(" ", strip=True) if h1 else "") or "Untitled"

    # Get subtitle: first the HedAndDek_dek element, otherwise the standfirst.
    subtitle_elem = soup.find(class_=_FOOTNOTES_CLASS.__class__ and re.compile(r"HedAndDek_dek"))
    if subtitle_elem is None:
        subtitle_elem = soup.find(class_="o-topper__standfirst")
    subtitle = subtitle_elem.get_text(" ", strip=True) if subtitle_elem else ""

    # Get footnotes
    footnote_ol = soup.find("ol", class_=_FOOTNOTES_CLASS)
    footnotes = _collect_footnotes(footnote_ol)

    # Parse content
    article_container = soup.find("main") or soup.body
    if article_container is None:
        # Empty or body-less input: nothing to walk.
        return {"title": title, "subtitle": subtitle, "sections": []}, title

    sections = []
    current_section = {"title": title, "content": []}

    # Subheadings, block quotes, paragraphs, ordered lists (document order).
    for elem in article_container.find_all(_CONTENT_TAGS, recursive=True):
        # Footnotes are inlined at their reference, so skip the footnotes list
        # itself as well as anything nested inside it.
        if footnote_ol and (
            _is_footnote_list(elem)
            or elem.find_parent("ol", class_=_FOOTNOTES_CLASS)
        ):
            continue

        if elem.name in ("h2", "h3"):
            # Only headings with a Subhead_subhead class start a new section;
            # other h2/h3 elements contribute nothing.
            if any("Subhead_subhead" in c for c in elem.get("class", [])):
                if current_section["content"]:
                    sections.append(current_section)
                current_section = {"title": elem.get_text(strip=True), "content": []}
            continue

        # Replace each footnote link with the footnote's text, in place.
        for fn_link in elem.find_all("a", attrs={"data-component": "footnote-link"}):
            match = re.search(r"footnote-(\d+)", fn_link.get("href", ""))
            if match:
                num = match.group(1)
                fn_text = footnotes.get(num, "")
                fn_link.replace_with(f" [Footnote {num}: {fn_text}] ")

        clean_text = _tidy_text(elem)
        if not clean_text:
            continue

        # The sign-up paragraph marks the end of the article body.
        if elem.name == "p" and clean_text.startswith(_SIGNUP_PREFIX):
            break

        # An element nested in a block quote or ordered list was already emitted
        # as part of that ancestor's text; emitting it again would duplicate it.
        if elem.find_parent(["blockquote", "ol"]):
            continue

        if elem.name == "blockquote":
            current_section["content"].append(f"<start quote>\n{clean_text}\n<end quote>")

        elif elem.name == "p":
            if subtitle_elem and clean_text == subtitle:
                continue
            current_section["content"].append(clean_text)

        elif elem.name == "ol":
            list_items = []
            for li in elem.find_all("li"):
                # Keep only this list's own items; items of nested lists are
                # already contained in their parent item's text.
                if li.find_parent(["ol", "ul"]) is not elem:
                    continue
                li_clean_text = _tidy_text(li)
                if li_clean_text:
                    list_items.append(li_clean_text)

            for i, item_text in enumerate(list_items, start=1):
                current_section["content"].append(f"{i}. {item_text}")

    if current_section["content"]:
        sections.append(current_section)

    return {"title": title, "subtitle": subtitle, "sections": sections}, title

In [3]:
# Convert every file in input/ and write one JSON file per article to output/.
directory_path = Path('input/')
output_path = Path('output/')

if directory_path.is_dir():
    # Create the destination up front so the first write cannot fail on a missing folder.
    output_path.mkdir(parents=True, exist_ok=True)

    # Sorted so that runs are deterministic (matters when two articles share a title).
    for entry in sorted(directory_path.iterdir()):
        # Skip sub-directories (e.g. .ipynb_checkpoints) and hidden files such as .DS_Store.
        if not entry.is_file() or entry.name.startswith("."):
            continue

        html = entry.read_text(encoding="utf-8")
        article, title = html_to_json(html)

        # A path separator in the title would point the file into a non-existent
        # sub-directory, so replace separators (and NUL) before building the name.
        safe_title = re.sub(r'[\\/\x00]+', "-", title).strip() or entry.stem

        with open(output_path / f"Money Stuff - {safe_title}.json", "w", encoding="utf-8") as f:
            json.dump(article, f, indent=4)