In [1]:
from bs4 import BeautifulSoup
import re
import json

In [2]:
# Characters scrubbed from every extracted text fragment (asterisks and underscores).
_EMPHASIS_MARKS = re.compile(r"[*_]")
# Pulls the footnote number out of an in-text footnote link's href.
_FOOTNOTE_HREF = re.compile(r"footnote-(\d+)")
# Class fragment that marks the ordered lists rendered as numbered items.
_ORDERED_LIST_CLASS = re.compile(r"OrderedList_orderedList")


def _has_class_fragment(tag, fragment):
    """Return True when any CSS class on ``tag`` contains ``fragment`` as a substring."""
    return any(fragment in css_class for css_class in tag.get("class", []))


def _strip_emphasis_marks(text):
    """Remove every ``*`` and ``_`` character from ``text``."""
    return _EMPHASIS_MARKS.sub("", text)


def _is_or_inside(elem, container):
    """Return True when ``elem`` is ``container`` itself or one of its descendants.

    Uses identity (``is``) on purpose: BeautifulSoup's ``==`` compares tags by
    markup, so two distinct but identical-looking tags would compare equal.
    """
    if container is None:
        return False
    return elem is container or any(parent is container for parent in elem.parents)


def _extract_subtitle(soup):
    """Locate the subtitle element and return ``(element_or_None, subtitle_text)``.

    Tries a class matching ``HedAndDek_dek`` first and falls back to the
    ``o-topper__standfirst`` class; the text is ``""`` when neither exists.
    """
    subtitle_elem = soup.find(class_=re.compile(r"HedAndDek_dek"))
    if subtitle_elem is None:
        subtitle_elem = soup.find(class_="o-topper__standfirst")
    subtitle = subtitle_elem.get_text(" ", strip=True) if subtitle_elem else ""
    return subtitle_elem, subtitle


def _extract_footnotes(soup):
    """Return ``(footnote_ol_or_None, {"1": text, "2": text, ...})``.

    Footnotes are numbered by their position in the list, starting at 1.
    "View in article" back-links are removed from the tree (the soup is
    mutated) so they do not end up in the footnote text.
    """
    footnotes = {}
    footnote_ol = soup.find("ol", class_=re.compile(r"Footnotes_base"))
    if footnote_ol is not None:
        for number, li in enumerate(footnote_ol.find_all("li"), start=1):
            for link in li.find_all("a"):
                if "View in article" in link.get_text(strip=True):
                    link.decompose()
            footnotes[str(number)] = li.get_text(" ", strip=True)
    return footnote_ol, footnotes


def _render_paragraph(paragraph, footnotes):
    """Flatten a ``<p>`` tag to text, inlining footnotes next to their markers.

    Only direct children are inspected: bare strings are kept verbatim, a
    footnote link (``data-component="footnote-link"``) becomes
    ``" [Footnote N: text] "``, and any other child tag contributes its text.
    """
    parts = []
    for child in paragraph.children:
        if isinstance(child, str):
            # NOTE(review): NavigableString subclasses such as HTML comments are
            # also ``str`` instances and are kept here - confirm that is wanted.
            parts.append(child)
        elif child.name == "a" and child.get("data-component") == "footnote-link":
            match = _FOOTNOTE_HREF.search(child.get("href", ""))
            if match:
                number = match.group(1)
                parts.append(f" [Footnote {number}: {footnotes.get(number, '')}] ")
            else:
                # Footnote link without a parsable number: keep its visible text.
                parts.append(" " + child.get_text(strip=True) + " ")
        else:
            parts.append(child.get_text(" ", strip=True))
    return _strip_emphasis_marks("".join(parts).strip())


def html_to_json(html):
    """Convert an article's HTML into a title / subtitle / sections structure.

    Args:
        html: The article markup as a string.

    Returns:
        A dict with keys ``"title"`` (text of the first ``<h1>`` or
        ``"Untitled"``), ``"subtitle"`` (``""`` when none is found) and
        ``"sections"``: a list of ``{"title": str, "content": [str, ...]}``
        dicts. Content before the first subheading is filed under the article
        title; sections without content are omitted. Block quotes are wrapped
        in ``<start quote>`` / ``<end quote>`` markers and ordered-list items
        are prefixed with ``"N. "``.
    """
    soup = BeautifulSoup(html, "lxml")

    # MAIN TITLE
    h1 = soup.find("h1")
    title = h1.get_text(" ", strip=True) if h1 else "Untitled"

    # SUBTITLE
    subtitle_elem, subtitle = _extract_subtitle(soup)

    # FOOTNOTES
    footnote_ol, footnotes = _extract_footnotes(soup)

    # SECTIONS
    article_container = soup.find("main") or soup.body
    sections = []
    current_section = {"title": title, "content": []}

    # Without <main> or <body> (e.g. empty input) there is nothing to walk.
    elements = article_container.find_all(True) if article_container is not None else []

    for elem in elements:
        # Stop at the footnote list itself; identity avoids stopping early at a
        # different element that merely has the same markup.
        if elem is footnote_ol:
            break

        # SUBHEADINGS
        if elem.name in ("h2", "h3") and _has_class_fragment(elem, "Subhead_subhead"):
            if current_section["content"]:
                sections.append(current_section)
            current_section = {"title": elem.get_text(strip=True), "content": []}

        # BLOCKQUOTES
        elif elem.name == "blockquote" and _has_class_fragment(elem, "Blockquote_blockquote"):
            paragraphs = elem.find_all("p")
            if paragraphs:
                quote = "\n\n".join(p.get_text(" ", strip=True) for p in paragraphs)
            else:
                quote = elem.get_text(" ", strip=True)
            quote = _strip_emphasis_marks(quote)
            current_section["content"].append(f"<start quote>\n{quote}\n<end quote>")

        # PARAGRAPHS
        elif elem.name == "p":
            # The subtitle is reported separately, whether it is this <p> itself
            # or a wrapper around it.
            if _is_or_inside(elem, subtitle_elem):
                continue
            # Paragraphs inside a block quote were already emitted with the quote.
            if elem.find_parent("blockquote"):
                continue
            # Paragraphs inside a numbered list are emitted with their list item.
            if elem.find_parent("ol", class_=_ORDERED_LIST_CLASS):
                continue
            clean_para = _render_paragraph(elem, footnotes)
            if clean_para:
                current_section["content"].append(clean_para)

        # ORDERED LISTS
        elif elem.name == "ol" and _has_class_fragment(elem, "OrderedList_orderedList"):
            for number, li in enumerate(elem.find_all("li"), start=1):
                item = _strip_emphasis_marks(li.get_text(" ", strip=True))
                current_section["content"].append(f"{number}. {item}")

    if current_section["content"]:
        sections.append(current_section)

    return {
        "title": title,
        "subtitle": subtitle,
        "sections": sections
    }

In [3]:
# Read the saved article HTML; the input handle is closed before parsing starts.
with open("../input/Tech venture firms deploy private equity ‘roll-up’ strategy.html", "r", encoding="utf-8") as source_file:
    html = source_file.read()

structured_json = html_to_json(html)

# Mode "w" (not "x") keeps the cell re-runnable: "x" raises FileExistsError as
# soon as the output from a previous run is present.
with open("Tech venture firms deploy private equity ‘roll-up’ strategy.json", "w", encoding="utf-8") as output_file:
    json.dump(structured_json, output_file, indent=4)

In [4]:
# Read the saved article HTML; the input handle is closed before parsing starts.
with open("../input/A Drug-Trial Stock Sale - Bloomberg.html", "r", encoding="utf-8") as source_file:
    html = source_file.read()

structured_json = html_to_json(html)

# Mode "w" (not "x") keeps the cell re-runnable: "x" raises FileExistsError as
# soon as the output from a previous run is present.
with open("A Drug-Trial Stock Sale.json", "w", encoding="utf-8") as output_file:
    json.dump(structured_json, output_file, indent=4)