## HTML Page to Text File

In [1]:
from bs4 import BeautifulSoup
import re

In [2]:
def _has_class_fragment(elem, fragment):
    """Return True if any CSS class on *elem* contains the substring *fragment*."""
    return any(fragment in c for c in elem.get("class", []))


def _strip_emphasis_marks(text):
    """Remove every literal ``*`` and ``_`` character from *text*."""
    return re.sub(r"[*_]", "", text)


def _collect_footnotes(soup):
    """Locate the footnote list and return ``(footnote_ol, footnotes)``.

    ``footnote_ol`` is the first ``<ol>`` whose class matches ``Footnotes_base``
    (or ``None``).  ``footnotes`` maps the 1-based position of each ``<li>``
    (as a string) to its text.  Links whose text contains "View in article"
    are removed from the tree before the text is read.
    """
    footnote_ol = soup.find("ol", class_=re.compile(r"Footnotes_base"))
    footnotes = {}
    if footnote_ol is None:
        return None, footnotes

    for i, li in enumerate(footnote_ol.find_all("li"), start=1):
        # Remove "View in article" links only, preserve other text
        for a in li.find_all("a"):
            if "View in article" in a.get_text(strip=True):
                a.decompose()
        # Get the remaining full text, including hyperlink text
        footnotes[str(i)] = li.get_text(" ", strip=True)
    return footnote_ol, footnotes


def _render_paragraph(paragraph, footnotes):
    """Return the text of a ``<p>`` tag with footnote links expanded inline.

    A footnote link (``<a data-component="footnote-link">``) whose ``href``
    contains ``footnote-<n>`` becomes `` [Footnote n: <text>] ``; every other
    child tag contributes its visible text.
    """
    # Imported here so the block is self-contained; bs4 is already a
    # dependency of this file.
    from bs4 import Comment

    parts = []
    for child in paragraph.children:
        if isinstance(child, Comment):
            # Comments are str subclasses; skip them so their content does not
            # leak into the output (get_text() already ignores them).
            continue
        if isinstance(child, str):
            parts.append(child)
        elif child.name == "a" and child.get("data-component") == "footnote-link":
            match = re.search(r"footnote-(\d+)", child.get("href", ""))
            if match:
                num = match.group(1)
                fn_text = footnotes.get(num, "")
                parts.append(f" [Footnote {num}: {fn_text}] ")
            else:
                parts.append(" " + child.get_text(strip=True) + " ")
        else:
            # Ordinary links and any other inline tag: keep their text only.
            parts.append(child.get_text(" ", strip=True))

    return _strip_emphasis_marks("".join(parts).strip())


def extract_bloomberg_content(html):
    """Convert an article HTML page into plain text.

    Args:
        html: The page markup as a string.

    Returns:
        A ``(text, title)`` tuple.  ``title`` is the text of the first ``<h1>``
        or ``"Untitled"`` when there is none.  ``text`` is the title followed
        by the extracted subheadings, block quotes, paragraphs and ordered
        list items, joined by blank lines, with footnotes inlined where they
        are referenced.
    """
    soup = BeautifulSoup(html, "lxml")

    # STEP 0: Get h1 title
    h1 = soup.find("h1")
    title = h1.get_text(" ", strip=True) if h1 is not None else "Untitled"

    # STEP 1: Extract footnotes
    footnote_ol, footnotes = _collect_footnotes(soup)

    # STEP 2: Parse content
    content_blocks = [title]
    # Fall back to the whole document when there is neither <main> nor <body>
    # (e.g. empty input), instead of failing on None.
    article_container = soup.find("main") or soup.body or soup

    for elem in article_container.find_all(recursive=True):
        # Stop at the footnote list itself.  Identity is required here: "=="
        # on tags compares structure and could match a look-alike list.
        if elem is footnote_ol:
            break

        # SUBHEADINGS
        if elem.name in ("h2", "h3") and _has_class_fragment(elem, "Subhead_subhead"):
            content_blocks.append(f"\n\n{elem.get_text(strip=True)}:")

        # BLOCKQUOTES
        elif elem.name == "blockquote" and _has_class_fragment(elem, "Blockquote_blockquote"):
            paragraphs = elem.find_all("p")
            if paragraphs:
                quote = "\n\n".join(p.get_text(" ", strip=True) for p in paragraphs)
            else:
                quote = elem.get_text(" ", strip=True)
            quote = _strip_emphasis_marks(quote)
            content_blocks.append(f"start quote\n{quote}\nend quote")

        # PARAGRAPHS (skip inside blockquotes to avoid duplication)
        elif elem.name == "p":
            if elem.find_parent("blockquote"):
                continue
            content_blocks.append(_render_paragraph(elem, footnotes))

        # ORDERED LISTS
        elif elem.name == "ol" and _has_class_fragment(elem, "OrderedList_orderedList"):
            for i, li in enumerate(elem.find_all("li"), start=1):
                item = _strip_emphasis_marks(li.get_text(" ", strip=True))
                content_blocks.append(f"{i}. {item}")

    return "\n\n".join(content_blocks), title

In [None]:
from pathlib import Path

# Convert every HTML file in input/ to a text file in output/.
directory_path = Path('input/')
output_path = Path("output")

if directory_path.is_dir():
    # Make sure the destination exists before the first write.
    output_path.mkdir(parents=True, exist_ok=True)

    for entry in directory_path.iterdir():
        # Skip sub-directories (e.g. notebook checkpoint folders); only
        # regular files can be read as HTML.
        if not entry.is_file():
            continue

        html = entry.read_text(encoding="utf-8")
        article, title = extract_bloomberg_content(html)
        # Drop whitespace that ended up directly before punctuation.
        article = re.sub(r"\s+([.,;:!?])", r"\1", article)

        # Titles may contain characters that are path separators or are not
        # allowed in file names; replace them so the write cannot fail or
        # land in another directory.
        safe_title = re.sub(r'[\\/:*?"<>|]', "-", title)
        target = output_path / f"Money Stuff - {safe_title}.txt"
        # Write with the same encoding used for reading, not the platform default.
        target.write_text(article, encoding="utf-8")

In [None]:
# Show the text held in `article`, i.e. the last file converted by the loop
# in the previous cell. If that loop processed no file, `article` was never
# assigned there and this raises NameError.
print(article)