## HTML Page to Text File with Summary & Context

In [1]:
from google import genai
from google.genai.types import HttpOptions
from bs4 import BeautifulSoup
import re
import config

# Module-level Gemini client, referenced later by the summarization step.
# The API key is read from the local ``config`` module and the client is
# configured to use the "v1alpha" API version.
client = genai.Client(api_key=config.GEMINI_API_KEY,
                     http_options=HttpOptions(api_version="v1alpha"))

In [2]:
# Characters stripped from extracted text ("*" and "_").
_EMPHASIS_MARKS = re.compile(r"[*_]")
# Captures the footnote number from a footnote link's href.
_FOOTNOTE_HREF = re.compile(r"footnote-(\d+)")


def _has_class_fragment(elem, fragment):
    """Return True when any CSS class on *elem* contains *fragment*."""
    return any(fragment in css_class for css_class in elem.get("class", []))


def _collect_footnotes(footnote_ol):
    """Map 1-based footnote numbers (as strings) to footnote text.

    Links whose text contains "View in article" are removed from the tree
    before the text of each list item is read.
    """
    footnotes = {}
    if footnote_ol is None:
        return footnotes

    for number, li in enumerate(footnote_ol.find_all("li"), start=1):
        for link in li.find_all("a"):
            if "View in article" in link.get_text(strip=True):
                link.decompose()
        footnotes[str(number)] = li.get_text(" ", strip=True)
    return footnotes


def _render_paragraph(paragraph, footnotes):
    """Flatten a <p> tag to plain text, inlining referenced footnotes."""
    parts = []
    for child in paragraph.children:
        if isinstance(child, str):
            # Raw text nodes keep their original spacing.
            parts.append(child)
        elif child.name == "a" and child.get("data-component") == "footnote-link":
            match = _FOOTNOTE_HREF.search(child.get("href", ""))
            if match:
                number = match.group(1)
                parts.append(f" [Footnote {number}: {footnotes.get(number, '')}] ")
            else:
                parts.append(" " + child.get_text(strip=True) + " ")
        else:
            # Ordinary links and any other inline tag contribute their text.
            parts.append(child.get_text(" ", strip=True))

    text = "".join(parts).strip()
    return _EMPHASIS_MARKS.sub("", text)


def extract_bloomberg_content_structured(html):
    """Parse an article's HTML into titled sections of plain-text content.

    The document is parsed with the "lxml" parser. The article title is the
    text of the first <h1> ("Untitled" when there is none). Content is read
    from <main>, falling back to <body>, in document order:

    * <h2>/<h3> tags with a class containing "Subhead_subhead" start a new
      section;
    * <blockquote> tags with a class containing "Blockquote_blockquote" are
      wrapped in "start quote" / "end quote" markers;
    * <p> tags outside blockquotes become paragraphs, with footnote links
      replaced by "[Footnote N: text]";
    * <ol> tags with a class containing "OrderedList_orderedList" become
      numbered lines.

    Traversal stops at the footnote list (an <ol> whose class matches
    "Footnotes_base"). "*" and "_" characters are removed from all text.

    Args:
        html: The HTML document as a string.

    Returns:
        A ``(sections, title)`` tuple. ``sections`` is a list of dicts with
        keys ``"title"`` and ``"content"`` (a list of strings); sections
        without content are omitted. The list is empty when the document has
        neither <main> nor <body>.
    """
    soup = BeautifulSoup(html, "lxml")

    # STEP 0: Get h1 title
    h1 = soup.find("h1")
    title = h1.get_text(" ", strip=True) if h1 is not None else "Untitled"

    # STEP 1: Extract footnotes
    footnote_ol = soup.find("ol", class_=re.compile(r"Footnotes_base"))
    footnotes = _collect_footnotes(footnote_ol)

    # STEP 2: Parse content into structured sections
    article_container = soup.find("main") or soup.body
    if article_container is None:
        # Empty or degenerate input: nothing to walk, but keep the return shape.
        return [], title

    sections = []
    current_section = {"title": title, "content": []}

    for elem in article_container.find_all(recursive=True):
        # Identity check: Beautiful Soup's "==" compares markup structurally,
        # so an unrelated but identical-looking tag would also match.
        if elem is footnote_ol:
            break

        # SUBHEADINGS
        if elem.name in ("h2", "h3") and _has_class_fragment(elem, "Subhead_subhead"):
            if current_section["content"]:
                sections.append(current_section)
            current_section = {"title": elem.get_text(strip=True), "content": []}

        # BLOCKQUOTES
        elif elem.name == "blockquote" and _has_class_fragment(elem, "Blockquote_blockquote"):
            paragraphs = elem.find_all("p")
            if paragraphs:
                quote = "\n\n".join(p.get_text(" ", strip=True) for p in paragraphs)
            else:
                quote = elem.get_text(" ", strip=True)
            quote = _EMPHASIS_MARKS.sub("", quote)
            current_section["content"].append(f"start quote\n{quote}\nend quote")

        # PARAGRAPHS
        elif elem.name == "p":
            # Paragraphs inside a blockquote were already emitted with the quote.
            if elem.find_parent("blockquote"):
                continue
            current_section["content"].append(_render_paragraph(elem, footnotes))

        # ORDERED LISTS
        elif elem.name == "ol" and _has_class_fragment(elem, "OrderedList_orderedList"):
            for number, li in enumerate(elem.find_all("li"), start=1):
                item = _EMPHASIS_MARKS.sub("", li.get_text(" ", strip=True))
                current_section["content"].append(f"{number}. {item}")

    if current_section["content"]:
        sections.append(current_section)

    return sections, title

In [3]:
def insert_summaries_into_sections(sections, title):
    """Add a Gemini-generated ``"summary"`` entry to each eligible section.

    Sections are visited in order. A section whose title equals *title* is
    skipped, and processing stops entirely at the first section titled
    "things happen" (case-insensitive, surrounding whitespace ignored).
    For every other section the joined content is sent to the
    "gemini-2.5-flash" model; if anything raises during the request or while
    reading the reply, the failure is printed and a placeholder summary is
    stored instead.

    Args:
        sections: List of dicts with ``"title"`` and ``"content"`` keys;
            mutated in place.
        title: The main article title.

    Returns:
        The same *sections* list that was passed in.
    """
    for section in sections:
        heading = section["title"]

        # The block carrying the article's own title is never summarized.
        if heading == title:
            continue

        # Nothing from "Things happen" onward receives a summary.
        if heading.strip().lower() == "things happen":
            break

        content_text = "\n\n".join(section["content"])

        prompt = f"""
You are a helpful summarizer and analyst.

Given the following section of a Bloomberg newsletter, do the following:
1. Provide a concise summary in 2-3 sentences.
2. Identify the main topics or themes discussed.
3. Preserve any important nuance, tone, or humor. 
4. Describe the broader context, article-level insight and background.
5. Mention any notable financial terms or references.
6. Write this all into a passage in plain simple text without markdown formatting so it can be read directly by a text-to-speech application
7. Avoid repeating quotes — just explain what's being said in simpler terms.

Here is the section:
{content_text}
"""

        try:
            reply = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=prompt,
            )
            summary = reply.text.strip()
        except Exception as e:
            print(f"❌ Gemini failed for section '{section['title']}': {e}")
            summary = "[Summary unavailable due to API error]"

        section["summary"] = summary

    return sections

In [4]:
def write_sections_with_summaries(sections, output_path):
    """Write sections, with any summaries, to a UTF-8 text file.

    Each section is written as its title followed by ":" and a blank line.
    For every section except the first, a stored ``"summary"`` is written
    between "Summary of <title>" and "End of summary" markers. The section's
    paragraphs follow, each terminated by a blank line, with whitespace
    immediately before ``. , ; : ! ?`` removed.

    Args:
        sections: List of dicts with ``"title"`` and ``"content"`` keys and
            an optional ``"summary"`` key.
        output_path: Destination file; overwritten if it exists.
    """
    space_before_punct = re.compile(r"\s+([.,;:!?])")

    with open(output_path, "w", encoding="utf-8") as out:
        for position, section in enumerate(sections):
            # Title (always written)
            out.write(section["title"] + ":\n\n")

            # The first section never gets a summary block, even if one is stored.
            has_summary = position > 0 and "summary" in section
            if has_summary:
                out.write(f"Summary of {section['title']}\n\n")
                out.write(section["summary"] + "\n\n")
                out.write("End of summary\n\n")

            # Section content
            out.writelines(
                space_before_punct.sub(r"\1", text) + "\n\n"
                for text in section["content"]
            )

In [5]:
from pathlib import Path

# Folder scanned for saved article HTML, and folder receiving the text output.
directory_path = Path('input/')
output_dir = Path("output")

if directory_path.is_dir():
    # Create the destination up front so the first write cannot fail on a
    # missing folder.
    output_dir.mkdir(parents=True, exist_ok=True)

    for entry in directory_path.iterdir():
        # Skip subdirectories and hidden files (e.g. editor/OS metadata);
        # they cannot be read as UTF-8 HTML.
        if not entry.is_file() or entry.name.startswith("."):
            continue

        with open(entry, "r", encoding="utf-8") as f:
            html = f.read()

        article, title = extract_bloomberg_content_structured(html)
        article = insert_summaries_into_sections(article, title)

        # A path separator inside the title would redirect the write into a
        # (probably nonexistent) subdirectory, so replace it in the filename.
        safe_title = re.sub(r"[\\/]", "-", title)
        write_sections_with_summaries(
            article, str(output_dir / f"Money Stuff - {safe_title}.txt")
        )