## HTML Page to Text File with Summary & Context

In [1]:
import json
from google import genai
from google.genai.types import HttpOptions
import config

# Module-level Gemini client (referenced by main() further down). The API key
# comes from the local `config` module and the HTTP options select the
# "v1alpha" API version.
client = genai.Client(api_key=config.GEMINI_API_KEY,
                     http_options=HttpOptions(api_version="v1alpha"))

In [2]:
def load_json(file_path):
    """Parse the UTF-8 encoded JSON document stored at ``file_path``.

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).
    """
    with open(file_path, encoding="utf-8") as source:
        raw_text = source.read()
    return json.loads(raw_text)

def save_json(data, file_path):
    """Serialize ``data`` as pretty-printed JSON into ``file_path``.

    The file is (over)written as UTF-8, indented by four spaces, and
    non-ASCII characters are written literally rather than as ``\\uXXXX``
    escapes.

    Args:
        data: Any JSON-serializable value.
        file_path: Destination path for the JSON file.
    """
    with open(file_path, mode="w", encoding="utf-8") as sink:
        json.dump(data, sink, ensure_ascii=False, indent=4)
        
def create_summary_prompt(section_content):
    """Build the summarization prompt for one newsletter section.

    The prompt is a fixed list of numbered instructions followed by the
    section text fenced between two ``---`` lines. It begins and ends with
    a newline.

    Args:
        section_content: Text of the section to embed in the prompt.

    Returns:
        The complete prompt string.
    """
    prompt_lines = (
        "You are a helpful summarizer and analyst.",
        "",
        "Given the following section of a Bloomberg newsletter, do the following:",
        "1. Provide a concise summary in 2-3 sentences.",
        "2. Identify the main topics or themes discussed.",
        # The trailing space on the next line is part of the prompt text.
        "3. Preserve any important nuance, tone, or humor. ",
        "4. Describe the broader context, article-level insight and background.",
        "5. Mention any notable financial terms or references.",
        "6. Write this all into a passage in plain simple text without markdown formatting so it can be read directly by a text-to-speech application",
        "7. Avoid repeating quotes — just explain what's being said in simpler terms.",
        "",
        "Here is the section:",
        "---",
        f"{section_content}",
        "---",
    )
    return "\n" + "\n".join(prompt_lines) + "\n"

In [3]:
def summarize_article_sections(sections, main_title, gemini_client, stop_phrase=None,
                               model="gemini-2.5-flash"):
    """Attach a Gemini-generated ``"summary"`` to each summarizable section.

    Sections are visited in order and mutated in place:

    * Iteration stops at the first section whose title equals
      ``stop_phrase`` (case-insensitive, surrounding whitespace ignored).
    * A section whose title equals ``main_title`` (surrounding whitespace
      ignored) is skipped. With an empty ``main_title`` this also skips
      untitled sections.
    * A section with missing, ``None``, empty or whitespace-only content is
      skipped without calling the API.
    * If the API call fails or returns no text, the error is printed and a
      placeholder summary is stored, so one bad section does not abort the
      run.

    Args:
        sections: List of section dicts; each may have a ``"title"`` string
            and a ``"content"`` list of paragraph strings (a single string
            is treated as one paragraph).
        main_title: Title of the whole article.
        gemini_client: Object exposing ``models.generate_content(model=...,
            contents=...)`` whose result has a ``text`` attribute.
        stop_phrase: Optional section title at which to stop processing.
        model: Name of the Gemini model to query.

    Returns:
        The same ``sections`` list that was passed in.
    """
    # Normalize once, outside the loop. Section titles are stripped below, so
    # the values they are compared with must be stripped too.
    normalized_main_title = main_title.strip() if isinstance(main_title, str) else main_title
    normalized_stop = stop_phrase.strip().lower() if stop_phrase else None

    for section in sections:
        # `or ""` also covers an explicit `"title": null` in the JSON.
        section_title = (section.get("title") or "").strip()

        if normalized_stop and section_title.lower() == normalized_stop:
            break

        if section_title == normalized_main_title:
            continue

        paragraphs = section.get("content") or []
        if isinstance(paragraphs, str):
            # Joining a bare string would interleave "\n\n" between characters.
            paragraphs = [paragraphs]
        content_text = "\n\n".join(paragraphs)

        # Do not spend an API call on sections with no real text.
        if not content_text.strip():
            continue

        prompt = create_summary_prompt(content_text)

        try:
            response = gemini_client.models.generate_content(
                model=model,
                contents=prompt
            )
            text = response.text
            if text is None:
                # Report an empty response explicitly instead of an opaque
                # AttributeError from calling .strip() on None.
                raise ValueError("response contained no text")
            section["summary"] = text.strip()
        except Exception as e:
            # Deliberately broad: best-effort per section, keep processing.
            print(f"  -> ❌ Gemini API failed for section '{section_title}': {e}")
            section["summary"] = "[Summary unavailable due to an API error]"

    return sections

In [4]:
def main(input_path, output_path, stop_at_title="Things happen"):
    """Summarize the sections of one article JSON file and save the result.

    Loads the article from ``input_path``, runs its ``"sections"`` through
    ``summarize_article_sections`` using the module-level ``client``, stores
    the returned list back under ``"sections"`` and writes the article to
    ``output_path``. Nothing is written when the loaded document is empty.

    Args:
        input_path: Path of the article JSON to read.
        output_path: Path the summarized article JSON is written to.
        stop_at_title: Section title at which summarization stops.
    """
    document = load_json(input_path)
    if not document:
        # Empty or null document: nothing to summarize, nothing to save.
        return

    document["sections"] = summarize_article_sections(
        sections=document.get("sections", []),
        main_title=document.get("title", ""),
        gemini_client=client,
        stop_phrase=stop_at_title,
    )
    save_json(document, output_path)

In [5]:
# Article JSON that main() reads.
INPUT_JSON_PATH = "processed-json/Money Stuff - A Drug-Trial Stock Sale.json"
# Destination for the summarized article, relative to the working directory.
# NOTE(review): the doubled ".json.json" suffix looks unintentional — confirm
# the intended output filename/location before relying on it.
OUTPUT_JSON_PATH = "Money Stuff - A Drug-Trial Stock Sale.json.json"

# Section title at which summarization stops (compared case-insensitively).
STOP_PHRASE = "Things happen" 

# Run the summarization for the configured article.
main(input_path=INPUT_JSON_PATH, output_path=OUTPUT_JSON_PATH, stop_at_title=STOP_PHRASE)