# Fed Press Releases

In [None]:
import requests
import feedparser
import hashlib
from datetime import datetime
from typing import Optional, Tuple


# ---------------------------------------------------------------------
# Default HTTP headers
# ---------------------------------------------------------------------
# These headers are intentionally browser-like. Many public-sector sites
# (including the Fed and BLS) sit behind CDNs/WAFs that block requests
# with "bot-looking" signatures (e.g., default python-requests UA).
#
# Using conservative, common headers reduces the chance of 403s while
# remaining compliant with usage policies.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "application/xml, application/rss+xml, application/atom+xml;q=0.9, "
        "*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.federalreserve.gov/feeds/feeds.htm",
}


def fetch_feed(
    url: str,
    *,
    etag: Optional[str] = None,
    last_modified: Optional[str] = None,
    timeout: int = 20,
) -> Tuple[Optional[bytes], Optional[str], Optional[str]]:
    """
    Fetch an RSS/Atom feed using conditional HTTP requests.

    This function performs a GET request with optional ETag and
    Last-Modified headers to avoid re-downloading unchanged feeds.

    Parameters
    ----------
    url : str
        The feed URL to fetch.
    etag : str | None, keyword-only
        Previously-seen ETag value. If provided, sent as If-None-Match.
    last_modified : str | None, keyword-only
        Previously-seen Last-Modified value. If provided, sent as
        If-Modified-Since.
    timeout : int
        Timeout in seconds for the HTTP request.

    Returns
    -------
    content : bytes | None
        Raw feed bytes if the feed has changed; None if HTTP 304
        (Not Modified) was returned.
    new_etag : str | None
        The ETag returned by the server, if any.
    new_last_modified : str | None
        The Last-Modified header returned by the server, if any.

    Notes
    -----
    - HTTP 304 is treated as a first-class outcome, not an error.
    - Callers should persist ETag / Last-Modified values per feed URL.
    """
    headers = dict(DEFAULT_HEADERS)

    # Add conditional headers if we have prior state
    if etag:
        headers["If-None-Match"] = etag
    if last_modified:
        headers["If-Modified-Since"] = last_modified

    response = requests.get(url, headers=headers, timeout=timeout)

    # 304 means "unchanged"; no body will be present
    if response.status_code == 304:
        return None, etag, last_modified

    # Raise for all other non-2xx responses
    response.raise_for_status()

    return (
        response.content,
        response.headers.get("ETag"),
        response.headers.get("Last-Modified"),
    )


def parse_feed_bytes(content: bytes) -> feedparser.FeedParserDict:
    """
    Parse raw RSS/Atom feed bytes into a feedparser object.

    Parameters
    ----------
    content : bytes
        Raw bytes returned from an HTTP request.

    Returns
    -------
    feed : feedparser.FeedParserDict
        Parsed feed object containing metadata and entries.

    Notes
    -----
    - feed.bozo == True indicates malformed XML or non-fatal parse issues.
    - In practice, many real-world feeds are slightly malformed; logging
      bozo_exception is usually sufficient.
    """
    feed = feedparser.parse(content)

    if getattr(feed, "bozo", False):
        # feed.bozo_exception can be logged for observability,
        # but is not necessarily fatal.
        pass

    return feed


def entry_uid(entry) -> str:
    """
    Generate a stable unique identifier for a feed entry.

    Parameters
    ----------
    entry : dict-like
        A single feed entry from feedparser.

    Returns
    -------
    uid : str
        A stable unique identifier suitable for deduplication.

    Strategy
    --------
    1. Prefer entry['id'] if present (most reliable).
    2. Otherwise, hash the canonical link.

    This approach is robust to feeds that:
    - Rewrite recent entries
    - Omit explicit GUIDs
    - Change ordering across polls
    """
    if "id" in entry and entry["id"]:
        return str(entry["id"])

    link = entry.get("link", "")
    return hashlib.sha256(link.encode("utf-8")).hexdigest()


def normalize_entry(entry: dict) -> dict:
    """
    Normalize a raw feed entry into a stable, storage-friendly schema.

    Parameters
    ----------
    entry : dict
        A feedparser entry dictionary.

    Returns
    -------
    normalized : dict
        A normalized representation of the entry with consistent keys.

    Fields
    ------
    uid : str
        Stable unique identifier (see entry_uid).
    title : str | None
        Entry title.
    link : str | None
        Canonical URL for the press release.
    published : str | None
        Publication timestamp, if available.
    summary : str | None
        Short summary or description (often HTML).

    Notes
    -----
    - We intentionally do not parse dates here; downstream code can
      normalize timestamps as needed.
    - This function is designed to be append-only friendly (e.g. JSONL).
    """
    return {
        "uid": entry_uid(entry),
        "title": entry.get("title"),
        "link": entry.get("link"),
        "published": entry.get("published") or entry.get("updated"),
        "summary": entry.get("summary"),
    }


In [5]:
feed = fetch_feed("https://www.federalreserve.gov/feeds/press_all.xml")

In [6]:
feed

(b'\xef\xbb\xbf<?xml version="1.0" encoding="utf-8" ?>\r\n<rss version="2.0">\r\n    <channel>\r\n        <title>FRB: Press Release - All Releases</title>\r\n        <link><![CDATA[https://www.federalreserve.gov/feeds/feeds.htm]]></link>\r\n        <description><![CDATA[All recent press releases from the Federal Reserve Board]]></description>\r\n        <language>en</language>\r\n        <item>\r\n            <title>Federal Reserve Board announces the designation of the Chairs and Deputy Chairs of the 12 Federal Reserve Banks for 2026</title>\r\n            <link><![CDATA[https://www.federalreserve.gov/newsevents/pressreleases/other20260109a.htm]]></link>\r\n            <guid><![CDATA[https://www.federalreserve.gov/newsevents/pressreleases/other20260109a.htm]]></guid>\r\n            <description><![CDATA[Federal Reserve Board announces the designation of the Chairs and Deputy Chairs of the 12 Federal Reserve Banks for 2026]]></description>\r\n            <category>Other Announcements</

In [7]:
parsed_feed = parse_feed_bytes(feed)
parsed_feed


AttributeError: 'tuple' object has no attribute 'encode'

In [8]:
FEED_URL = "https://www.federalreserve.gov/feeds/press_all.xml"
import json
from pathlib import Path

STATE_PATH = Path("state_fed_press.json")

def load_state():
    if not STATE_PATH.exists():
        return {
            "etag": None,
            "last_modified": None,
            "seen_uids": set(),
        }

    raw = json.loads(STATE_PATH.read_text())
    raw["seen_uids"] = set(raw.get("seen_uids", []))
    return raw

def save_state(state):
    to_save = dict(state)
    to_save["seen_uids"] = list(state["seen_uids"])
    STATE_PATH.write_text(json.dumps(to_save, indent=2))

DATA_PATH = Path("fed_press_releases.jsonl")

def append_entries(entries):
    with DATA_PATH.open("a", encoding="utf-8") as f:
        for e in entries:
            f.write(json.dumps(e, ensure_ascii=False) + "\n")


def run_once():
    state = load_state()

    content, new_etag, new_last_modified = fetch_feed(
        FEED_URL,
        etag=state["etag"],
        last_modified=state["last_modified"],
    )

    # Nothing changed
    if content is None:
        print("Feed unchanged")
        return

    feed = parse_feed_bytes(content)

    new_entries = []
    for entry in feed.entries:
        normalized = normalize_entry(entry)

        uid = normalized["uid"]
        if uid in state["seen_uids"]:
            continue

        state["seen_uids"].add(uid)
        new_entries.append(normalized)

    if new_entries:
        append_entries(new_entries)
        print(f"Ingested {len(new_entries)} new entries")
    else:
        print("No new entries after deduplication")

    state["etag"] = new_etag
    state["last_modified"] = new_last_modified
    save_state(state)


In [9]:
run_once()

Ingested 20 new entries


In [10]:
import json
import time
from pathlib import Path
from urllib.parse import urlparse
import requests


def safe_filename_from_url(url: str, *, fallback: str) -> str:
    """
    Create a deterministic, filesystem-safe filename from a URL path.

    Example:
        https://www.federalreserve.gov/newsevents/pressreleases/monetary20260108a.htm
        -> monetary20260108a.htm

    If the URL path doesn't end with something file-like, we use `fallback`.
    """
    path = urlparse(url).path
    name = Path(path).name
    return name if name else fallback


def download_press_release_htmls(
    jsonl_path: str | Path,
    out_dir: str | Path = "fed_press_html",
    *,
    timeout: int = 20,
    sleep_s: float = 0.6,
    overwrite: bool = False,
) -> None:
    """
    Read a JSONL file of normalized feed entries (one JSON object per line),
    download each entry's `link` as HTML, and save it to disk.

    Parameters
    ----------
    jsonl_path:
        Path to your `fed_press_releases.jsonl`.
    out_dir:
        Directory where HTML files will be saved.
    timeout:
        Per-request timeout in seconds.
    sleep_s:
        Politeness delay between requests to avoid hammering the site.
    overwrite:
        If False, skip downloads when the target file already exists.
    """
    jsonl_path = Path(jsonl_path)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    session = requests.Session()
    session.headers.update(
        {
            # Browser-like headers reduce the chance of bot/WAF false positives
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Referer": "https://www.federalreserve.gov/",
        }
    )

    n_total = 0
    n_downloaded = 0
    n_skipped = 0
    n_failed = 0

    with jsonl_path.open("r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            n_total += 1
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"[line {line_no}] JSON decode error: {e}")
                n_failed += 1
                continue

            url = obj.get("link")
            uid = obj.get("uid") or f"line{line_no}"
            if not url:
                print(f"[line {line_no}] Missing 'link'; skipping")
                n_failed += 1
                continue

            filename = safe_filename_from_url(url, fallback=f"{uid}.html")
            out_path = out_dir / filename

            if out_path.exists() and not overwrite:
                n_skipped += 1
                continue

            try:
                resp = session.get(url, timeout=timeout)
                resp.raise_for_status()

                # Save as bytes to preserve original encoding faithfully.
                out_path.write_bytes(resp.content)

                n_downloaded += 1
                if n_downloaded % 25 == 0:
                    print(f"Downloaded {n_downloaded} / {n_total}...")

            except requests.RequestException as e:
                print(f"[line {line_no}] Failed {url}: {e}")
                n_failed += 1

            time.sleep(sleep_s)

    print(
        f"Done. total={n_total}, downloaded={n_downloaded}, skipped={n_skipped}, failed={n_failed}"
    )

In [11]:
download_press_release_htmls(
    jsonl_path="fed_press_releases.jsonl",  # or your exact filename
    out_dir="fed_press_html",
    sleep_s=0.6,
    overwrite=False,
)


Done. total=20, downloaded=20, skipped=0, failed=0


In [12]:
# clean press releases html
import os
breaker = "</noscript>"
# delete all text before breaker
for file in os.listdir("fed_press_html"):
    with open(os.path.join("fed_press_html", file), "r") as f:
        text = f.read()
    text = text.split(breaker)[1]
    with open(os.path.join("fed_press_html", file), "w") as f:
        f.write(text)
