In [34]:
import os, time, random, requests, re
import pandas as pd
from tqdm.auto import tqdm
from dateutil.parser import parse as dtparse
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

In [31]:
# The FRASER API key is deliberately not stored in the notebook. Export it as
# the FRASER_API_KEY environment variable before starting the kernel.
FRASER_API_KEY = os.environ.get("FRASER_API_KEY", "")
if not FRASER_API_KEY:
    print("FRASER_API_KEY is not set; export it before running the FRASER cells.")
# Kept so the variable exists in the process environment, as in the original cell.
os.environ["FRASER_API_KEY"] = FRASER_API_KEY

In [32]:
FRASER_BASE = "https://fraser.stlouisfed.org/api"

class FraserClient:
    """Small FRASER API client: one shared session, client-side throttling, retries.

    Every request carries the API key in the ``X-API-Key`` header and is spaced
    at least ``60 / rpm_limit`` seconds after the previous one.
    """

    # HTTP statuses treated as transient and retried with exponential backoff.
    RETRY_STATUSES = (429, 500, 502, 503, 504)

    def __init__(self, api_key: str, rpm_limit: int = 30):
        """Create the session and derive the minimum delay between requests.

        Args:
            api_key: Value sent in the ``X-API-Key`` header.
            rpm_limit: Maximum requests per minute; values below 1 are treated as 1.
        """
        self.s = requests.Session()
        self.s.headers.update({"X-API-Key": api_key})
        self.min_delay = 60.0 / max(1, rpm_limit)
        self._last = 0.0

    def _throttle(self):
        """Sleep just long enough to keep ``min_delay`` seconds between requests."""
        # Monotonic clock: a wall-clock adjustment must not produce a negative
        # elapsed time (and therefore an oversized sleep).
        wait = self.min_delay - (time.monotonic() - self._last)
        if wait > 0:
            time.sleep(wait)
        self._last = time.monotonic()

    def get(self, path: str, params=None, retries: int = 6, backoff_base: float = 0.8):
        """GET ``FRASER_BASE/path`` and return the decoded JSON body.

        Args:
            path: API path; leading slashes are ignored.
            params: Optional query parameters.
            retries: Total number of attempts; values below 1 are treated as 1.
            backoff_base: Base (seconds) of the exponential backoff between attempts.

        Returns:
            The parsed JSON response.

        Raises:
            requests.HTTPError: Immediately for non-retryable statuses, or after
                the last attempt for retryable ones.
            requests.Timeout, requests.ConnectionError: After the last attempt.
        """
        url = FRASER_BASE.rstrip("/") + "/" + path.lstrip("/")
        params = params or {}
        # At least one attempt, so there is always either a result or a real error.
        attempts = max(1, retries)
        last_err = None

        for attempt in range(attempts):
            self._throttle()
            try:
                r = self.s.get(url, params=params, timeout=60)
                if r.status_code in self.RETRY_STATUSES:
                    raise requests.HTTPError(f"{r.status_code} for {r.url}", response=r)

                r.raise_for_status()
                return r.json()

            except (requests.Timeout, requests.ConnectionError, requests.HTTPError) as e:
                last_err = e
                if isinstance(e, requests.HTTPError):
                    status = getattr(e.response, "status_code", None)
                    if status not in self.RETRY_STATUSES:
                        raise
                # No point in backing off when there is no further attempt.
                if attempt == attempts - 1:
                    break
                # Exponential backoff with a little jitter.
                time.sleep(backoff_base * (2 ** attempt) + random.random() * 0.25)

        raise last_err

# Shared module-level client used by the search and fetch helpers.
client = FraserClient(api_key=FRASER_API_KEY)

In [33]:
# FRASER title id used as the root of the "series:" / "partOf:" facet searches below.
ROOT_TITLE_ID = 3763

def _extract_list(data: dict) -> list:
    for k in ("items", "records", "results", "data"):
        v = data.get(k)
        if isinstance(v, list):
            return v
    return []

def safe_first(x, default=None):
    """Return ``x[0]`` when ``x`` is a non-empty list; otherwise ``default``."""
    if not isinstance(x, list) or not x:
        return default
    return x[0]

def get_record_id(meta: dict):
    """Return the record's numeric id, or ``None`` when it cannot be determined.

    The first entry of ``recordInfo.recordIdentifier`` is preferred; the
    top-level ``id`` is the fallback.

    Args:
        meta: One record dict from the API.

    Returns:
        The id as ``int``, or ``None`` if it is missing or not integer-like.
    """
    # `recordInfo` may be present but null, so `.get(key, {})` alone is not enough.
    ri = meta.get("recordInfo") or {}
    ids = ri.get("recordIdentifier") if isinstance(ri, dict) else None
    first_id = ids[0] if isinstance(ids, list) and ids else None
    rid = first_id or meta.get("id")
    try:
        return int(rid)
    except (TypeError, ValueError, OverflowError):
        # Missing or non-numeric identifier.
        return None

def get_record_type(meta: dict) -> str:
    """Return the record type in lower case, or ``""`` when unavailable.

    ``recordInfo.recordType`` is preferred; the top-level ``recordType`` is the
    fallback. A null ``recordInfo`` or a non-string type yields ``""`` instead
    of raising.
    """
    ri = meta.get("recordInfo") or {}
    nested = ri.get("recordType") if isinstance(ri, dict) else None
    rtype = nested or meta.get("recordType") or ""
    return rtype.lower() if isinstance(rtype, str) else ""

def search_all_by_facet(facet: str, limit: int = 100, heartbeat_sec: int = 15):
    """Page through ``/search/`` for one facet and return every hit.

    Args:
        facet: Facet expression sent as ``facets[]`` (e.g. ``"series:3763"``).
        limit: Page size requested from the API.
        heartbeat_sec: Minimum number of seconds between plain-text progress prints.

    Returns:
        List of the raw result entries from all pages, in page order.

    Paging stops at the first empty page or the first page shorter than ``limit``.
    Errors raised by ``client.get`` propagate to the caller.
    """
    page = 1
    out = []
    last_beat = time.time()

    # Context manager: the bar is closed even when a request raises mid-way.
    with tqdm(desc=f"facets[]={facet}", unit="page", dynamic_ncols=True) as pbar:
        while True:
            if time.time() - last_beat >= heartbeat_sec:
                print(f"search: facet={facet} page={page} collected={len(out)}")
                last_beat = time.time()

            params = {
                "page": page,
                "limit": limit,
                "format": "json",
                "facets[]": [facet],
                "fields": "recordInfo!titleInfo!originInfo!name!location!identifier"
            }
            data = client.get("/search/", params=params)
            items = _extract_list(data)

            if not items:
                break

            out.extend(items)
            pbar.update(1)
            pbar.set_postfix({"page": page, "collected": len(out)})

            # A short page means this was the last one.
            if len(items) < limit:
                break
            page += 1

    return out

# Collect hits from both facets that reference the root title.
res_series = search_all_by_facet(f"series:{ROOT_TITLE_ID}", limit=100)
res_partof = search_all_by_facet(f"partOf:{ROOT_TITLE_ID}", limit=100)
# merge
all_res = [*res_series, *res_partof]

# Reduce every hit to a unique (kind, id) pair, keeping first-seen order.
records, seen = [], set()
for hit in all_res:
    rid, rt = get_record_id(hit), get_record_type(hit)
    if rid is None or not rt:
        continue
    # "item" is tested before "title", so it wins if both substrings occur.
    kind = next((name for name in ("item", "title") if name in rt), None)
    if kind is None:
        continue
    pair = (kind, rid)
    if pair in seen:
        continue
    seen.add(pair)
    records.append(pair)

print("records:", len(records))

facets[]=series:3763: 0page [00:00, ?page/s]

search: facet=series:3763 page=9 collected=800
search: facet=series:3763 page=17 collected=1600
search: facet=series:3763 page=23 collected=2200
search: facet=series:3763 page=31 collected=3000
search: facet=series:3763 page=39 collected=3800
search: facet=series:3763 page=47 collected=4600


facets[]=partOf:3763: 0page [00:00, ?page/s]

records: 4532


In [35]:
# Sanity check: both counts match when `records` holds no duplicate pairs.
total_count, unique_count = len(records), len(set(records))
print("All:", total_count)
print("Unique:", unique_count)

All: 4532
Unique: 4532


In [None]:
def norm_space(s: str) -> str:
    """Collapse each whitespace run in ``s`` to one space and trim both ends.

    Falsy input (``None``, ``""``) yields ``""``.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def extract_title(meta: dict) -> str:
    """Return the whitespace-normalised title of the first ``titleInfo`` entry.

    Yields ``""`` when ``titleInfo`` is missing, empty, not a list, or when the
    first entry's ``title`` is not a string.
    """
    entries = meta.get("titleInfo") or []
    if not (isinstance(entries, list) and entries):
        return ""
    raw_title = entries[0].get("title")
    return norm_space(raw_title) if isinstance(raw_title, str) else ""

def extract_date(meta: dict):
    """Return the first parseable origin date as ``YYYY-MM-DD``, or ``None``.

    Candidates are read from ``originInfo`` in this order: ``dateIssued``,
    ``dateCreated``, ``dateOther``, ``dateCaptured``. Each value may be a
    string, a ``{"$": text}`` dict, or a list of those.

    Args:
        meta: One record dict from the API.

    Returns:
        ISO date string, or ``None`` when ``originInfo`` is absent, is not a
        dict, or holds no parseable date.
    """
    oi = meta.get("originInfo") or {}
    if not isinstance(oi, dict):
        return None

    candidates = []
    for k in ("dateIssued", "dateCreated", "dateOther", "dateCaptured"):
        v = oi.get(k)
        if v:
            candidates += v if isinstance(v, list) else [v]

    # Fixed default: fields missing from a partial date (e.g. "1915") would
    # otherwise be filled from today's date, making results change day to day.
    default = datetime(1900, 1, 1)
    for c in candidates:
        if isinstance(c, dict) and "$" in c:
            c = c["$"]
        if isinstance(c, str) and c.strip():
            try:
                return dtparse(c, fuzzy=True, default=default).date().isoformat()
            except (ValueError, OverflowError, TypeError):
                # Unparseable text: try the next candidate.
                continue
    return None

def extract_speakers(meta: dict) -> str:
    """Return the record's names as one ``"; "``-joined, sorted, de-duplicated string.

    Names whose ``role`` equals ``"creator"`` are preferred; if there are none,
    every collected name is used. ``namePart`` entries typed ``"date"`` are
    left out of the name text.

    Args:
        meta: One record dict from the API.

    Returns:
        The joined names, or ``""`` when no name could be extracted.
    """
    names = meta.get("name") or []
    out = []
    for n in names if isinstance(names, list) else [names]:
        # Skip malformed entries instead of failing on `.get`.
        if not isinstance(n, dict):
            continue
        role = n.get("role")
        parts = n.get("namePart")
        txt_parts = []
        if isinstance(parts, list):
            for p in parts:
                if isinstance(p, str):
                    txt_parts.append(p)
                elif isinstance(p, dict) and "$" in p and p.get("@type") != "date":
                    txt_parts.append(p["$"])
        elif isinstance(parts, str):
            txt_parts.append(parts)

        name_txt = norm_space(" ".join(txt_parts))
        if name_txt:
            out.append((role, name_txt))

    creators = [nm for role, nm in out if role == "creator"]
    return "; ".join(sorted(set(creators or [nm for _, nm in out])))

def extract_urls(meta: dict):
    """Collect every http(s) URL found in a record's metadata.

    Sources are ``location.url`` plus any string stored under a ``url`` or
    ``href`` key anywhere inside ``identifier``, ``relatedItem`` and
    ``extension``.

    Args:
        meta: One record dict from the API.

    Returns:
        Sorted list of unique URLs that start with ``"http"``, with escaped
        slashes (``\\/``) unescaped.
    """
    urls = []
    loc = meta.get("location") or {}
    loc_urls = loc.get("url") if isinstance(loc, dict) else None
    # A lone string or dict would otherwise be iterated character by character
    # (or key by key) and the URL silently lost.
    if isinstance(loc_urls, (str, dict)):
        loc_urls = [loc_urls]
    if not isinstance(loc_urls, list):
        loc_urls = []
    for u in loc_urls:
        if isinstance(u, str):
            urls.append(u)
        elif isinstance(u, dict) and "$" in u:
            urls.append(u["$"])

    def walk(x):
        """Recursively gather string values stored under ``url`` / ``href`` keys."""
        if isinstance(x, dict):
            for k, v in x.items():
                if k in ("url", "href") and isinstance(v, str):
                    urls.append(v)
                walk(v)
        elif isinstance(x, list):
            for i in x:
                walk(i)

    walk(meta.get("identifier"))
    walk(meta.get("relatedItem"))
    walk(meta.get("extension"))

    urls = [u.replace("\\/", "/") for u in urls if isinstance(u, str) and u.startswith("http")]
    return sorted(set(urls))

def pick_pdf_urls(urls):
    """Return the URLs that end in ``.pdf`` or contain ``.pdf?`` (case-insensitive).

    Input order is preserved.
    """
    selected = []
    for url in urls:
        lowered = url.lower()
        if lowered.endswith(".pdf") or ".pdf?" in lowered:
            selected.append(url)
    return selected

def fetch_meta(rtype: str, rid: int) -> dict:
    """Fetch one record from ``/{rtype}/{rid}`` through the shared ``client``.

    The request asks for JSON and restricts the response to the metadata
    fields named in the ``fields`` parameter.
    """
    query = {
        "fields": "titleInfo!originInfo!name!identifier!relatedItem!location!recordInfo!note!abstract",
        "format": "json",
    }
    endpoint = f"/{rtype}/{rid}"
    return client.get(endpoint, params=query)

rows = []

# tqdm already reports progress, so no separate heartbeat timer is kept here.
pbar = tqdm(records, desc="FRASER Data", unit="doc", dynamic_ncols=True)
for rtype, rid in pbar:
    meta = fetch_meta(rtype, rid)
    urls = extract_urls(meta)

    rows.append({
        "fraser_record_type": rtype,
        "fraser_id": rid,
        "title": extract_title(meta),
        "date": extract_date(meta),
        "speaker": extract_speakers(meta),
        "pdf_urls": " | ".join(pick_pdf_urls(urls)),
        "all_urls": " | ".join(urls),
    })

# Explicit columns keep the sort valid even when no record was fetched.
df = pd.DataFrame(
    rows,
    columns=["fraser_record_type", "fraser_id", "title", "date", "speaker", "pdf_urls", "all_urls"],
).sort_values(["date", "title"], na_position="last").reset_index(drop=True)

In [36]:
def norm_space(s: str) -> str:
    """Squeeze runs of whitespace in ``s`` into single spaces and strip the ends.

    ``None`` and ``""`` both produce ``""``.
    """
    source = s or ""
    squeezed = re.sub(r"\s+", " ", source)
    return squeezed.strip()

def safe_first_list(dct, key):
    """Return the first element of the list stored at ``dct[key]``, else ``""``.

    A falsy ``dct``, a missing key, a non-list value and an empty list all
    yield ``""``.
    """
    value = (dct or {}).get(key)
    if isinstance(value, list) and value:
        return value[0]
    return ""

def get_record_id(rec: dict):
    """Return the record id as ``int``, or ``None`` if it cannot be converted.

    Uses the first ``recordInfo.recordIdentifier`` entry when it is truthy,
    otherwise the top-level ``id``.
    """
    candidate = safe_first_list(rec.get("recordInfo", {}), "recordIdentifier")
    if not candidate:
        candidate = rec.get("id")
    try:
        return int(candidate)
    except Exception:
        return None

def get_record_type(rec: dict):
    """Return the record type in lower case, or ``""`` when unavailable.

    ``recordInfo.recordType`` is preferred; the top-level ``recordType`` is the
    fallback. A null ``recordInfo`` or a non-string type yields ``""`` instead
    of raising.
    """
    info = rec.get("recordInfo") or {}
    nested = info.get("recordType") if isinstance(info, dict) else None
    rtype = nested or rec.get("recordType") or ""
    return rtype.lower() if isinstance(rtype, str) else ""

def extract_title(rec: dict) -> str:
    """Return the normalised ``title`` of the first ``titleInfo`` entry, else ``""``.

    ``""`` is returned when ``titleInfo`` is absent, empty or not a list, and
    when the first entry has no string ``title``.
    """
    info = rec.get("titleInfo") or []
    has_entries = isinstance(info, list) and bool(info)
    if has_entries:
        candidate = info[0].get("title")
        if isinstance(candidate, str):
            return norm_space(candidate)
    return ""

def extract_date(rec: dict):
    """Return the first parseable origin date as ``YYYY-MM-DD``, or ``None``.

    ``originInfo`` keys are tried in this order: ``sortDate``, ``dateIssued``,
    ``dateCreated``, ``dateOther``, ``dateCaptured``. Only non-blank string
    values are parsed; anything else is skipped.

    Args:
        rec: One search-result record dict.

    Returns:
        ISO date string, or ``None`` when nothing parseable is found.
    """
    oi = rec.get("originInfo") or {}
    if not isinstance(oi, dict):
        return None

    # Fixed default: fields missing from a partial date (e.g. "1915") would
    # otherwise be filled from today's date, making results change day to day.
    default = datetime(1900, 1, 1)
    for k in ("sortDate", "dateIssued", "dateCreated", "dateOther", "dateCaptured"):
        v = oi.get(k)
        if not isinstance(v, str) or not v.strip():
            continue
        try:
            return dtparse(v, fuzzy=True, default=default).date().isoformat()
        except (ValueError, OverflowError, TypeError):
            # Unparseable text: fall through to the next key.
            continue
    return None

def extract_speaker(rec: dict) -> str:
    """Return the record's names as one ``"; "``-joined, sorted, de-duplicated string.

    Names whose ``role`` equals ``"creator"`` take precedence; without any, all
    collected names are used. Non-dict name entries are skipped and
    ``namePart`` items typed ``"date"`` are left out of the name text.
    """
    raw_names = rec.get("name") or []
    entries = raw_names if isinstance(raw_names, list) else [raw_names]

    collected = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        role = entry.get("role")
        parts = entry.get("namePart")

        if isinstance(parts, str):
            pieces = [parts]
        elif isinstance(parts, list):
            pieces = []
            for part in parts:
                if isinstance(part, str):
                    pieces.append(part)
                elif isinstance(part, dict) and "$" in part and part.get("@type") != "date":
                    pieces.append(part["$"])
        else:
            pieces = []

        full_name = norm_space(" ".join(pieces))
        if full_name:
            collected.append((role, full_name))

    creator_names = {nm for role, nm in collected if role == "creator"}
    chosen = creator_names if creator_names else {nm for _, nm in collected}
    return "; ".join(sorted(chosen))

def extract_links(rec: dict):
    """Return the first page, PDF and text URL listed under ``location``.

    The result always has the keys ``page_url``, ``pdf_url`` and ``text_url``;
    a missing link is ``""``.
    """
    loc = rec.get("location") or {}
    # (output key, key inside `location`)
    field_map = (("page_url", "url"), ("pdf_url", "pdfUrl"), ("text_url", "textUrl"))
    return {out_key: safe_first_list(loc, src_key) for out_key, src_key in field_map}

# Reuse the search results from the earlier cell; fall back to empty lists when
# that cell has not been run in this kernel.
all_res = (res_series if "res_series" in globals() else []) + (res_partof if "res_partof" in globals() else [])

# One row per unique "item" record, built from the search hits alone.
rows = []
seen = set()
for rec in all_res:
    rt = get_record_type(rec)
    rid = get_record_id(rec)
    if rt != "item" or rid is None:
        continue
    if rid in seen:
        continue
    seen.add(rid)
    links = extract_links(rec)
    rows.append({
        "fraser_id": rid,
        "title": extract_title(rec),
        "date": extract_date(rec),
        "speaker": extract_speaker(rec),
        "page_url": links["page_url"],
        "pdf_url": links["pdf_url"],
        "text_url": links["text_url"],
    })

# Explicit columns keep the sort valid when `rows` is empty (otherwise the
# frame has no "date"/"title" columns and sort_values raises KeyError).
df = pd.DataFrame(
    rows,
    columns=["fraser_id", "title", "date", "speaker", "page_url", "pdf_url", "text_url"],
).sort_values(["date", "title"], na_position="last").reset_index(drop=True)
df.head(10)

Unnamed: 0,fraser_id,title,date,speaker,page_url,pdf_url,text_url
0,475382,Banking and Currency Reform,1913-09-13,,https://fraser.stlouisfed.org/title/statements...,https://fraser.stlouisfed.org/docs/historical/...,https://fraser.stlouisfed.org/files/text/histo...
1,475383,Conference Report on Currency Bill,1913-12-22,,https://fraser.stlouisfed.org/title/statements...,https://fraser.stlouisfed.org/docs/historical/...,https://fraser.stlouisfed.org/files/text/histo...
2,475384,Location of Federal Reserve Banks,1914-04-08,,https://fraser.stlouisfed.org/title/statements...,https://fraser.stlouisfed.org/docs/historical/...,https://fraser.stlouisfed.org/files/text/histo...
3,475517,Remarks before the Harvard Club of Chicago,1915-01-22,,https://fraser.stlouisfed.org/title/statements...,https://fraser.stlouisfed.org/docs/historical/...,https://fraser.stlouisfed.org/files/text/histo...
4,473743,The Federal Reserve Act and the Place it is to...,1915-02-16,,https://fraser.stlouisfed.org/title/statements...,https://fraser.stlouisfed.org/docs/historical/...,https://fraser.stlouisfed.org/files/text/histo...
5,475805,The Federal Reserve Act and the Place It Is to...,1915-03-16,,https://fraser.stlouisfed.org/title/statements...,https://fraser.stlouisfed.org/docs/historical/...,https://fraser.stlouisfed.org/files/text/histo...
6,475444,Address at a Meeting of the American Bankers A...,1915-09-08,,https://fraser.stlouisfed.org/title/statements...,https://fraser.stlouisfed.org/docs/historical/...,https://fraser.stlouisfed.org/files/text/histo...
7,9170,Address before the Twin City Bankers' Club of ...,1915-10-22,,https://fraser.stlouisfed.org/title/statements...,https://fraser.stlouisfed.org/docs/historical/...,https://fraser.stlouisfed.org/files/text/histo...
8,475385,Financial Freedom Under Woodrow Wilson,1916-04-13,,https://fraser.stlouisfed.org/title/statements...,https://fraser.stlouisfed.org/docs/historical/...,https://fraser.stlouisfed.org/files/text/histo...
9,9177,How the Federal Reserve System is Meeting the ...,1916-05-24,,https://fraser.stlouisfed.org/title/statements...,https://fraser.stlouisfed.org/docs/historical/...,https://fraser.stlouisfed.org/files/text/histo...


In [37]:
# Persist the FRASER speech table built above to CSV (row index omitted).
df.to_csv("fraser_speeches_3763.csv", index=False)

In [38]:
# Settings for the federalreserve.gov speech-index scraper.
base = "https://www.federalreserve.gov"
start = 1996
end = datetime.now().year
delay = 0.25  # pause after each request so the site is not overloaded

# One shared session so every request carries the same User-Agent.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (compatible; frb-speech-scraper/1.0; +https://example.com)"

In [40]:
# Alternation of English month names, spliced into the date pattern below.
months = r"(January|February|March|April|May|June|July|August|September|October|November|December)"
# Matches dates written like "June 13, 1996" (case-insensitive).
date = re.compile(rf"\b{months}\s+\d{{1,2}},\s+\d{{4}}\b", re.I)

def fetch_html(url: str) -> str:
    """Download ``url`` with the shared session and return the response text.

    Raises the error from ``raise_for_status()`` on an HTTP error status.
    After a successful request the function sleeps ``delay`` seconds before
    returning.
    """
    response = session.get(url, timeout=60)
    response.raise_for_status()
    # Pause after each successful request.
    time.sleep(delay)
    return response.text

def parse_year_page(year: int) -> list[dict]:
    """Scrape one yearly speech index page into a list of row dicts.

    The page ``{base}/newsevents/speech/{year}speech.htm`` is fetched with
    ``fetch_html`` and parsed with BeautifulSoup (``lxml`` parser). Every
    ``<li>`` that contains a link with non-empty text and a date matching the
    module-level ``date`` pattern produces one row.

    Args:
        year: Year used to build the index page URL.

    Returns:
        List of dicts with keys ``year``, ``date`` (ISO string, or ``""`` if
        the matched date could not be parsed), ``date_raw``, ``title``,
        ``speaker``, ``at`` and ``speech_url`` (absolute URL).

    Raises:
        Whatever ``fetch_html`` raises, e.g. an HTTP error for a missing page.
    """
    url = f"{base}/newsevents/speech/{year}speech.htm"
    html = fetch_html(url)
    soup = BeautifulSoup(html, "lxml")
    rows = []
    for li in soup.find_all("li"):
        # Only list items that carry a link can be speech entries.
        a = li.find("a", href=True)
        if not a:
            continue
        href = a["href"].strip()
        # Skip links to "...speech.htm" pages and to speech directories
        # (presumably navigation between index pages - TODO confirm).
        if "speech.htm" in href and href.endswith("speech.htm"):
            continue
        if "newsevents/speech/" in href and href.endswith("/"):
            continue

        title = a.get_text(" ", strip=True)
        if not title:
            continue

        # Full text of the list item, one line per text node.
        block_text = li.get_text("\n", strip=True)
        m = date.search(block_text)
        if not m:
            # Items without a "Month D, YYYY" date are not treated as speeches.
            continue
        date_str = m.group(0)

        try:
            date_iso = dtparse(date_str, fuzzy=True).date().isoformat()
        except Exception:
            # Keep the row; `date_raw` still holds the matched text.
            date_iso = ""

        lines = [x.strip() for x in block_text.split("\n") if x.strip()]

        speaker = ""
        at_line = ""
        # Find the line that contains the link text; the speaker and venue are
        # read from the lines that follow it.
        try:
            title_idx = next(i for i, ln in enumerate(lines) if title in ln)
        except StopIteration:
            title_idx = None

        if title_idx is not None:
            if title_idx + 1 < len(lines):
                speaker = lines[title_idx + 1]
            if title_idx + 2 < len(lines):
                # The venue line is kept only if it starts with "at ", "before " or "in ".
                at_line = lines[title_idx + 2] if lines[title_idx + 2].lower().startswith(("at ", "before ", "in ")) else ""
        else:
            # Positional fallback when the title is not found on a single line.
            # NOTE(review): no prefix filter is applied to `at_line` here, unlike above.
            speaker = lines[2] if len(lines) > 2 else ""
            at_line = lines[3] if len(lines) > 3 else ""

        # Resolve relative links against the index page URL.
        speech_url = urljoin(url, href)

        rows.append({
            "year": year,
            "date": date_iso,
            "date_raw": date_str,
            "title": title,
            "speaker": speaker,
            "at": at_line,
            "speech_url": speech_url,
        })

    return rows

In [41]:
def scrape_years(start_year=start, end_year=end, heartbeat_sec=15):
    """Scrape every yearly index page in ``[start_year, end_year]`` into one DataFrame.

    Years whose page answers 404 or 410 are skipped; any other HTTP error is
    re-raised. A plain-text progress line is printed at most every
    ``heartbeat_sec`` seconds, and only after a year that was scraped
    successfully.

    Returns:
        DataFrame of all rows, sorted by ``date`` then ``title`` with a fresh
        index; an empty DataFrame when nothing was collected.
    """
    years = list(range(start_year, end_year + 1))
    all_rows = []
    last_beat = time.time()

    progress = tqdm(years, desc="years", unit="year", dynamic_ncols=True)
    for y in progress:
        try:
            all_rows.extend(parse_year_page(y))
        except requests.HTTPError as err:
            status = getattr(err.response, "status_code", None)
            if status not in (404, 410):
                raise
            # Missing index page for this year: move on.
            continue

        if time.time() - last_beat >= heartbeat_sec:
            print(f"years_done={y-start_year+1}/{len(years)} speeches={len(all_rows)}")
            last_beat = time.time()

    result = pd.DataFrame(all_rows)
    if result.empty:
        return result
    return result.sort_values(["date", "title"], na_position="last").reset_index(drop=True)

# Build the speech index using scrape_years' default year range
# (`start` .. `end` as they were when the function was defined).
df_index = scrape_years()
print("total:", len(df_index))
df_index.head(10)

years:   0%|                                           | 0/30 [00:00<?, ?year/s]

years_done=22/30 speeches=645
total: 645


Unnamed: 0,year,date,date_raw,title,speaker,at,speech_url
0,1996,1996-06-13,"June 13, 1996",Bank supervision in a world economy,Chairman Alan Greenspan,At the International Conference of Banking Sup...,https://www.federalreserve.gov/boarddocs/speec...
1,1996,1996-06-18,"June 18, 1996",Developments in electronic money and banking,"Governor Edward W. Kelley, Jr.","At the CyberPayments '96 Conference, Dallas, T...",https://www.federalreserve.gov/boarddocs/speec...
2,1996,1996-09-08,"September 8, 1996",Monetary policy objectives and strategy,Governor Laurence H. Meyer,At the National Association of Business Econom...,https://www.federalreserve.gov/boarddocs/speec...
3,1996,1996-09-19,"September 19, 1996",Regulation and electronic payment systems,Chairman Alan Greenspan,At the U.S. Treasury Conference on Electronic ...,https://www.federalreserve.gov/boarddocs/speec...
4,1996,1996-10-02,"October 2, 1996",Small business is big business,Governor Lawrence B. Lindsey,"At the Small Business Development Conference, ...",https://www.federalreserve.gov/boarddocs/speec...
5,1996,1996-10-05,"October 5, 1996","Bank supervision, regulation, and risk",Chairman Alan Greenspan,At the Annual Convention of the American Banke...,https://www.federalreserve.gov/boarddocs/speec...
6,1996,1996-10-07,"October 7, 1996",U.S. Treasury securities market: Lessons from ...,Chairman Alan Greenspan,At the Annual Public Service Awards Dinner of ...,https://www.federalreserve.gov/boarddocs/speec...
7,1996,1996-10-09,"October 9, 1996",Here we go again?,Governor Lawrence B. Lindsey,At the Community Development Lending Conferenc...,https://www.federalreserve.gov/boarddocs/speec...
8,1996,1996-10-11,"October 11, 1996",How to grow faster,Governor Lawrence B. Lindsey,"At the Atlantic Economic Society, Washington, ...",https://www.federalreserve.gov/boarddocs/speec...
9,1996,1996-10-16,"October 16, 1996",Technological advances and productivity,Chairman Alan Greenspan,At the 80th Anniversary Awards Dinner of The C...,https://www.federalreserve.gov/boarddocs/speec...


In [42]:
# Persist the scraped speech index to CSV (row index omitted).
df_index.to_csv("frb_speeches_index_1996.csv", index=False)