Next Step:
- Download the data from [here](https://www.govinfo.gov/bulkdata/CFR)

- ⬇️ Downloads XML data files per agency reference (full titles or chapters), and stores raw regulation text

In [27]:
import pandas as pd
import os

# Load the flattened agency list generated in notebook 01
csv_path = "../archive/flattened_agencies_list_2025-04-06.csv"

if os.path.exists(csv_path):
    agencies_df = pd.read_csv(csv_path)
    print(f"✅ Loaded {len(agencies_df)} rows from {csv_path}")
else:
    raise FileNotFoundError(f"❌ Could not find {csv_path}")


✅ Loaded 487 rows from ../archive/flattened_agencies_list_2025-04-06.csv


In [31]:
def build_url(row, date="2025-03-27"):
    title = int(row["title"])
    chapter = row.get("chapter")
    if pd.notnull(chapter):
        print(f"HERE1 ---- https://www.ecfr.gov/api/versioner/v1/full/{date}/title-{title}.xml?chapter={chapter}")
        return f"https://www.ecfr.gov/api/versioner/v1/full/{date}/title-{title}.xml?chapter={chapter}"
    else:
        print(f"HERE2 ---{title}---{chapter}--- https://www.ecfr.gov/api/versioner/v1/full/{date}/title-{title}.xml")
        return f"https://www.ecfr.gov/api/versioner/v1/full/{date}/title-{title}.xml"

agencies_df["download_url"] = agencies_df.apply(build_url, axis=1)

#agencies_df[['name', 'title', 'chapter', 'download_url']].head(len(agencies_df))

HERE1 ---- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-1.xml?chapter=III
HERE1 ---- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-36.xml?chapter=VIII
HERE1 ---- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-5.xml?chapter=LXXXIII
HERE1 ---- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-22.xml?chapter=XV
HERE1 ---- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-48.xml?chapter=57
HERE1 ---- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-22.xml?chapter=V
HERE1 ---- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-48.xml?chapter=19
HERE1 ---- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-2.xml?chapter=IV
HERE1 ---- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-5.xml?chapter=LXXIII
HERE1 ---- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-7.xml?chapter=XVI
HERE1 ---- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-7.xml?chapter=XX
HERE1 ---- http

In [38]:
# After agencies_df has been built and cleaned
def build_url(row, date="2025-03-27"):
    title = row.get("title")
    if pd.isnull(title):
        return None

    title = int(title)
    base = f"https://www.ecfr.gov/api/versioner/v1/full/{date}/title-{title}.xml"

    if pd.notnull(row.get("part")):
        part = int(row["part"])
        print(f"HERE5 -------------------- {base}?part={part}")
        return f"{base}?part={part}"
    elif pd.notnull(row.get("subchapter")):
        subchapter = str(row["subchapter"]).strip()
        print(f"HERE4 --------------- {base}?subchapter={subchapter}")
        return f"{base}?subchapter={subchapter}"
    elif pd.notnull(row.get("chapter")):
        chapter = str(row["chapter"]).strip()
        print(f"HERE3 ---------- {base}?chapter={chapter}")
        return f"{base}?chapter={chapter}"
    elif pd.notnull(row.get("subtitle")):
        subtitle = str(row["subtitle"]).strip()
        print(f"HERE2 ------ {base}?subtitle={subtitle}")
        return f"{base}?subtitle={subtitle}"
    else:
        print(f"HERE1 -- {base}")
        return base

    # Optional: check if it exists
    try:
        r = requests.head(url, timeout=10)
        if r.status_code == 200:
            return url
        else:
            print(f"⚠️ Invalid URL: {url} → Status {r.status_code}")
            return None
    except Exception as e:
        print(f"❌ Error checking URL: {url} → {e}")
        return None




# Apply to each row
agencies_df["download_url"] = agencies_df.apply(build_url, axis=1)

agencies_df[['name', 'download_url']].head()


HERE3 ---------- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-1.xml?chapter=III
HERE3 ---------- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-36.xml?chapter=VIII
HERE3 ---------- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-5.xml?chapter=LXXXIII
HERE3 ---------- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-22.xml?chapter=XV
HERE3 ---------- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-48.xml?chapter=57
HERE3 ---------- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-22.xml?chapter=V
HERE3 ---------- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-48.xml?chapter=19
HERE3 ---------- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-2.xml?chapter=IV
HERE3 ---------- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-5.xml?chapter=LXXIII
HERE3 ---------- https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-7.xml?chapter=XVI
HERE3 ---------- https://www.ecfr.gov/api/v

Unnamed: 0,name,download_url
0,Administrative Conference of the United States,https://www.ecfr.gov/api/versioner/v1/full/202...
1,Advisory Council on Historic Preservation,https://www.ecfr.gov/api/versioner/v1/full/202...
2,Special Inspector General for Afghanistan Reco...,https://www.ecfr.gov/api/versioner/v1/full/202...
3,African Development Foundation,https://www.ecfr.gov/api/versioner/v1/full/202...
4,African Development Foundation,https://www.ecfr.gov/api/versioner/v1/full/202...


In [44]:
def is_url_valid(url, timeout=(5, 60)):
    start = time.time()
    try:
        response = requests.head(url, timeout=timeout)
        elapsed = time.time() - start
        if response.status_code == 200:
            print(f"✅ {url} (200) - {elapsed:.2f}s")
            return True
        else:
            print(f"⚠️ {url} ({response.status_code}) - {elapsed:.2f}s")
            return False
    except requests.RequestException as e:
        elapsed = time.time() - start
        print(f"❌ {url} (Error) - {elapsed:.2f}s\n   → {e}")
        return False


In [45]:
agencies_df["download_url"].apply(is_url_valid)

✅ https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-1.xml?chapter=III (200) - 0.16s
✅ https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-36.xml?chapter=VIII (200) - 0.27s
✅ https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-5.xml?chapter=LXXXIII (200) - 5.03s
✅ https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-22.xml?chapter=XV (200) - 4.97s
✅ https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-48.xml?chapter=57 (200) - 5.26s
✅ https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-22.xml?chapter=V (200) - 0.22s
✅ https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-48.xml?chapter=19 (200) - 5.30s
✅ https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-2.xml?chapter=IV (200) - 4.65s
✅ https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-5.xml?chapter=LXXIII (200) - 0.31s
✅ https://www.ecfr.gov/api/versioner/v1/full/2025-03-27/title-7.xml?chapter=XVI (200) - 5.54s
✅ https://www.ecfr.gov/api/versioner/v1/full/2025-03-

0      True
1      True
2      True
3      True
4      True
       ... 
482    True
483    True
484    True
485    True
486    True
Name: download_url, Length: 487, dtype: bool