In [None]:
import uuid
from urllib.parse import urlencode

import pandas as pd
import requests

# Scheme + host used as the prefix for the request URLs built in this notebook.
BASE_URL = 'https://www.ecfr.gov'
# Path (relative to BASE_URL) of the JSON endpoint whose "titles" list is iterated below.
TITLES_PATH = '/api/versioner/v1/titles.json'
# Date segment interpolated into the structure / full-content URL paths.
REQUEST_DATE = '2025-02-10'

def gen_structure_url(title):
    """Return the versioner "structure" JSON URL for ``title`` at ``REQUEST_DATE``.

    ``title`` is interpolated as-is into the ``title-<title>.json`` path
    segment, so anything with a sensible ``str()`` form is accepted.
    """
    path = f'/api/versioner/v1/structure/{REQUEST_DATE}/title-{title}.json'
    return f'{BASE_URL}{path}'

def get_structure(title, timeout=120):
    """Download and decode the structure JSON for one title.

    Args:
        title: Title number; forwarded to ``gen_structure_url``.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block the notebook
            indefinitely. Pass ``None`` to wait forever (the old behaviour).

    Returns:
        The decoded JSON body, i.e. whatever ``response.json()`` yields.

    Raises:
        RuntimeError: If the response status code is not 200.
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    response = requests.get(gen_structure_url(title), timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f'Failed to get structure for title {title}: {response.status_code}, body: {response.text}')

    return response.json()

def gen_rows(structure):
    """Flatten a nested structure tree into a pre-order list of row dicts.

    Each row is a copy of its node without the ``"children"`` key, plus:

    * ``"identifier"`` - kept if present, otherwise a random 16-hex-char id;
    * ``"parent_id"`` - the chain of ancestors encoded as
      ``<identifier>.<type>_`` repeated once per ancestor (``''`` for the root).

    The input nodes themselves are not modified.
    """
    flattened = []
    # Explicit stack of (node, key of the parent chain without trailing '_').
    pending = [(structure, None)]

    while pending:
        node, parent_id = pending.pop()

        record = {key: value for key, value in node.items() if key != "children"}
        prefix = f'{parent_id}_' if parent_id else ''

        if "identifier" not in record:
            record["identifier"] = uuid.uuid4().hex[:16]
        node_id = record["identifier"]

        record["parent_id"] = prefix
        flattened.append(record)

        children = list(node.get("children", []))
        if children:
            # "type" is only needed (and only looked up) when there are children.
            child_key = f'{prefix}{node_id}.{record["type"]}'
            # Push in reverse so the first child is popped - and emitted - first.
            pending.extend((child, child_key) for child in reversed(children))

    return flattened


In [None]:
# Fetch the list of titles, then download and flatten every title's structure tree.
url = BASE_URL + TITLES_PATH
response = requests.get(url)

# Any status other than 200 aborts the cell before the body is parsed.
if response.status_code != 200:
    raise RuntimeError(f'Failed to get titles: {response.status_code}')

parsed = response.json()

# Accumulates the flattened rows of all titles in one list.
rows = []

for title_obj in parsed['titles']:
    title_id = title_obj['number']
    if title_id == 35:
        continue  # skip title 35 because it is empty
    print(f'Parsing title {title_id}')
    # One HTTP request per title; get_structure raises RuntimeError on a non-200 reply.
    structure = get_structure(title_id)
    local_rows = gen_rows(structure)
    rows.extend(local_rows)

df_structure = pd.DataFrame(rows)

# Move "identifier" from a column into the index; code that later iterates with
# iterrows() and discards the index label will not see the identifier.
# NOTE(review): identifiers are presumably not unique across titles - confirm that
# a non-unique index is acceptable here.
df_structure = df_structure.set_index("identifier")
df_structure.head(10)

In [None]:

def parent_key_to_ids(parent_key):
    """Decode a ``<value>.<key>_<value>.<key>_...`` string into ``{key: value}``.

    The string is cut on ``'_'`` and each piece on ``'.'``; only pieces that
    yield exactly two parts contribute an entry, everything else (including
    the empty piece left by a trailing ``'_'``) is ignored. When a key occurs
    more than once, the last value wins.
    """
    pieces = (segment.split('.') for segment in parent_key.split('_'))
    return {key: value for value, key in (piece for piece in pieces if len(piece) == 2)}


def gen_section_rows(df_structure):
    """Build a DataFrame with one row per ``type == 'section'`` entry.

    Each output row holds the section's own columns, one column per ancestor
    level decoded from its ``parent_id`` (via ``parent_key_to_ids``), and a
    fresh random 16-hex-char ``id``.

    Args:
        df_structure: Frame with at least ``type`` and ``parent_id`` columns.
            The index label of each row is discarded, so values stored only
            in the index are not carried over.
            NOTE(review): if the section identifier lives in the index, it is
            lost here - confirm that is intended.

    Returns:
        DataFrame in which ``chapter``, ``subchapter``, ``part``, ``subpart``,
        ``title`` and ``subtitle`` always exist and have ``string`` dtype.
        Levels that no section has are filled with ``<NA>`` instead of making
        ``astype`` raise ``KeyError`` (which also happened when there were no
        sections at all).
    """
    level_columns = ['chapter', 'subchapter', 'part', 'subpart', 'title', 'subtitle']

    section_rows = []
    for _, row in df_structure.iterrows():
        if row['type'] != 'section':
            continue

        section_row = row.to_dict()
        # parent_id looks like '1.title_I.chapter_1.part_'; expand it into
        # title / chapter / part / ... keys on the row.
        section_row.update(parent_key_to_ids(row['parent_id']))
        section_row['id'] = uuid.uuid4().hex[:16]
        section_rows.append(section_row)

        if len(section_rows) % 1000 == 0:
            print(f'Processed {len(section_rows)} section rows of {len(df_structure)}')

    sections = pd.DataFrame(section_rows)
    # Guarantee every hierarchy column exists before casting; a level absent
    # from all parent_ids would otherwise be missing from the frame.
    for column in level_columns:
        if column not in sections.columns:
            sections[column] = pd.NA

    return sections.astype({column: 'string' for column in level_columns})

# Flatten the structure frame to one row per section and persist the full-width
# result first, so the CSV still contains the columns removed just below.
df_sections = gen_section_rows(df_structure)
df_sections.to_csv('data/ecfr_sections.csv', index=False)

# Trim these four columns from the in-memory frame only.
df_sections = df_sections.drop(columns=['generated_id', 'volumes', 'descendant_range', 'reserved'])

# TODO: make sure getting all of section rows

df_sections

In [None]:
# get urls for each section

def gen_content_url(title, subtitle, chapter, subchapter, part, subpart):
    """Build the versioner "full" XML URL narrowed to one part of a title.

    ``title`` and ``part`` are always sent as query parameters; ``chapter``,
    ``subtitle``, ``subchapter`` and ``subpart`` are added (in that order)
    only when they are truthy, so ``''`` or ``None`` leaves them out.

    The query string is produced with ``urllib.parse.urlencode`` so that
    identifiers containing spaces, ``&``, ``#`` or non-ASCII characters are
    percent-encoded instead of corrupting the URL. Plain alphanumeric values
    come out exactly as before.

    Returns:
        The complete URL as a string.
    """
    url = f'{BASE_URL}/api/versioner/v1/full/{REQUEST_DATE}/title-{title}.xml'

    params = {"title": title, "part": part}
    optional = {
        "chapter": chapter,
        "subtitle": subtitle,
        "subchapter": subchapter,
        "subpart": subpart,
    }
    params.update({key: value for key, value in optional.items() if value})

    return f'{url}?{urlencode(params)}'
    

# Replace every missing value with '' so gen_content_url's truthiness checks
# simply skip absent hierarchy levels.
df_sections = df_sections.fillna('')
# One {'id', 'url'} record per section row. The URL depends only on the
# title / subtitle / chapter / subchapter / part / subpart values, so rows that
# share all of those get the same URL.
urls = []
for i, row in df_sections.iterrows():
    # Progress log every 1000 rows; relies on `i` being an integer row label.
    if i % 1000 == 0:
        print(f'Processed {i} section rows of {len(df_sections)}')
    title = row['title']
    subtitle = row['subtitle']
    chapter = row['chapter']
    subchapter = row['subchapter']
    part = row['part']
    subpart = row['subpart']

    url = gen_content_url(title, subtitle, chapter, subchapter, part, subpart)
    urls.append({'id': row['id'], 'url': url})

# Frame of (id, url) pairs, presumably written to CSV further down - no export
# call is visible in this cell.
df_urls = pd.DataFrame(urls, columns=['id','url'])
