In [1]:
import pandas as pd
import requests
import re

#importing Pandas to create a DataFrame, requests to send requests, and re to work with regular expressions

The sectioning of content is managed through the use of <DIV#> elements, where # represents a digit such as 1, 2, 3, etc. Below is the tag key for the elements we will access:
- DIV3 => TYPE="CHAPTER"
- DIV4 => TYPE="SUBCHAP"
- DIV5 => TYPE="PART"
- DIV8 => TYPE="SECTION"

[tag key source](https://github.com/usgpo/bulk-data/blob/master/ECFR-XML-User-Guide.md)

# Beautiful Soup

In [2]:
from bs4 import BeautifulSoup as bs  #importing BeautifulSoup for parsing

In [3]:
xml_bs = requests.get('https://www.govinfo.gov/bulkdata/ECFR/title-16/ECFR-title16.xml', timeout=60)
#sending a GET request to the specified URL to retrieve its data
#requests has no default timeout, so one is set here to keep the cell from hanging forever on a stalled connection

xml_bs.raise_for_status()
#raising an HTTPError on a 4xx/5xx response so an error page is never handed to the parser

soup = bs(xml_bs.content, 'xml')
#saving my data as a BeautifulSoup object, which represents it as a nested data structure

In [4]:
list_of_dicts_bs = []  #one dict per section is appended to this list

#walking the hierarchy chapter (DIV3) -> subchapter (DIV4) -> part (DIV5) -> section (DIV8);
#each level copies the fields of its parent and adds its own, so the innermost dict holds the full row
for chapter_bs in soup.find_all('DIV3'):
    chapter_fields_bs = {
        'chapter': chapter_bs.attrs['N'],                 #value of the N attribute of the DIV3 element
        'chapter_title': chapter_bs.find('HEAD').text,    #text of the first HEAD tag found in the chapter
    }

    for subchapter_bs in chapter_bs.find_all('DIV4'):
        subchapter_fields_bs = {
            **chapter_fields_bs,
            'subchapter': subchapter_bs.attrs['N'],
            'subchapter_title': subchapter_bs.find('HEAD').text,
        }

        for part_bs in subchapter_bs.find_all('DIV5'):
            part_fields_bs = {
                **subchapter_fields_bs,
                'part': part_bs.attrs['N'],
                'part_title': part_bs.find('HEAD').text,
            }

            for section_bs in part_bs.find_all('DIV8'):
                list_of_dicts_bs.append({
                    **part_fields_bs,
                    'section': section_bs.attrs['N'][2:],             #N attribute with its first two characters dropped
                    'section_title': section_bs.find('HEAD').text,
                    'section_text': str(section_bs.find_all('P')),    #string form of the list of all P tags in the section
                })

In [5]:
#building a Pandas DataFrame from the section-level dicts; the explicit column list enforces the column order
df_bs = pd.DataFrame(
    list_of_dicts_bs,
    columns=[
        'chapter', 'chapter_title',
        'subchapter', 'subchapter_title',
        'part', 'part_title',
        'section', 'section_title', 'section_text',
    ],
)

In [6]:
for col in df_bs.columns:
    df_bs[col] = df_bs[col].str.strip()  #looping through my columns to strip any leading/trailing whitespace

#regular expression that matches square brackets, simple upper-case XML tags (<P>, </P>, ...) and newlines
#written as a raw string so the backslashes reach the regex engine without invalid-escape warnings
regex_bs = r"\[+|\]+|<[A-Z]+>+|<\/[A-Z]+>+|\n+"

#regex=True is required: newer pandas treats the pattern as a literal string by default, which would remove nothing
df_bs.section_text = df_bs.section_text.str.replace(regex_bs, '', regex=True)  #replacing any matches on regex_bs with nothing

In [7]:
df_bs.head()  #displaying the first 5 rows (the default for head()) of df_bs as a quick check of the result

Unnamed: 0,chapter,chapter_title,subchapter,subchapter_title,part,part_title,section,section_title,section_text
0,I,CHAPTER I - FEDERAL TRADE COMMISSION,A,"SUBCHAPTER A - ORGANIZATION, PROCEDURES AND RU...",0,PART 0 - ORGANIZATION,0.1,§ 0.1 The Commission.,The Federal Trade Commission is an independent...
1,I,CHAPTER I - FEDERAL TRADE COMMISSION,A,"SUBCHAPTER A - ORGANIZATION, PROCEDURES AND RU...",0,PART 0 - ORGANIZATION,0.2,§ 0.2 Official address.,The principal office of the Commission is at W...
2,I,CHAPTER I - FEDERAL TRADE COMMISSION,A,"SUBCHAPTER A - ORGANIZATION, PROCEDURES AND RU...",0,PART 0 - ORGANIZATION,0.3,§ 0.3 Hours.,Principal and field offices are open on each b...
3,I,CHAPTER I - FEDERAL TRADE COMMISSION,A,"SUBCHAPTER A - ORGANIZATION, PROCEDURES AND RU...",0,PART 0 - ORGANIZATION,0.4,§ 0.4 Laws administered.,The Commission exercises enforcement and admin...
4,I,CHAPTER I - FEDERAL TRADE COMMISSION,A,"SUBCHAPTER A - ORGANIZATION, PROCEDURES AND RU...",0,PART 0 - ORGANIZATION,0.5,§ 0.5 Laws authorizing monetary claims.,The Commission is authorized to entertain mone...


In [8]:
df_bs.info()  #printing a summary of df_bs: number of rows & columns, non-null counts, column dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2026 entries, 0 to 2025
Data columns (total 9 columns):
chapter             2026 non-null object
chapter_title       2026 non-null object
subchapter          2026 non-null object
subchapter_title    2026 non-null object
part                2026 non-null object
part_title          2026 non-null object
section             2026 non-null object
section_title       2026 non-null object
section_text        2026 non-null object
dtypes: object(9)
memory usage: 142.5+ KB


# ElementTree

In [9]:
import xml.etree.ElementTree as et  #importing ElementTree for parsing

In [10]:
xml_et = requests.get('https://www.govinfo.gov/bulkdata/ECFR/title-16/ECFR-title16.xml', timeout=60)
#sending a GET request to the specified URL to retrieve its data
#requests has no default timeout, so one is set here to keep the cell from hanging forever on a stalled connection

xml_et.raise_for_status()
#raising an HTTPError on a 4xx/5xx response so an error page is never handed to the parser

root = et.fromstring(xml_et.content)
#fromstring() parses the XML from my response content into an Element, which is the root element of the parsed tree

In [11]:
list_of_dicts_et = []  #creating an empty list that I will iteratively append results to

chapters_et = root.iter('DIV3')                                  #finding all DIV3 elements in the root Element
for chapter_et in chapters_et:                                   #looping through the chapters_et object
    chapter_num_et = chapter_et.attrib['N']                      #value of the N attribute of the DIV3 element
    chapter_title_et = chapter_et.find('HEAD').text              #text of the first HEAD child

    subchapters_et = chapter_et.iter('DIV4')                     #finding all DIV4 elements in chapter_et
    for subchapter_et in subchapters_et:                         #looping through the subchapters_et object
        subchapter_num_et = subchapter_et.attrib['N']            #value of the N attribute of the DIV4 element
        subchapter_title_et = subchapter_et.find('HEAD').text    #text of the first HEAD child

        parts_et = subchapter_et.iter('DIV5')                    #finding all DIV5 elements in subchapter_et
        for part_et in parts_et:                                 #looping through the parts_et object
            part_num_et = part_et.attrib['N']                    #value of the N attribute of the DIV5 element
            part_title_et = part_et.find('HEAD').text            #text of the first HEAD child

            sections_et = part_et.iter('DIV8')                   #finding all DIV8 elements in the part_et object
            for section_et in sections_et:                       #looping through the sections_et object
                section_num_et = section_et.attrib['N'][2:]      #N attribute with its first two characters dropped
                section_title_et = section_et.find('HEAD').text  #text of the first HEAD child

                #collecting the text of EVERY P child instead of keeping only the last one;
                #itertext() also includes text inside/after inline child tags, which p.text alone would drop
                paragraphs_et = [''.join(p.itertext()).strip() for p in section_et.findall('P')]
                #rebuilt for each section, so a section without P tags yields '' rather than the previous section's text
                section_text_et = ' '.join(paragraphs_et)

                list_of_dicts_et.append({'chapter': chapter_num_et, 'chapter_title': chapter_title_et,
                                      'subchapter': subchapter_num_et, 'subchapter_title': subchapter_title_et,
                                      'part': part_num_et, 'part_title': part_title_et, 'section': section_num_et,
                                      'section_title': section_title_et, 'section_text': section_text_et})
                #appending one dictionary per section to my results list

In [12]:
#building a Pandas DataFrame from the section-level dicts; the explicit column list enforces the column order
df_et = pd.DataFrame(
    list_of_dicts_et,
    columns=[
        'chapter', 'chapter_title',
        'subchapter', 'subchapter_title',
        'part', 'part_title',
        'section', 'section_title', 'section_text',
    ],
)

In [13]:
for col in df_et.columns:
    df_et[col] = df_et[col].str.strip()  #looping through my columns to strip any leading/trailing whitespace

#regular expression that matches square brackets, simple upper-case XML tags (<P>, </P>, ...) and newlines
#written as a raw string so the backslashes reach the regex engine without invalid-escape warnings
regex_et = r"\[+|\]+|<[A-Z]+>+|<\/[A-Z]+>+|\n+"

#regex=True is required: newer pandas treats the pattern as a literal string by default, which would remove nothing
df_et.section_text = df_et.section_text.str.replace(regex_et, '', regex=True)  #replacing any matches of regex_et with nothing

In [14]:
df_et.head()  #displaying the first 5 rows (the default for head()) of df_et as a quick check of the result

Unnamed: 0,chapter,chapter_title,subchapter,subchapter_title,part,part_title,section,section_title,section_text
0,I,CHAPTER I - FEDERAL TRADE COMMISSION,A,"SUBCHAPTER A - ORGANIZATION, PROCEDURES AND RU...",0,PART 0 - ORGANIZATION,0.1,§ 0.1 The Commission.,The Federal Trade Commission is an independent...
1,I,CHAPTER I - FEDERAL TRADE COMMISSION,A,"SUBCHAPTER A - ORGANIZATION, PROCEDURES AND RU...",0,PART 0 - ORGANIZATION,0.2,§ 0.2 Official address.,The principal office of the Commission is at W...
2,I,CHAPTER I - FEDERAL TRADE COMMISSION,A,"SUBCHAPTER A - ORGANIZATION, PROCEDURES AND RU...",0,PART 0 - ORGANIZATION,0.3,§ 0.3 Hours.,Principal and field offices are open on each b...
3,I,CHAPTER I - FEDERAL TRADE COMMISSION,A,"SUBCHAPTER A - ORGANIZATION, PROCEDURES AND RU...",0,PART 0 - ORGANIZATION,0.4,§ 0.4 Laws administered.,The Commission exercises enforcement and admin...
4,I,CHAPTER I - FEDERAL TRADE COMMISSION,A,"SUBCHAPTER A - ORGANIZATION, PROCEDURES AND RU...",0,PART 0 - ORGANIZATION,0.5,§ 0.5 Laws authorizing monetary claims.,The Commission is authorized to entertain mone...


In [15]:
df_et.info()  #printing a summary of df_et: number of rows & columns, non-null counts, column dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2026 entries, 0 to 2025
Data columns (total 9 columns):
chapter             2026 non-null object
chapter_title       2026 non-null object
subchapter          2026 non-null object
subchapter_title    2026 non-null object
part                2026 non-null object
part_title          2026 non-null object
section             2026 non-null object
section_title       2026 non-null object
section_text        2026 non-null object
dtypes: object(9)
memory usage: 142.5+ KB
