In [0]:
# Install PySpark into the notebook environment via a shell escape.
# The recorded run of this cell resolved pyspark 2.4.4 together with py4j 0.10.7.
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)
[K     |████████████████████████████████| 215.7MB 25.8MB/s 
[?25hCollecting py4j==0.10.7 (from pyspark)
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 45.0MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.4-py2.py3-none-any.whl size=216130387 sha256=753732181d8553ea3ff9ee0033f3714e3960fbc6d325c2e54eb7d09bff7c613b
  Stored in directory: /root/.cache/pip/wheels/ab/09/4d/0d184230058e654eb1b04467dbc1292f00eaa186544604b471
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 p

In [0]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup as bs 
import pyspark

from pyspark import SparkConf,  SparkContext
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *
import sys
# Raise Python's recursion limit to 100000 for the whole kernel session.
# NOTE(review): presumably needed so BeautifulSoup can handle the deeply
# nested eCFR XML without hitting RecursionError — confirm before lowering.
sys.setrecursionlimit(100000)

In [0]:
# Reuse the already-running SparkContext when this cell is executed again:
# constructing a second SparkContext in the same kernel raises a ValueError,
# whereas getOrCreate() returns the existing one (or starts it on first run).
sc = SparkContext.getOrCreate(conf=SparkConf())
# Session handle used by ecfr_parser() to build DataFrames.
spark = SparkSession(sc)

In [0]:
def ecfr_parser(title_num, timeout=60, show=True):
    """
    Download one eCFR title from govinfo.gov and flatten it into a Spark DataFrame.

    Simply enter the title number of the eCFR you'd like to parse (e.g. 16).
    This parser does not work on eCFR Titles 2, 3, 6, 13, 34, 35, and 38.

    One row is emitted per section (DIV8) reached through the nesting
    chapter (DIV3) > subchapter (DIV4) > part (DIV5); parts that are not
    nested inside a subchapter are therefore not emitted. The DataFrame is
    built with the module-level ``spark`` session.

    Parameters
    ----------
    title_num : int or str
        eCFR title number; interpolated into the bulk-data URL.
    timeout : float, optional
        Timeout in seconds handed to ``requests.get`` (default 60), so a
        stalled download cannot hang the notebook indefinitely.
    show : bool, optional
        When True (the default) the DataFrame is printed with ``df.show()``
        and ``None`` is returned, exactly as before. When False the
        DataFrame itself is returned so it can be used further.

    Returns
    -------
    None or pyspark.sql.DataFrame
        ``None`` when ``show`` is True, otherwise the parsed DataFrame with
        the schema columns plus an ``id`` column.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (e.g. unknown title
        number) instead of silently producing an empty table.
    """

    schema = StructType([StructField('chapter', StringType(), True),
                         StructField('chapter_title', StringType(), True),
                         StructField('subchapter', StringType(), True),
                         StructField('subchapter_title', StringType(), True),
                         StructField('part', StringType(), True),
                         StructField('part_title', StringType(), True),
                         StructField('section', StringType(), True),
                         StructField('section_title', StringType(), True),
                         StructField('section_text', StringType(), True)])

    url = "https://www.govinfo.gov/bulkdata/ECFR/title-{0}/ECFR-title{0}.xml".format(title_num)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page rather than parsing it into zero rows.
    response.raise_for_status()
    soup = bs(response.content, 'xml')

    rows = []

    for chapter in soup.find_all('DIV3'):
        chapter_num = chapter.attrs['N']
        chapter_title = chapter.find('HEAD').text

        for subchapter in chapter.find_all('DIV4'):
            subchapter_num = subchapter.attrs['N']
            subchapter_title = subchapter.find('HEAD').text

            for part in subchapter.find_all('DIV5'):
                part_num = part.attrs['N']
                part_title = part.find('HEAD').text

                for section in part.find_all('DIV8'):
                    # Drop the first two characters of the N attribute
                    # (the section-sign prefix) to keep just the number.
                    section_num = section.attrs['N'][2:]
                    section_title = section.find('HEAD').text
                    section_text = section.find_all('P')

                    rows.append({'chapter': chapter_num, 'chapter_title': chapter_title,
                                 'subchapter': subchapter_num, 'subchapter_title': subchapter_title,
                                 'part': part_num, 'part_title': part_title, 'section': section_num,
                                 'section_title': section_title,
                                 # str() of the tag list keeps the markup; it is stripped below.
                                 'section_text': str(section_text)})

    df = spark.createDataFrame(rows, schema)

    # Raw string: same pattern as before, without invalid Python escape
    # sequences. Removes list brackets, simple upper-case tags and newlines.
    regex = r"\[+|\]+|<[A-Z]+>+|<\/[A-Z]+>+|\n+"
    df = df.withColumn('section_text', F.regexp_replace(df.section_text, regex, ''))

    # Row id: md5 over the concatenated titles of every hierarchy level.
    df = df.withColumn('id', F.md5(F.concat('chapter_title', 'subchapter_title', 'part_title', 'section_title')))

    if show:
        return df.show()  # prints the first rows and returns None (original behaviour)
    return df

In [0]:
# Parse eCFR Title 16 and print the first rows of the resulting table.
ecfr_parser(16)

+-------+--------------------+----------+--------------------+----+--------------------+-------+--------------------+--------------------+--------------------+
|chapter|       chapter_title|subchapter|    subchapter_title|part|          part_title|section|       section_title|        section_text|                  id|
+-------+--------------------+----------+--------------------+----+--------------------+-------+--------------------+--------------------+--------------------+
|      I| CHAPTER I - FEDE...|         A|SUBCHAPTER A - OR...|   0|PART 0 - ORGANIZA...|    0.1|§ 0.1   The Commi...|The Federal Trade...|2fe0b83177937e17d...|
|      I| CHAPTER I - FEDE...|         A|SUBCHAPTER A - OR...|   0|PART 0 - ORGANIZA...|    0.2|§ 0.2   Official ...|The principal off...|aff4c7f07a8f5c9c5...|
|      I| CHAPTER I - FEDE...|         A|SUBCHAPTER A - OR...|   0|PART 0 - ORGANIZA...|    0.3|      § 0.3   Hours.|Principal and fie...|b20c61abfc2b34ccc...|
|      I| CHAPTER I - FEDE...|         A