In [1]:
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np

This notebook generate a summary table containing:
    1. Name of the regulation
    2. Docket ID
    3. RIN
    4. Summary of the regulation
Input is a table of docket ids generated from all_docket_id notebook

### Input Table

In [2]:
df = pd.read_csv('docket_id_CFPB_all.csv') # read in the table of docket ids

### Functions

In [17]:
def get_bs_object(input_url):
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    capabilities = DesiredCapabilities.CHROME.copy()
    capabilities['acceptSslCerts'] = True 
    capabilities['acceptInsecureCerts'] = True

    driver = webdriver.Chrome(options = chrome_options,executable_path='./chromedriver',desired_capabilities=capabilities)
    driver.get(input_url)

    time.sleep(5)

    html=driver.page_source
    result_page=BeautifulSoup(html,"html.parser")
    return result_page

In [18]:
def generate_row(docket_id):    
    url="https://beta.regulations.gov/docket/" 
    homepage_url = url + docket_id

    result_page = get_bs_object(homepage_url)
    
    try:
        name = result_page.find('h1', {'class':'h3 mt-0 mb-1 font-weight-bold js-title'}).get_text()
    except:
        name = None
        
    try:
        Docket_ID = result_page.find('div',{'class':"card-block py-0 pl-2 small text-muted"}).find('p', {'class':'mb-0'}).get_text()
    except:
        Docket_ID = docket_id
        
    def get_RIN(result_page):
        try:
            if ('RIN' in 
            result_page.find('li', 
                             {'class':'list-group-item text-muted small'}).find('label', {'class': 
                                                                                          'control-label'}).get_text().strip()):
                return result_page.find('li', 
                             {'class':'list-group-item text-muted small'}).find('p', {'class':'mb-0'}).get_text()
        except:
            return 'Not Assigned'
    
    RIN = get_RIN(result_page)
    
    def get_summary(result_page):
        texts = result_page.find_all('p')
        for text in texts:
            text = text.get_text().strip()
            if (not text.startswith('Beta Evaluation') and 
                not text.startswith('An official') and 
                not text.startswith('Created by') and 
                not text.startswith('This count refers to the total comment') and 
                not text.startswith('We are currently in') and 
                not text.startswith('Regulations.gov') and
                not text.startswith('Docket') and len(text) > 15):
                return text
    try:
        summary = get_summary(result_page)
    except:
        summary = None
    
    return [name, Docket_ID, RIN, summary]

In [19]:
def generate_table(df):
    res = []
    for i in range(len(df)):
        docket_id= df.iloc[i,0]
        res.append(generate_row(docket_id))
    return res

### Run the code

In [20]:
res_list = generate_table(df)

In [21]:
df_res = pd.DataFrame(res_list, columns = ['Name', 'Docket_ID', 'RIN', 'Summary'])

### Result

In [22]:
df_res

Unnamed: 0,Name,Docket_ID,RIN,Summary
0,Electronic Fund Transfers (Regulation E),CFPB-2014-0008,3170-AA45,The Bureau of Consumer Financial Protection (B...
1,Agency Information Collection Activities: Comm...,CFPB-2015-0037,,In accordance with the Paperwork Reduction Act...
2,PRA Comment Request; MARS (Regulation O); File...,FTC-2019-0087,Not Assigned,The Federal Trade Commission (‘‘FTC’’ or ‘‘Com...
3,Amendments to Federal Mortgage Disclosure Requ...,CFPB-2016-0038,3170-AA61,The Bureau of Consumer Financial Protection (B...
4,Social Network and Citizen Engagement System,CFPB-2011-0014,,System of Records Notices (SCORNs)
...,...,...,...,...
70,Policy To Encourage Trial Disclosure Programs;...,CFPB-2012-0046,,The Bureau of Consumer Financial Protection (B...
71,Amendments to the 2013 Mortgage Rules under th...,CFPB-2014-0033,3170-AA49,The Bureau of Consumer Financial Protection (B...
72,Generic 60-Day PRA Notice,CFPB-2011-0033,,Paper Reduction Act; PRA
73,Regulation Z Truth in Lending Act (TILA),CFPB-2011-0031,Not Assigned,Regulation Z; TILA; Truth in Act


In [23]:
df_res.to_excel('Summary_CFPB_1.xlsx', index = False)