In [1]:
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
import os

### This notebook uses Selenium to download all PDFs for the docket IDs listed in an input table

### Functions

In [3]:
def get_bs_object(input_url):
    """Load ``input_url`` in headless Chrome and return the page as BeautifulSoup.

    A new Chrome instance (driven by ``./chromedriver``) is started for every
    call and is always shut down before returning, even if navigation fails,
    so repeated calls do not pile up browser processes.

    Parameters
    ----------
    input_url : str
        Address of the page to open.

    Returns
    -------
    bs4.BeautifulSoup
        The page source, parsed with ``html.parser``.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    # Accept self-signed / insecure certificates instead of failing the load.
    capabilities = DesiredCapabilities.CHROME.copy()
    capabilities['acceptSslCerts'] = True
    capabilities['acceptInsecureCerts'] = True

    driver = webdriver.Chrome(options=chrome_options,
                              executable_path='./chromedriver',
                              desired_capabilities=capabilities)
    try:
        driver.get(input_url)

        # Fixed pause before reading the source — presumably to give the
        # page's scripts time to render the content.
        time.sleep(5)

        html = driver.page_source
    finally:
        # The HTML is already captured (or the load failed); either way the
        # browser is no longer needed.
        driver.quit()

    return BeautifulSoup(html, "html.parser")

In [4]:
def get_num_pages(document_page):
    """Return the page count shown by the first pagination list on the page.

    The count is the number of ``<li>`` items in the first
    ``<ul class="pagination pagination-sm">`` minus two (presumably the
    previous/next controls — confirm against the live markup).

    Raises
    ------
    IndexError
        If the page contains no such pagination list.
    """
    pagination_lists = document_page.find_all('ul', {'class': 'pagination pagination-sm'})
    first_list = pagination_lists[0]
    items = first_list.find_all('li')
    return len(items) - 2

In [6]:
def get_link(document_page, doc_dict):
    """Add the document links found on one listing page to ``doc_dict``.

    Every ``<div class="card-block">`` contributes one link: the stripped text
    of its first ``<p>`` is the document type (the dictionary key) and the
    ``href`` of its ``<a class="ember-view">`` is appended to the site root.

    ``doc_dict`` is modified in place and also returned.
    """
    site_root = 'https://beta.regulations.gov'
    for card in document_page.find_all('div', {'class':'card-block'}):
        kind = card.find('p').get_text().strip()
        anchor = card.find('a', {'class':'ember-view'})
        # Build the link first so a malformed card fails before the
        # dictionary is touched.
        full_link = site_root + anchor.get('href')
        doc_dict.setdefault(kind, []).append(full_link)
    return doc_dict

In [8]:
def get_all_links(url, docket_id):
    """Collect the document links of a docket across all listing pages.

    Parameters
    ----------
    url : str
        Base docket URL; ``docket_id + '/document'`` is appended to it.
    docket_id : str
        Identifier of the docket.

    Returns
    -------
    dict
        Maps each document type to the list of its document links, as
        accumulated by ``get_link`` over every page.
    """
    document_url = url + docket_id + '/document'
    document_page = get_bs_object(document_url)

    try:
        num_pages = get_num_pages(document_page)
    except Exception:
        # No pagination list on the page: treat it as a single page. Catching
        # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
        num_pages = 1

    doc_dict = get_link(document_page, {})

    # Pages after the first are addressed with ?pageNumber=N; the range is
    # empty when there is only one page.
    for page_number in range(2, num_pages + 1):
        url_next_page = document_url + '?pageNumber={}'.format(page_number)
        document_next_page = get_bs_object(url_next_page)
        doc_dict = get_link(document_next_page, doc_dict)

    return doc_dict

In [9]:
def get_document_url(docket_id):
    """Return the document links of ``docket_id`` grouped by document type.

    Thin wrapper around ``get_all_links`` that supplies the docket base URL.
    The page is fetched only inside ``get_all_links``; the extra, discarded
    fetch that used to happen here has been removed.
    """
    url = 'https://beta.regulations.gov/docket/'
    return get_all_links(url, docket_id)

In [14]:
def get_comment_links_onepage(comment_homepage):
    """Return the absolute comment links listed on one comment page.

    Each ``<h3 class="h4 card-title">`` is expected to contain an
    ``<a class="ember-view">`` whose ``href`` is appended to the site root.
    """
    headings = comment_homepage.find_all('h3', {'class':'h4 card-title'})
    return [
        'https://beta.regulations.gov' + heading.find('a', {'class':'ember-view'}).get('href')
        for heading in headings
    ]

In [15]:
def get_all_comment_links(url_comment_homepage):
    """Collect the comment links for each comment landing page.

    Parameters
    ----------
    url_comment_homepage : iterable of str
        Comment landing-page URLs, one per document.

    Returns
    -------
    list of list of str
        One inner list per input URL, in the same order, holding the comment
        links gathered from every page of that document's comment listing.
    """
    res_all = []
    for link in url_comment_homepage:
        comment_homepage = get_bs_object(link)

        try:
            num_pages = get_num_pages(comment_homepage)
        except Exception:
            # No pagination list: single page. Catching Exception (not a bare
            # except) keeps Ctrl-C / SystemExit working.
            num_pages = 1

        comment_links = get_comment_links_onepage(comment_homepage)

        # Remaining pages use ?pageNumber=N; the range is empty for one page.
        for page_number in range(2, num_pages + 1):
            url_next_page = link + '?pageNumber={}'.format(page_number)
            comment_next_page = get_bs_object(url_next_page)
            comment_links.extend(get_comment_links_onepage(comment_next_page))

        res_all.append(comment_links)
    return res_all

In [16]:
def get_all_comments(proposed_rule):
    """Return the comment links for every document link in ``proposed_rule``.

    ``'/comment'`` is appended to each document link to form its comment
    landing page, and the pages are handed to ``get_all_comment_links``.
    The result holds one list of comment links per input link.
    """
    landing_pages = [document_link + '/comment' for document_link in proposed_rule]
    return get_all_comment_links(landing_pages)

In [17]:
def download(download_link, download_dir):
    """Open ``download_link`` in headless Chrome so the file is saved to ``download_dir``.

    Parameters
    ----------
    download_link : str
        URL of the file to download.
    download_dir : str
        Directory the browser should save the file into. It is resolved to an
        absolute path so the result does not depend on the browser process's
        working directory.

    Notes
    -----
    NOTE(review): the browser is not shut down here. The download presumably
    continues after ``driver.get`` returns, so quitting immediately could
    abort it; as a consequence every call leaves a Chrome process behind.
    Consider waiting for the file to appear and then calling ``driver.quit()``.
    """

    def enable_download_headless(browser, target_dir):
        # Register the Chromium-specific "send_command" endpoint and use it to
        # allow downloads into target_dir (headless Chrome blocks them otherwise).
        browser.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
        params = {'cmd':'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': target_dir}}
        browser.execute("send_command", params)

    target_dir = os.path.abspath(download_dir)

    # Boilerplate Chrome options; some of them may be unnecessary.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920x1080")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--verbose')
    chrome_options.add_experimental_option("prefs", {
            # Point the default directory at the real target instead of the
            # placeholder string that was left in from the boilerplate.
            "download.default_directory": target_dir,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing_for_trusted_sources_enabled": False,
            "safebrowsing.enabled": False
    })
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--disable-software-rasterizer')

    # `options=` matches get_bs_object; `chrome_options=` is the deprecated spelling.
    driver = webdriver.Chrome(options=chrome_options, executable_path='./chromedriver')

    # Allow downloads in headless mode, then navigate to trigger the download.
    enable_download_headless(driver, target_dir)
    driver.get(download_link)


In [64]:
def download_all(docket_id, url_dict):
    """Download every document of a docket, plus the attachments of its comments.

    Files are written below ``./downloads_3/<docket_id>/<file_type>/``.

    * If any document of a file type has comments, each document of that type
      gets its own ``<document code>/`` folder containing ``content.pdf``, a
      ``Links.csv`` listing its comment links, and a ``comment/`` folder into
      which ``attachment_1.pdf`` of every comment is downloaded.
    * Otherwise the documents' ``content.pdf`` files are downloaded straight
      into the file-type folder.

    Parameters
    ----------
    docket_id : str
        Docket identifier; used as the folder name.
    url_dict : dict
        Maps file type to a list of document links (as returned by
        ``get_document_url``).

    Returns
    -------
    bool
        True if comments were found for at least one file type, else False.
    """
    download_root = 'https://downloads.regulations.gov/'

    # The last path segment of a document link is the code used in download URLs.
    code_dict = {
        file_type: [link.strip().split('/')[-1] for link in links]
        for file_type, links in url_dict.items()
    }

    parent_dir = "./downloads_3/"
    path = os.path.join(parent_dir, docket_id)

    # Defined up front so the return value never depends on a NameError.
    check_comment = False

    for file_type, file_links in url_dict.items():
        sub_path = path + '/' + file_type
        # makedirs also creates the docket folder and tolerates re-runs.
        os.makedirs(sub_path, exist_ok=True)

        comments = get_all_comments(file_links)

        # Look at every document, not only the first one, when deciding
        # whether this file type has comments.
        if any(comments):
            check_comment = True

            # comments holds one list of links per document, in link order.
            for file_code, comment_links in zip(code_dict[file_type], comments):
                download_dir = sub_path + '/' + file_code
                os.makedirs(download_dir, exist_ok=True)
                download(download_root + file_code + '/content.pdf', download_dir)

                download_dir_comment = download_dir + '/comment'
                os.makedirs(download_dir_comment, exist_ok=True)

                # One comment link per row.
                df = pd.DataFrame(np.array(comment_links)[:, np.newaxis])
                df.to_csv(download_dir + '/Links.csv')

                for comment_link in comment_links:
                    code = comment_link.strip().split('/')[-1]
                    download(download_root + code + '/attachment_1.pdf', download_dir_comment)
        else:
            for file_code in code_dict[file_type]:
                download(download_root + file_code + '/content.pdf', sub_path)

    return check_comment

### Run

1. Read in the docket_id table generated by the "all_docket_id" notebook

In [151]:
# Load the docket-id table; the first spreadsheet column becomes the index.
df_docket = pd.read_excel('Table.xlsx', index_col = 0)

In [157]:
df_docket

Unnamed: 0,0
0,CFPB-2012-0037
1,CFPB-2012-0008
2,CFPB-2011-0009
3,CFPB-2012-0052
4,CFPB-2011-0007
5,CFPB-2013-0033
6,CFPB-2012-0051
7,CFPB-2012-0061
8,CFPB-2012-0032
9,HUD-2013-0093


### Record downloaded files

In [159]:
# For every docket: scrape its document links, download the files, save the
# links next to them, and record which document types (plus 'Comment') exist.
final_list = []
# Iterate the first column directly; np.squeeze would collapse a one-row
# table to a 0-d array that cannot be iterated.
for docket_id in df_docket.iloc[:, 0]:
    url_dict = get_document_url(docket_id)
    check = download_all(docket_id, url_dict)
    # Columns are document types; shorter link lists are padded with NaN.
    df = pd.DataFrame({k: pd.Series(v) for k, v in url_dict.items()})

    parent_dir = "./downloads_3/"
    path = os.path.join(parent_dir, docket_id)
    # Make sure the docket folder exists even when nothing was downloaded.
    os.makedirs(path, exist_ok=True)
    df.to_csv(path + '/Links.csv')

    all_doc = list(url_dict.keys())
    if check:
        all_doc.append('Comment')

    final_list.append(all_doc)



In [160]:
final_list

[['Proposed Rule', 'Rule', 'Supporting & Related Material', 'Comment'],
 ['Notice'],
 ['Proposed Rule', 'Rule', 'Comment'],
 ['Notice'],
 ['Rule', 'Comment'],
 ['Proposed Rule', 'Comment'],
 ['Rule'],
 ['Notice', 'Comment'],
 ['Proposed Rule', 'Rule', 'Comment'],
 ['Proposed Rule', 'Supporting & Related Material', 'Rule', 'Comment'],
 ['Notice', 'Comment'],
 ['Notice', 'Comment'],
 ['Notice'],
 ['Rule', 'Comment'],
 ['Notice', 'Supporting & Related Material', 'Comment'],
 ['Notice'],
 ['Notice', 'Comment'],
 ['Notice', 'Supporting & Related Material'],
 ['Proposed Rule', 'Rule', 'Comment'],
 ['Notice'],
 ['Proposed Rule', 'Rule', 'Comment'],
 ['Proposed Rule', 'Comment'],
 ['Proposed Rule', 'Rule', 'Comment'],
 ['Notice', 'Comment'],
 ['Notice', 'Comment'],
 ['Rule', 'Comment'],
 ['Notice', 'Comment'],
 ['Notice', 'Comment'],
 ['Rule', 'Comment'],
 ['Proposed Rule', 'Comment'],
 ['Rule', 'Comment'],
 ['Rule', 'Other', 'Comment'],
 ['Notice'],
 ['Rule'],
 ['Proposed Rule', 'Rule', 'Comm

In [161]:
# One row per docket; shorter rows are padded with missing values.
b = pd.DataFrame(final_list)

In [162]:
# Reset to a 0..n-1 index (in place) so rows align with the default index of `b`.
df_docket.index = range(len(df_docket))

In [163]:
# Column-wise join on the index: docket id next to its document types.
df = pd.concat([df_docket ,b], axis = 1)

In [164]:
df

Unnamed: 0,0,0.1,1,2,3
0,CFPB-2012-0037,Proposed Rule,Rule,Supporting & Related Material,Comment
1,CFPB-2012-0008,Notice,,,
2,CFPB-2011-0009,Proposed Rule,Rule,Comment,
3,CFPB-2012-0052,Notice,,,
4,CFPB-2011-0007,Rule,Comment,,
5,CFPB-2013-0033,Proposed Rule,Comment,,
6,CFPB-2012-0051,Rule,,,
7,CFPB-2012-0061,Notice,Comment,,
8,CFPB-2012-0032,Proposed Rule,Rule,Comment,
9,HUD-2013-0093,Proposed Rule,Supporting & Related Material,Rule,Comment


In [208]:
# NOTE(review): `fianl_sum` is not defined anywhere in the visible notebook
# (the execution count jumps from 164 to 208, so it was presumably built in
# cells that are not shown, and the name looks like a typo of `final_sum`).
# Confirm its origin before relying on Restart & Run All.
fianl_sum.to_excel('Final_Summary_CFPB.xlsx', index = False)