# Instructions

## Download Comments Source CSV
1. Download the full list of available comments from [https://www.regulations.gov/docketBrowser?rpp=25&so=DESC&sb=commentDueDate&po=0&dct=PS&D=DOI-2017-0002]
2. Open DOCKET_DOI-2017-0002.csv and delete lines 1-5, i.e. everything before the header row "Document Title,Document Type,Attachment Count..."
3. Copy file (DOCKET_DOI-2017-0002.csv) to working directory

## Activate Source Environment
```bash
source activate benm
jupyter notebook
```



In [None]:
# start here!

# to do: download attachments

# import db_settings
import psycopg2
import os, errno, csv

from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from dateutil.parser import parse
from datetime import date

# Placeholder for the document IDs to skip; a later cell rebinds this name to
# the set of document_id values read from the ignore_list database table.
ignore_list = []

# Single timeout value shared by the Firefox profile preferences, the
# WebDriver implicit wait and the explicit page-title wait.
TIMEOUT = 15 # WebDriver timeout (page load, etc.), in seconds

def benm_driver():
    """Build and return a Firefox WebDriver configured with TIMEOUT.

    Both timeout-related Firefox preferences and the driver's implicit wait
    are set to the module-level TIMEOUT value (seconds).
    """
    profile = webdriver.FirefoxProfile()
    for preference in ("http.response.timeout", "dom.max_script_run_time"):
        profile.set_preference(preference, TIMEOUT)

    browser = webdriver.Firefox(firefox_profile=profile)
    browser.implicitly_wait(TIMEOUT)  # seconds
    return browser

def get_comments(comments):
    """Scrape each comment page and insert the result into the database.

    Args:
        comments: iterable of dicts (rows of the docket CSV) providing at
            least the 'Document Detail' (page URL) and 'Document ID' keys.

    For every comment the page is fetched with get_comment(). On success the
    values are inserted into the ``comments`` table. If the insert violates
    a unique constraint (pgcode 23505) the document is recorded in the
    ``ignore_list`` table instead. When a page cannot be retrieved, the
    browser is restarted and processing continues with the next comment.

    Raises:
        Any exception from the insert other than a unique-constraint
        violation is printed and re-raised. The browser and the database
        connection are released in all cases.
    """
    query = 'INSERT INTO comments (document_id, tracking_number, date_posted, comment, has_attachments, retrieved) VALUES (%(document_id)s, %(tracking_number)s, %(date_posted)s, %(comment_text)s, %(has_attachments)s, now());'
    ignore_query = "INSERT INTO ignore_list (document_id, reason) VALUES (%s, %s);"

    conn = psycopg2.connect("dbname=benm user=postgres")
    driver = None
    try:
        # autocommit: every INSERT is persisted immediately, no commit() needed
        conn.set_session(autocommit=True)
        cur = conn.cursor()
        driver = benm_driver()

        for comment in comments:
            document_id = comment['Document ID']
            comment_values = get_comment(driver, comment['Document Detail'], document_id)

            if not comment_values:
                print("Error retrieving document %s." % document_id)
                # Start over with a fresh browser after a failed retrieval.
                driver.quit()
                driver = None  # avoid a second quit() if the restart fails
                driver = benm_driver()
                continue

            try:
                cur.execute(query, comment_values)
            except Exception as ex:
                # Not every exception carries a pgcode; read it defensively so
                # the original error is never masked by an AttributeError.
                if getattr(ex, 'pgcode', None) == '23505':  # unique constraint violated
                    cur.execute(ignore_query, (document_id, 'Duplicate Tracking Number'))
                    print('Duplicate tracking number found for comment %s.' % document_id)
                else:
                    print(ex)
                    print('consider adding %s to ignore list' % document_id)
                    raise
    finally:
        # Always release the browser and the connection, even when re-raising.
        try:
            if driver is not None:
                driver.quit()
        finally:
            conn.close()
    
def get_comment(driver, url, source_document_id):
    """Load one comment page and extract its fields.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: address of the comment page.
        source_document_id: document ID the page is expected to show.

    Returns:
        A dict with the keys 'document_id', 'tracking_number', 'date_posted'
        (a datetime.date), 'comment_text' and 'has_attachments' (bool), or
        False when the page times out or a required element is missing.

    Raises:
        AssertionError: the document ID shown on the page differs from
            source_document_id.
        Errors raised while parsing the posted date are not caught here.
    """
    # All fields are located by absolute XPath below the same table row.
    row = '/html/body/div[3]/div[2]/div[2]/div[3]/div/table/tbody/tr'
    details = row + '/td[3]/div/div/div[2]'
    content = row + '/td[1]/div/div[3]'

    try:
        driver.get(url)
        WebDriverWait(driver, TIMEOUT).until(
            EC.title_is("Regulations.gov - Comment")
        )

        document_id = driver.find_element(By.XPATH, details + '/div[1]/div[1]/span[2]').text
        # Make sure the page matches. An explicit raise is used instead of
        # `assert` so the check still runs when Python is started with -O.
        if document_id != source_document_id:
            raise AssertionError(
                'page shows document %s, expected %s' % (document_id, source_document_id)
            )

        tracking_number = driver.find_element(By.XPATH, details + '/div[1]/div[2]/span[2]').text

        posted = parse(driver.find_element(By.XPATH, details + '/div[4]/div/div/span[2]').text)
        date_posted = date(posted.year, posted.month, posted.day)

        comment_text = driver.find_element(By.XPATH, content + '/div[1]/div/div[2]').text

        # The attachments heading is optional; its absence is not an error.
        try:
            heading = driver.find_element(By.XPATH, content + '/div[2]/div[1]/h2/span')
            has_attachments = (heading.text == "Attachments")
        except NoSuchElementException:
            has_attachments = False

        return {
            'document_id': document_id,
            'tracking_number': tracking_number,
            'date_posted': date_posted,
            'comment_text': comment_text,
            'has_attachments': has_attachments,
        }

    except (TimeoutException, NoSuchElementException):
        return False

In [None]:
# read list of comments to download from CSV
# newline='' lets the csv module handle line endings inside quoted fields
with open('DOCKET_DOI-2017-0002.csv', 'r', newline='') as f:
    comments = list(csv.DictReader(f))

# ignore withdrawn comments and others of non "public submission" type
comments = [c for c in comments if c['Document Type'] == 'PUBLIC SUBMISSIONS']

# sort in ascending order of the numeric part that follows the first
# 14 characters of the document ID
comments = sorted(comments, key=lambda k: int(k['Document ID'][14:]))

# fetch the IDs that are already stored or explicitly ignored
conn = psycopg2.connect("dbname=benm user=postgres")
try:
    with conn:
        with conn.cursor() as cur:
            cur.execute('SELECT document_id FROM comments;')
            downloaded_comments = {row[0] for row in cur.fetchall()}

            cur.execute('SELECT document_id FROM ignore_list;')
            ignore_list = {row[0] for row in cur.fetchall()}
except psycopg2.Error as e:
    print(e.pgerror)
    # Without both ID sets the filtering below cannot run, so stop here
    # instead of failing later on an undefined name.
    raise
finally:
    # `with conn` only ends the transaction; the connection is closed here.
    conn.close()

print('comments in db: ' + str(len(downloaded_comments)))
print('ignore list: ' + str(len(ignore_list)))

# keep only comments that are neither downloaded nor ignored
comments = [
    c for c in comments
    if c['Document ID'] not in downloaded_comments
    and c['Document ID'] not in ignore_list
]

print('remaining comments: ' + str(len(comments)))

In [None]:
# spawn up to 8 workers, each scraping one contiguous batch of comments

from multiprocessing import Process

workers = 8
# Ceiling division so the remainder is not dropped: the batches together
# cover every comment exactly once.
batch_size = -(-len(comments) // workers)

processes = []

for i in range(workers):
    # Slice ends are exclusive, so the end index is (i+1)*batch_size with no -1.
    batch = comments[i * batch_size:(i + 1) * batch_size]
    if not batch:
        # fewer comments than workers: do not start a browser for nothing
        continue
    process = Process(target=get_comments, args=(batch,))
    process.start()
    processes.append(process)

for process in processes:
    process.join()

In [None]:
import pandas
import pandas.io.sql as psql


def _write_sorted_csv(df, path):
    """Add a document_url column, sort numerically by document ID, write CSV.

    The sort key is stored in a temporary 'id' column that is dropped before
    writing. If the frame already has an 'id' column (e.g. from SELECT *),
    it is overwritten by the key and therefore also left out of the CSV.
    """
    df['document_url'] = 'https://www.regulations.gov/document?D=' + df['document_id']

    # Compare the numeric part as integers; as strings, '10000' sorts before '9999'.
    df['id'] = df['document_id'].str[14:].astype(int)
    df = df.sort_values(by='id')
    df = df.drop('id', axis=1)  # axis=1 selects columns
    df.to_csv(path, index=False)


conn = psycopg2.connect("dbname=benm user=postgres")
try:
    # write comments to CSV
    df = psql.read_sql("SELECT document_id, tracking_number, date_posted, retrieved, has_attachments, comment FROM comments;", conn)
    _write_sorted_csv(df, 'dataset/comments.csv')

    # write ignore list to CSV
    df = psql.read_sql("SELECT * FROM ignore_list;", conn)
    _write_sorted_csv(df, 'dataset/ignore_list.csv')
finally:
    conn.close()


In [None]:
# to do - automate packaging?
# to package - create xlsx version, save as zip