In [None]:
# create database and tables

# PostgreSQL 9.x
# 1. create user with credentials in db_settings.py
# 2. run query below manually (e.g. using psql)
"""
CREATE DATABASE benm
    WITH 
    ENCODING = 'UTF8'
    CONNECTION LIMIT = -1;
CREATE USER benmuser WITH PASSWORD 'Ki3nslkj4nb';
GRANT ALL ON DATABASE benm TO benmuser;
\connect benm
ALTER SCHEMA public OWNER TO benmuser;
ALTER DATABASE benm OWNER TO benmuser;
ALTER DEFAULT PRIVILEGES 
    FOR USER benmuser
    IN SCHEMA public
    GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO benmuser;
    
"""   


# create the comments table. uncomment the execute line below and run
# to do: create the attachments table
import db_settings
import psycopg2

query = """
DROP TABLE IF EXISTS comments CASCADE;
CREATE TABLE comments
(
    id SERIAL PRIMARY KEY,
    document_id VARCHAR UNIQUE,
    tracking_number VARCHAR UNIQUE,
    date_posted DATE,
    retrieved TIMESTAMP,
    has_attachments BOOLEAN DEFAULT FALSE,
    comment TEXT
);
CREATE INDEX idx_document_id ON comments(document_id);
CREATE INDEX idx_date ON comments(date_posted);
"""

# Connect to the database
conn = psycopg2.connect(
        database=db_settings.DB,
        user=db_settings.USER,
        password=db_settings.PASSWD,
        host=db_settings.HOST)
with conn:
    with conn.cursor() as curs:
        # curs.execute(query) # uncomment this line to create the table
        # print(curs.statusmessage)
    
conn.close()


In [9]:
# to do: handle errors for duplicate keys
# to do: download attachments
# to do: incorporate ignore list into database

import db_settings
import psycopg2
import os, errno, csv

from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from dateutil.parser import parse
from datetime import date

ignore_list = []

def get_comment(driver, url, source_document_id):
    try:
        driver.get(url)
        element = WebDriverWait(driver, 10).until(
            EC.title_is("Regulations.gov - Comment")
        )

        element = driver.find_element(By.XPATH, '/html/body/div[3]/div[2]/div[2]/div[3]/div/table/tbody/tr/td[3]/div/div/div[2]/div[1]/div[1]/span[2]')
        document_id = element.text
        assert document_id == source_document_id # make sure the page matches
        element = driver.find_element(By.XPATH, '/html/body/div[3]/div[2]/div[2]/div[3]/div/table/tbody/tr/td[3]/div/div/div[2]/div[1]/div[2]/span[2]')
        tracking_number = element.text
        element = driver.find_element(By.XPATH, '/html/body/div[3]/div[2]/div[2]/div[3]/div/table/tbody/tr/td[3]/div/div/div[2]/div[4]/div/div/span[2]')
        d = parse(element.text)
        date_posted = date(d.year, d.month, d.day)
        element = driver.find_element(By.XPATH, '/html/body/div[3]/div[2]/div[2]/div[3]/div/table/tbody/tr/td[1]/div/div[3]/div[1]/div/div[2]')
        comment_text = element.text
        try:
            element = driver.find_element(By.XPATH, '/html/body/div[3]/div[2]/div[2]/div[3]/div/table/tbody/tr/td[1]/div/div[3]/div[2]/div[1]/h2/span')
            has_attachments = (element.text == "Attachments")
        except NoSuchElementException:
            has_attachments = False

        result = { 
            'document_id': document_id,
            'tracking_number': tracking_number,
            'date_posted': date_posted,
            'comment_text': comment_text,
            'has_attachments': has_attachments
        }

        return result
    
    except TimeoutException as ex:
        return False
    except NoSuchElementException as ex:
        return False
    else:
        return False

def benm_driver():
    fp = webdriver.FirefoxProfile()
    fp.set_preference("http.response.timeout", 15)
    fp.set_preference("dom.max_script_run_time", 15)
    driver = webdriver.Firefox(firefox_profile=fp)
    driver.implicitly_wait(15) # seconds
    return driver

def get_comments(comments):
    conn = psycopg2.connect(database=db_settings.DB, user=db_settings.USER, password=db_settings.PASSWD, host=db_settings.HOST)
    conn.set_session(autocommit=True)
    cur = conn.cursor()
    query = 'INSERT INTO comments (document_id, tracking_number, date_posted, comment, has_attachments, retrieved) VALUES (%(document_id)s, %(tracking_number)s, %(date_posted)s, %(comment_text)s, %(has_attachments)s, now());'
    
    driver = benm_driver()
    
    for comment in comments:
        comment_values = get_comment(driver, comment['Document Detail'], comment['Document ID'])
        if comment_values:
            cur.execute(query, comment_values)
            # to do: handle duplicate keys
        else:
            ignore_list.append(comment['Document ID'])
            driver.quit()
            driver = benm_driver()

    driver.quit()
    
    # conn.commit() # Make the changes to the database persistent
    conn.close()

In [10]:
# read list of comments to download from CSV
with open('DOCKET_DOI-2017-0002.csv', 'r') as f:
    reader = csv.DictReader(f)
    comments = [row for row in reader]

comments = [c for c in comments if c['Document Type'] == 'PUBLIC SUBMISSIONS']
    
comments = sorted(comments, key=lambda k: int(k['Document ID'][14:])) # sort in ascneindg order

# description comment
ignore_list.append('DOI-2017-0002-0001')

# duplicates
ignore_list.append('DOI-2017-0002-24709') # duplicate tracking Number: 1k1-8wf9-5su4
ignore_list.append('DOI-2017-0002-24749')
ignore_list.append('DOI-2017-0002-31984')
ignore_list.append('DOI-2017-0002-91639')
ignore_list.append('DOI-2017-0002-64952')
ignore_list.append('DOI-2017-0002-91832')
ignore_list.append('DOI-2017-0002-61934')
ignore_list.append('DOI-2017-0002-105034')
ignore_list.append('DOI-2017-0002-30092')
ignore_list.append('DOI-2017-0002-36651')
ignore_list.append('DOI-2017-0002-63026')
ignore_list.append('DOI-2017-0002-30822')
ignore_list.append('DOI-2017-0002-94248')
ignore_list.append('DOI-2017-0002-94323')
ignore_list.append('DOI-2017-0002-12214')
ignore_list.append('DOI-2017-0002-82613')
ignore_list.append('DOI-2017-0002-14302')
ignore_list.append('DOI-2017-0002-106094')
ignore_list.append('DOI-2017-0002-92237')
ignore_list.append('DOI-2017-0002-85466')
ignore_list.append('DOI-2017-0002-18278')
ignore_list.append('DOI-2017-0002-24722')
ignore_list.append('DOI-2017-0002-88668')
ignore_list.append('DOI-2017-0002-24740')
ignore_list.append('DOI-2017-0002-90447')
ignore_list.append('DOI-2017-0002-95787')
ignore_list.append('DOI-2017-0002-75285')
ignore_list.append('DOI-2017-0002-63685')
ignore_list.append('DOI-2017-0002-75763')
ignore_list.append('DOI-2017-0002-91019')
ignore_list.append('DOI-2017-0002-60433')
ignore_list.append('DOI-2017-0002-85266')
ignore_list.append('DOI-2017-0002-91567')
ignore_list.append('DOI-2017-0002-91285')
ignore_list.append('DOI-2017-0002-88631')
ignore_list.append('DOI-2017-0002-64214')


# general errors
ignore_list.append('DOI-2017-0002-99259')


conn = psycopg2.connect(database=db_settings.DB, user=db_settings.USER, password=db_settings.PASSWD, host=db_settings.HOST)
with conn:
    with conn.cursor() as cur:
        query = 'SELECT document_id FROM comments;'
        try:
            cur.execute(query)
            downloaded_comments = set([c[0] for c in cur.fetchall()])
        except psycopg2.Error as e:
            print (query)
            print (e.pgerror)
conn.close()

print('comments in db: ' + str(len(downloaded_comments)))

comments = [c for c in comments if not c['Document ID'] in downloaded_comments]
comments = [c for c in comments if not c['Document ID'] in ignore_list]

print('remaining comments: ' + str(len(comments)))


comments in db: 103633
remaining comments: 15547


In [None]:
# download comments. This will scrape everything, but hides errors.

# from multiprocessing import Process
# from multiprocessing import Pool

# n = 50 # batch size

# with Pool(processes=4, maxtasksperchild=1) as pool:
#     pool.map_async(get_comments, [comments[i:i + n] for i in range(0, len(comments), n)], chunksize=1).get(99999)


#     pool.close()
#     pool.join()

In [8]:
# spawn 8 workers to scrape 5000 comments each. Repeat as necessary.

from multiprocessing import Process

batch_size = 2000

processes = []

for i in range(8):
    processes.append(Process(target=get_comments, args=(comments[i*batch_size:((i+1)*batch_size)-1],)))
    processes[i].start()

for i in range(8):
    processes[i].join()

Process Process-3:
KeyboardInterrupt
Process Process-4:
Process Process-6:
Process Process-7:
Process Process-8:
Process Process-1:
Process Process-2:
Process Process-5:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "<ipython-input-1-730fa538ec37>", line 24, in get_comment
    EC.title_is("Regulations.gov - Comment")
  File "<ipython-input-1-730fa538ec37>", line 24, in get_comment
    EC.title_is("Regulations.gov - Comment")
  File "<ipython-input-1-730fa538ec37>", line 24, in get_comment
    EC.title_is("Regulations.gov - Comment")
  File "<ipython-input-1-730fa538ec37>", line 24, in get_comment
    EC.title_is("Regulations.gov - Comment")
  File "<ipython-input-1-730fa538ec37>", line 24, in get_comment
    EC.title_is("Regulations.gov - Comment")
  File "/Users/markegge/anaconda3/e

  File "<ipython-input-1-730fa538ec37>", line 55, in get_comment
    except NoSuchElementException as ex:
  File "<ipython-input-1-730fa538ec37>", line 55, in get_comment
    except NoSuchElementException as ex:
NameError: name 'NoSuchElementException' is not defined
  File "<ipython-input-1-730fa538ec37>", line 55, in get_comment
    except NoSuchElementException as ex:
  File "/Users/markegge/anaconda3/envs/benm/lib/python3.6/site-packages/selenium/webdriver/remote/remote_connection.py", line 488, in _request
    resp = self._conn.getresponse()
NameError: name 'NoSuchElementException' is not defined
  File "/Users/markegge/anaconda3/envs/benm/lib/python3.6/http/client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "/Users/markegge/anaconda3/envs/benm/lib/python3.6/site-packages/selenium/webdriver/remote/remote_connection.py", line 464, in execute
    return self._request(command_info[0], url, body=data)
NameError: name 'NoSuchElementException' is not

KeyboardInterrupt: 

  File "<ipython-input-1-730fa538ec37>", line 55, in get_comment
    except NoSuchElementException as ex:
  File "/Users/markegge/anaconda3/envs/benm/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
NameError: name 'NoSuchElementException' is not defined
  File "<ipython-input-1-730fa538ec37>", line 77, in get_comments
    comment_values = get_comment(driver, comment['Document Detail'], comment['Document ID'])
  File "<ipython-input-1-730fa538ec37>", line 55, in get_comment
    except NoSuchElementException as ex:
NameError: name 'NoSuchElementException' is not defined
