## Scraping
The **first time** you run this script, follow the instructions in the cell below to create the database and tables.

For subsequent runs, start with cell #4.

In [None]:
# Create the database and tables.
#
# PostgreSQL 9.x
# Run the statements below manually (e.g. using psql). They are kept here as a
# bare string literal for reference only; this cell does not execute them.
# `\connect` is a psql meta-command, so the script must be run through psql.
# The literal is a raw string so that the backslash in `\connect` is not
# parsed as an (invalid) Python escape sequence.
#
# NOTE: the password below is an example value checked into the notebook;
# replace it with your own secret before running the statements.
r"""
CREATE DATABASE benm
    WITH
    ENCODING = 'UTF8'
    CONNECTION LIMIT = -1;
CREATE USER benmuser WITH PASSWORD 'Ki3nslkj4nb';
GRANT ALL ON DATABASE benm TO benmuser;
\connect benm
ALTER SCHEMA public OWNER TO benmuser;
ALTER DATABASE benm OWNER TO benmuser;
ALTER DEFAULT PRIVILEGES
    FOR USER benmuser
    IN SCHEMA public
    GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO benmuser;
"""


# Create the comments table.
# The query below is destructive: it drops any existing comments table before
# recreating it. It therefore only runs when CREATE_COMMENTS_TABLE is True.
# to do: create the attachments table
import psycopg2

# Safety switch: set to True and run this cell to (re)create the table.
CREATE_COMMENTS_TABLE = False

# NOTE(review): document_id is declared UNIQUE, which presumably already gives
# it an index in PostgreSQL, making idx_document_id redundant -- confirm before
# removing it.
query = """
DROP TABLE IF EXISTS comments CASCADE;
CREATE TABLE comments
(
    id SERIAL PRIMARY KEY,
    document_id VARCHAR UNIQUE,
    tracking_number VARCHAR UNIQUE,
    date_posted DATE,
    retrieved TIMESTAMP,
    has_attachments BOOLEAN DEFAULT FALSE,
    comment TEXT
);
CREATE INDEX idx_document_id ON comments(document_id);
CREATE INDEX idx_date ON comments(date_posted);
"""

if CREATE_COMMENTS_TABLE:
    conn = psycopg2.connect("dbname=benm user=postgres")
    try:
        # `with conn` commits on success and rolls back on error; it does not
        # close the connection, which is why close() sits in `finally`.
        with conn, conn.cursor() as curs:
            curs.execute(query)
            print(curs.statusmessage)
    finally:
        conn.close()


# Create the ignore list table.
# The query below is destructive: it drops any existing ignore_list table
# before recreating it. It therefore only runs when CREATE_IGNORE_LIST_TABLE
# is True.

# Safety switch: set to True and run this cell to (re)create the table.
CREATE_IGNORE_LIST_TABLE = False

# NOTE(review): document_id is declared UNIQUE, which presumably already gives
# it an index in PostgreSQL, making idx_ignore_document_id redundant -- confirm
# before removing it.
query = """
DROP TABLE IF EXISTS ignore_list CASCADE;
CREATE TABLE ignore_list
(
    id SERIAL PRIMARY KEY,
    document_id VARCHAR UNIQUE NOT NULL,
    tracking_number VARCHAR,
    reason TEXT
);
CREATE INDEX idx_ignore_document_id ON ignore_list (document_id);
"""

if CREATE_IGNORE_LIST_TABLE:
    conn = psycopg2.connect("dbname=benm user=postgres")
    try:
        # `with conn` commits on success and rolls back on error; it does not
        # close the connection, which is why close() sits in `finally`.
        with conn, conn.cursor() as curs:
            curs.execute(query)
            print(curs.statusmessage)
    finally:
        conn.close()

In [None]:
# Manually add a comment to the ignore list.
# Fill in the two values below, then run this cell. The row is inserted into
# ignore_list with a parameterized query; database errors are printed rather
# than raised, and the transaction is rolled back when the insert fails.
import psycopg2

# general errors
ignore_document_id = ''  # e.g. 'DOI-2017-0002-99259'
ignore_reason = ''  # e.g. 'Produces unknown error'

query = "INSERT INTO ignore_list (document_id, reason) VALUES (%s, %s)"

if not ignore_document_id:
    # Guard against running the cell with the placeholder still empty, which
    # would otherwise insert a row whose document_id is the empty string.
    print("Set ignore_document_id before running this cell; nothing was inserted.")
else:
    conn = psycopg2.connect("dbname=benm user=postgres")
    try:
        # `with conn` commits on success and rolls back on error; the cursor
        # context manager closes the cursor.
        with conn, conn.cursor() as cur:
            cur.execute(query, (ignore_document_id, ignore_reason))
            print(cur.statusmessage)
    except psycopg2.Error as ex:
        print(ex)
    finally:
        conn.close()