# **Scrape**

Visit: https://statewidedatabase.org/d10/p16.html
        
Retrieve County name and number from each thead. 

In [1]:
# Dependencies
from bs4 import BeautifulSoup
import pymongo
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
import pandas as pd


# Dependencies
# ----------------------------------
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
from config import ADDRESS,PORTNUM,USERNAME,PW,DBNAMEPC
from sqlalchemy.orm import Session
from sqlalchemy import Table, MetaData, Column, Integer, String, ForeignKey
from sqlalchemy.orm import mapper


-------

**PostgreSQL Auth**

Create the connection

In [2]:
# Postgres username, password, and database name
POSTGRES_ADDRESS = ADDRESS ## INSERT YOUR DB ADDRESS IF IT'S NOT ON PANOPLY
POSTGRES_PORT = PORTNUM
POSTGRES_USERNAME = USERNAME ## CHANGE THIS TO YOUR PANOPLY/POSTGRES USERNAME
POSTGRES_PASSWORD = PW ## CHANGE THIS TO YOUR PANOPLY/POSTGRES PASSWORD
POSTGRES_DBNAME = DBNAMEPC ## CHANGE THIS TO YOUR DATABASE NAME

# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
    .format(username=POSTGRES_USERNAME,
    password=POSTGRES_PASSWORD,
    ipaddress=POSTGRES_ADDRESS,
    port=POSTGRES_PORT,
    dbname=POSTGRES_DBNAME))


-----

**Splinter**

In [3]:
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=True)

In [4]:
url = 'https://statewidedatabase.org/d10/p16.html'
browser.visit(url)

-----

In [5]:
# HTML object
html = browser.html
# Instantiate a BeautifulSoup() object with our `html` and the `html5lib` parser
soup = BeautifulSoup(html, 'html5lib')

# Article holding pics
tbody = soup.find('tbody')

In [6]:
# tr holds all a:text which also holds county number
tr = tbody.find_all('a', class_='county-num')

In [7]:
# Empty list for loop
county_number_list = []

# Find County Number
for t in tr:
    # County number: County 001
    county_number = t.next_element
    
    # initializing sub list. Remove the str 'COUNTY ' 
    sub_list = ["COUNTY "] 
    
    # For item in list, remove string found in sub_list and replace with what is left over.
    for sub in sub_list: 
        county_str = county_number.replace(sub, ' ') 
        res = " ".join(county_str.split())
    
    # Add results to list
    county_number_list.append(res)
# When finished, print list
# print(county_number_list)

In [8]:
# tbody holds all thead which has city name. county number is also here but within anchor text
tbody = soup.find('tbody')
th = tbody.find_all('th')

# Empty name list for loop
county_name_list = []

# Loop through headers and grab city name while avoiding anchor
for t in th:
    th = t.next_element.next_element.next_element.next_element
    county_name_list.append(th)

# Scraping included a row we didnt need. Removing.
remove_first_item_in_list = list(county_name_list.pop(0))
# print(county_name_list)
browser.quit()

In [9]:
# Combine list to dict for dataframe cleaning
county_num_name_dict = dict(zip(county_number_list, county_name_list))

Create dataframe for viewing

In [10]:
df_county_name_num = pd.DataFrame(columns=['county', 'county_name'])
df_county_name_num['county'] = county_num_name_dict.keys()
df_county_name_num['county_name'] = county_num_name_dict.values()
# df_county_name_num

In [11]:
# Check if all 58 counties are listed
assert len(df_county_name_num) == 58;
print('Good for upload to db.')

Good for upload to db.


------

# Upload to PostgreSQL

In [12]:
# TEMPLATE: CountNamesNumber template to upload to specific table in db
# Create CountNamesNumber Classes
# Creates table with column names
# ----------------------------------
class CountyNamesNumber(Base):
    __tablename__ = 'county_names'
    county = Column(Integer)
    county_name = Column(String(30), primary_key=True)
    

-------

In [13]:
# TEMPLATE: CountNamesNumber template to upload to specific table in db
# Create CountNamesNumber Classes
# Creates table with column names
# ----------------------------------
# class testNamesNumber(Base):
#     __tablename__ = 'test_table'
#     COUNTY = Column(Integer, primary_key=True)
#     county_name = Column(String(30))

In [17]:
# test_row = testNamesNumber(COUNTY=49)
# session.add(test_row)
# session.commit()

-------

In [13]:
# Create Database Connection
# ----------------------------------
# create engine to postgres connection
engine = create_engine(postgres_str)
conn = engine.connect()

In [14]:
# Create a "Metadata" Layer That Abstracts the SQL Database
# ----------------------------------
Base.metadata.create_all(engine)

In [15]:
# Create a Session Object to Connect to DB
# ----------------------------------
session = Session(bind=engine)

Loop through both list created. Assign value to be added individually to specified column

In [16]:
try:
    for key, value in zip(county_number_list, county_name_list):
    #     print(f'{key},{value}')
        row = CountyNamesNumber(county=key, county_name=value)
        session.add(row)
        session.commit()
    print('completed upload to db')
except:
    print('error during upload. check db for partial information.')

completed upload to db


---------

# Statewide DB 
Clean and import data to postgresql

Import csv data. 
Create table for csv in postgresql.
Import data to table.

In [17]:
# Output File (CSV)
output_data_file = "../datasets/statewide_db.csv"

In [18]:
# Create DataFrame from csv
statewide_df = pd.read_csv('../datasets/statewide_db.csv', encoding='utf-8')
statewide_df.head()

Unnamed: 0,COUNTY,FIPS,SVPREC_KEY,SVPREC,ADDIST,CDDIST,SDDIST,BEDIST,TOTREG,DEMREG,...,USSREP03,USSREP04,USSREP05,USSREP06,USSREP07,USSREP08,USSREP09,USSREP10,USSREP11,USSREP12
0,49,6097,060971001,1001,2,5,2,2,230,0,...,0,0,0,0,0,0,0,0,0,0
1,49,6097,060971001A,1001A,2,5,2,2,0,0,...,0,3,1,0,10,3,0,0,10,2
2,49,6097,060971002,1002,2,5,2,2,24,0,...,0,0,0,0,0,0,0,0,0,0
3,49,6097,060971002A,1002A,2,5,2,2,0,0,...,0,0,0,0,0,0,0,0,1,0
4,49,6097,060971006,1006,2,5,2,2,2,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
len(test_df)

44297

In [19]:
# Select county and cddist columns
test_df = statewide_df[['COUNTY', 'CDDIST']]
# test_df

In [20]:
# Create list of values for COUNTY
county_list = test_df['COUNTY'].tolist()

In [21]:
# Create list of values for CDDIST
CDDIST_list = test_df['CDDIST'].tolist()

In [22]:
# TEMPLATE: CongressTable template to upload to specific table in db
# Create CongressTable Classes
# Creates table with column names
# ----------------------------------
class CongressTable(Base):
    __tablename__ = 'statewide_db'
    _id = Column(Integer, primary_key=True)
    COUNTY = Column(Integer, ForeignKey('county_names.county'))
    CDDIST = Column(Integer)
    

In [23]:
# Create Database Connection
# ----------------------------------
# create engine to postgres connection
engine = create_engine(postgres_str)
conn = engine.connect()

In [24]:
# Create a "Metadata" Layer That Abstracts the SQL Database
# ----------------------------------
Base.metadata.create_all(engine)

In [25]:
# Create a Session Object to Connect to DB
# ----------------------------------
session = Session(bind=engine)

In [26]:
# Send date to postgresql

for key, value in zip(county_list, CDDIST_list):
    try:
#     print(f'{key},{value}')
        row = CongressTable(COUNTY=key, CDDIST=value)
        session.add(row)
        session.commit()
        

    except Exception as e:
        print(f'error during upload. check db for partial information: {e}')
        print('===============================')
        
print('completed upload to db')

error during upload. check db for partial information: (psycopg2.errors.ForeignKeyViolation) insert or update on table "statewide_db" violates foreign key constraint "statewide_db_COUNTY_fkey"
DETAIL:  Key (COUNTY)=(48) is not present in table "county_names".

[SQL: INSERT INTO statewide_db ("COUNTY", "CDDIST") VALUES (%(COUNTY)s, %(CDDIST)s) RETURNING statewide_db._id]
[parameters: {'COUNTY': 48, 'CDDIST': 5}]
(Background on this error at: http://sqlalche.me/e/gkpj)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViolation) insert or update on table "statewide_db" violates foreign key constraint "statewide_db_COUNTY_fkey"
DETAIL:  Key (COUNTY)=(48) is not present in table "county_names".

[SQL: INSERT INTO statewide_db ("COUNTY", "CDDIST") VALUES (%(COUNTY)s, %(C

(Background on this error at: http://sqlalche.me/e/gkpj) (Background on this error at: http://sqlalche.me/e/7s2a)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViolation) insert or update on table "statewide_db" violates foreign key constraint "statewide_db_COUNTY_fkey"
DETAIL:  Key (COUNTY)=(48) is not present in table "county_names".

[SQL: INSERT INTO statewide_db ("COUNTY", "CDDIST") VALUES (%(COUNTY)s, %(CDDIST)s) RETURNING statewide_db._id]
[parameters: {'COUNTY': 48, 'CDDIST': 5}]
(Background on this error at: http://sqlalche.me/e/gkpj) (Background on this error at: http://sqlalche.me/e/7s2a)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new trans

(Background on this error at: http://sqlalche.me/e/gkpj) (Background on this error at: http://sqlalche.me/e/7s2a)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViolation) insert or update on table "statewide_db" violates foreign key constraint "statewide_db_COUNTY_fkey"
DETAIL:  Key (COUNTY)=(48) is not present in table "county_names".

[SQL: INSERT INTO statewide_db ("COUNTY", "CDDIST") VALUES (%(COUNTY)s, %(CDDIST)s) RETURNING statewide_db._id]
[parameters: {'COUNTY': 48, 'CDDIST': 5}]
(Background on this error at: http://sqlalche.me/e/gkpj) (Background on this error at: http://sqlalche.me/e/7s2a)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new trans

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViolation) insert or update on table "statewide_db" violates foreign key constraint "statewide_db_COUNTY_fkey"
DETAIL:  Key (COUNTY)=(48) is not present in table "county_names".

[SQL: INSERT INTO statewide_db ("COUNTY", "CDDIST") VALUES (%(COUNTY)s, %(CDDIST)s) RETURNING statewide_db._id]
[parameters: {'COUNTY': 48, 'CDDIST': 5}]
(Background on this error at: http://sqlalche.me/e/gkpj) (Background on this error at: http://sqlalche.me/e/7s2a)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViola

(Background on this error at: http://sqlalche.me/e/gkpj) (Background on this error at: http://sqlalche.me/e/7s2a)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViolation) insert or update on table "statewide_db" violates foreign key constraint "statewide_db_COUNTY_fkey"
DETAIL:  Key (COUNTY)=(48) is not present in table "county_names".

[SQL: INSERT INTO statewide_db ("COUNTY", "CDDIST") VALUES (%(COUNTY)s, %(CDDIST)s) RETURNING statewide_db._id]
[parameters: {'COUNTY': 48, 'CDDIST': 5}]
(Background on this error at: http://sqlalche.me/e/gkpj) (Background on this error at: http://sqlalche.me/e/7s2a)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new trans

error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViolation) insert or update on table "statewide_db" violates foreign key constraint "statewide_db_COUNTY_fkey"
DETAIL:  Key (COUNTY)=(48) is not present in table "county_names".

[SQL: INSERT INTO statewide_db ("COUNTY", "CDDIST") VALUES (%(COUNTY)s, %(CDDIST)s) RETURNING statewide_db._id]
[parameters: {'COUNTY': 48, 'CDDIST': 5}]
(Background on this error at: http://sqlalche.me/e/gkpj) (Background on this error at: http://sqlalche.me/e/7s2a)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViola

error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViolation) insert or update on table "statewide_db" violates foreign key constraint "statewide_db_COUNTY_fkey"
DETAIL:  Key (COUNTY)=(48) is not present in table "county_names".

[SQL: INSERT INTO statewide_db ("COUNTY", "CDDIST") VALUES (%(COUNTY)s, %(CDDIST)s) RETURNING statewide_db._id]
[parameters: {'COUNTY': 48, 'CDDIST': 5}]
(Background on this error at: http://sqlalche.me/e/gkpj) (Background on this error at: http://sqlalche.me/e/7s2a)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViola

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViolation) insert or update on table "statewide_db" violates foreign key constraint "statewide_db_COUNTY_fkey"
DETAIL:  Key (COUNTY)=(48) is not present in table "county_names".

[SQL: INSERT INTO statewide_db ("COUNTY", "CDDIST") VALUES (%(COUNTY)s, %(CDDIST)s) RETURNING statewide_db._id]
[parameters: {'COUNTY': 48, 'CDDIST': 5}]
(Background on this error at: http://sqlalche.me/e/gkpj) (Background on this error at: http://sqlalche.me/e/7s2a)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViola

error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViolation) insert or update on table "statewide_db" violates foreign key constraint "statewide_db_COUNTY_fkey"
DETAIL:  Key (COUNTY)=(48) is not present in table "county_names".

[SQL: INSERT INTO statewide_db ("COUNTY", "CDDIST") VALUES (%(COUNTY)s, %(CDDIST)s) RETURNING statewide_db._id]
[parameters: {'COUNTY': 48, 'CDDIST': 5}]
(Background on this error at: http://sqlalche.me/e/gkpj) (Background on this error at: http://sqlalche.me/e/7s2a)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViola

(Background on this error at: http://sqlalche.me/e/gkpj) (Background on this error at: http://sqlalche.me/e/7s2a)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (psycopg2.errors.ForeignKeyViolation) insert or update on table "statewide_db" violates foreign key constraint "statewide_db_COUNTY_fkey"
DETAIL:  Key (COUNTY)=(48) is not present in table "county_names".

[SQL: INSERT INTO statewide_db ("COUNTY", "CDDIST") VALUES (%(COUNTY)s, %(CDDIST)s) RETURNING statewide_db._id]
[parameters: {'COUNTY': 48, 'CDDIST': 5}]
(Background on this error at: http://sqlalche.me/e/gkpj) (Background on this error at: http://sqlalche.me/e/7s2a)
error during upload. check db for partial information: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new trans

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [27]:
# # Send date to postgresql
# try:
#     for key, value in zip(county_list, CDDIST_list):
#     #     print(f'{key},{value}')
#         row = CongressTable(COUNTY=key, CDDIST=value)
#         session.add(row)
#         session.commit()
#     print('completed upload to db')
# except Exception as e:
#     print(f'error during upload. check db for partial information: {e}')

----------------