# **Scrape**

Visit: https://www.nrcs.usda.gov/wps/portal/nrcs/detail/national/home/?cid=nrcs143_013697
        
Retrieve each county name from its FIPS code, then match the FIPS codes against statewide_db.csv.

In [1]:
# Dependencies
from bs4 import BeautifulSoup
import pymongo
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
import pandas as pd


# Dependencies
# ----------------------------------
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
from config import ADDRESS,PORTNUM,USERNAME,PW,DBNAMEPC
from sqlalchemy.orm import Session
from sqlalchemy import Table, MetaData, Column, Integer, String, ForeignKey
from sqlalchemy.orm import mapper


-------

**PostgreSQL Auth**

Create the connection

In [2]:
# Postgres address, port, username, password, and database name, all taken
# from the local config module.
POSTGRES_ADDRESS = ADDRESS ## INSERT YOUR DB ADDRESS IF IT'S NOT ON PANOPLY
POSTGRES_PORT = PORTNUM
POSTGRES_USERNAME = USERNAME ## CHANGE THIS TO YOUR PANOPLY/POSTGRES USERNAME
POSTGRES_PASSWORD = PW ## CHANGE THIS TO YOUR PANOPLY/POSTGRES PASSWORD
POSTGRES_DBNAME = DBNAMEPC ## CHANGE THIS TO YOUR DATABASE NAME

# Imported in this cell so the cell stays self-contained.
from urllib.parse import quote

# Connection URL handed to create_engine().
# The username and password are percent-encoded because characters such as
# '@', ':' or '/' inside them would otherwise be read as URL delimiters.
# safe='' makes quote() escape '/' as well, and spaces become %20 (not '+').
postgres_str = (
    'postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'.format(
        username=quote(str(POSTGRES_USERNAME), safe=''),
        password=quote(str(POSTGRES_PASSWORD), safe=''),
        ipaddress=POSTGRES_ADDRESS,
        port=POSTGRES_PORT,
        dbname=POSTGRES_DBNAME,
    )
)


-----

**Splinter**

In [3]:
# executable_path = {'executable_path': 'chromedriver.exe'}
# browser = Browser('chrome', **executable_path, headless=True)

In [4]:
# NRCS page that lists county FIPS codes; its HTML tables are parsed with
# pandas in a later cell, so the splinter browser visit stays disabled.
url = 'https://www.nrcs.usda.gov/wps/portal/nrcs/detail/national/home/?cid=nrcs143_013697'
# # browser.visit(url)

-----

Create dataframe for viewing

In [5]:
# Fetch the page and parse its HTML tables; read_html returns a list of
# DataFrames, one per table found.
table = pd.read_html(url)

In [6]:
# Use the second parsed table (index 1) as the FIPS listing.
# NOTE(review): the index is positional and assumes the page layout is
# unchanged -- confirm if the scrape starts returning different tables.
fips_df_all = table[1]
# Notebook display: shows the type of the selected object.
type(fips_df_all)

pandas.core.frame.DataFrame

In [7]:
# Keep only the rows whose State column is 'CA'.
fips_df_ca = fips_df_all.loc[fips_df_all['State'].eq('CA')]

In [8]:
# fips_df_ca

In [9]:
# Notebook display: number of California rows (58 is expected, one per county).
len(fips_df_ca)

58

In [10]:
# Check that all 58 California counties are listed before uploading.
# The check raises explicitly instead of using a bare `assert`, which is
# skipped when Python runs with -O; the exception type stays AssertionError.
ca_county_count = len(fips_df_ca)
if ca_county_count != 58:
    raise AssertionError(
        f'expected 58 California counties, found {ca_county_count}'
    )
print('Good for upload to db.')

Good for upload to db.


------

# Upload to PostgreSQL

### District Name Num Table

In [11]:
# ORM model for the FIPS -> county-name lookup table.
# Declaring the class on Base registers its table in Base.metadata, which is
# what Base.metadata.create_all(engine) uses to create it.
# ----------------------------------
class FipsCountyNamesNumber(Base):
    """One county row, keyed by FIPS code, stored in ``district_name_num``."""
    __tablename__ = 'district_name_num'
    # FIPS code; primary key of the table.
    fips = Column(Integer, primary_key=True)
    # County name, limited to 30 characters.
    county_name = Column(String(30))
    # Two-character state code (the rows uploaded in this notebook are 'CA').
    state = Column(String(2))
    

In [12]:
# Pull the three upload columns out of the California frame as plain lists.
fips_list, names_list, state_list = (
    fips_df_ca[column].tolist() for column in ('FIPS', 'Name', 'State')
)

In [13]:
# Create Database Connection
# ----------------------------------
# Build the engine from the Postgres URL and open a connection on it.
# NOTE(review): `conn` is not used or closed in the code visible here; the
# uploads go through a Session bound to `engine` -- confirm it is needed.
engine = create_engine(postgres_str)
conn = engine.connect()

In [14]:
# Create the tables declared on Base in the database
# ----------------------------------
# create_all emits CREATE TABLE for every model registered on Base; by default
# it checks for existence first, so tables that already exist are left as is.
Base.metadata.create_all(engine)

In [15]:
# Create a Session Object to Connect to DB
# ----------------------------------
# The session is bound to `engine`; the upload cell below adds and commits
# rows through it.
session = Session(bind=engine)

Loop through the three lists created above and add each set of values as one row, assigning each value to its column.

In [16]:
# Insert one row per county, committing after each row, and stop at the first
# failure.
try:
    for fip, name, states in zip(fips_list, names_list, state_list):
        row = FipsCountyNamesNumber(fips=fip, county_name=name, state=states)
        session.add(row)
        session.commit()
except Exception as e:
    # A failed flush/commit leaves the session unusable until it is rolled
    # back. Rows committed before the failure remain in the database.
    session.rollback()
    print(f'error during upload. check db for partial information: {e}')
    print('===============================')
else:
    # Report success only when every row was committed (previously this was
    # printed twice on success and once even after a failure).
    print('completed upload to db')

completed upload to db
completed upload to db


---------

### Statewide DB 
Clean and import data to postgresql

Import csv data. 
Create table for csv in postgresql.
Import data to table.

In [17]:
# Path to the statewide CSV dataset. Despite the variable name, this file is
# an input: the same path is read with pd.read_csv in the next cell.
output_data_file = "../datasets/statewide_db.csv"

In [18]:
# Create DataFrame from the csv path defined in the previous cell (rather
# than repeating the literal path), then preview the first rows.
statewide_df = pd.read_csv(output_data_file, encoding='utf-8')
statewide_df.head()

Unnamed: 0,COUNTY,FIPS,SVPREC_KEY,SVPREC,ADDIST,CDDIST,SDDIST,BEDIST,TOTREG,DEMREG,...,USSREP03,USSREP04,USSREP05,USSREP06,USSREP07,USSREP08,USSREP09,USSREP10,USSREP11,USSREP12
0,49,6097,060971001,1001,2,5,2,2,230,0,...,0,0,0,0,0,0,0,0,0,0
1,49,6097,060971001A,1001A,2,5,2,2,0,0,...,0,3,1,0,10,3,0,0,10,2
2,49,6097,060971002,1002,2,5,2,2,24,0,...,0,0,0,0,0,0,0,0,0,0
3,49,6097,060971002A,1002A,2,5,2,2,0,0,...,0,0,0,0,0,0,0,0,1,0
4,49,6097,060971006,1006,2,5,2,2,2,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Notebook display: number of rows loaded from the statewide CSV.
len(statewide_df)

44297

In [20]:
# Narrow the frame to the three columns that get uploaded:
# COUNTY, FIPS and CDDIST.
test_df = statewide_df.loc[:, ['COUNTY', 'FIPS', 'CDDIST']]
# test_df

In [21]:
# Turn each uploaded column into a plain Python list.
# Note that this rebinds fips_list, replacing the county-lookup list that was
# built earlier in the notebook.
county_list, fips_list, CDDIST_list = (
    test_df[column].tolist() for column in ('COUNTY', 'FIPS', 'CDDIST')
)


In [22]:
# Create list of values for CDDIST
# NOTE(review): this repeats the CDDIST_list assignment already made in the
# previous cell; the result is the same list contents.
CDDIST_list = test_df['CDDIST'].tolist()

In [23]:
# ORM model for the statewide rows uploaded from the CSV.
# Declaring the class on Base registers its table in Base.metadata, which is
# what Base.metadata.create_all(engine) uses to create it.
# ----------------------------------
class CongressTable(Base):
    """One statewide row (county number, FIPS, CDDIST) stored in ``statewide_db``."""
    __tablename__ = 'statewide_db'
    # Integer primary key; the upload loop never supplies a value for it.
    _id = Column(Integer, primary_key=True)
    # Filled from the COUNTY column of the CSV.
    county_num = Column(Integer)
    # Filled from the FIPS column; foreign key to district_name_num.fips.
    fips = Column(Integer,ForeignKey('district_name_num.fips') )
    # Filled from the CDDIST column of the CSV.
    cddist = Column(Integer) 

In [24]:
# Create Database Connection
# ----------------------------------
# Build the engine from the Postgres URL and open a connection on it.
# NOTE(review): this repeats the engine/connection setup from the earlier
# upload section; `engine` and `conn` are rebound and the earlier connection
# is not closed first -- confirm whether a second engine is intended.
engine = create_engine(postgres_str)
conn = engine.connect()

In [25]:
# Create the tables declared on Base in the database
# ----------------------------------
# Run again here so the table for CongressTable, declared after the first
# create_all call, is created; by default existing tables are left as is.
Base.metadata.create_all(engine)

In [26]:
# Create a Session Object to Connect to DB
# ----------------------------------
# Rebinds `session` to a new Session on the current `engine`; the statewide
# upload loop below adds and commits rows through it.
session = Session(bind=engine)

In [27]:
# Send data to postgresql, committing one row at a time. A failing row is
# reported and skipped so that the remaining rows are still attempted.
# NOTE: one commit per row is slow for tens of thousands of rows, but it is
# what keeps each row's failure independent of the others.
failed_rows = 0
for county, fip, cddists in zip(county_list, fips_list, CDDIST_list):
    try:
        row = CongressTable(county_num=county, fips=fip, cddist=cddists)
        session.add(row)
        session.commit()
    except Exception as e:
        # A failed flush/commit leaves the session unusable until it is
        # rolled back; without this, every later row would fail as well.
        session.rollback()
        failed_rows += 1
        print(f'error during upload. check db for partial information: {e}')
        print('===============================')

if failed_rows:
    print(f'{failed_rows} row(s) could not be uploaded')
print('completed upload to db')

completed upload to db


----------------