# ETL Project

**Objective:** Gather data sources, transform and load into a database. 
Data involves how much campaign contributions each candidate received and success of campaign. Seeking information into spending contribution on post election results specifically on 2016


**Team:** Inquisitive Otus<br>
**Team Members:** Claudia Flores, Sheng Le, Christian Pompa

In [37]:
# Dependencies
from bs4 import BeautifulSoup
import pymongo
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
import pandas as pd
import requests
import json
from pprint import pprint

# SQL Dependencies
# ----------------------------------
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
from config import api_key, username, password, ipaddress, port, dbname
from sqlalchemy.orm import Session
from sqlalchemy import Table, MetaData, Column, Integer, String, ForeignKey
from sqlalchemy.orm import mapper

-------

### **Scrape**

Visit: https://www.nrcs.usda.gov/wps/portal/nrcs/detail/national/home/?cid=nrcs143_013697
        
Retrieve County name from FIPS. Match FIPS with statewide_db.csv

**PostgreSQL Auth**

Create the connection

In [38]:
# A long string that contains the necessary Postgres login information
postgres_str = (f'postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}')

-----

**Splinter**

In [39]:
# executable_path = {'executable_path': 'chromedriver.exe'}
# browser = Browser('chrome', **executable_path, headless=True)

In [40]:
url = 'https://www.nrcs.usda.gov/wps/portal/nrcs/detail/national/home/?cid=nrcs143_013697'
# # browser.visit(url)

-----

Create dataframe for viewing

In [41]:
table = pd.read_html(url)

In [42]:
fips_df_all = table[1]
type(fips_df_all)

pandas.core.frame.DataFrame

In [43]:
fips_df_ca = fips_df_all.loc[fips_df_all['State'] == 'CA']

In [44]:
# fips_df_ca

In [45]:
len(fips_df_ca)

58

In [46]:
# Check if all 58 counties are listed
assert len(fips_df_ca) == 58;
print('Good for upload to db.')

Good for upload to db.


------

## Upload to PostgreSQL

### District Name Num Table

In [47]:
# TEMPLATE: CountNamesNumber template to upload to specific table in db
# Create CountNamesNumber Classes
# Creates table with column names
# ----------------------------------
class FipsCountyNamesNumber(Base):
    __tablename__ = 'district_name_num'
    fips = Column(Integer, primary_key=True)
    county_name = Column(String(30))
    state = Column(String(2))
    

In [48]:
fips_list = fips_df_ca['FIPS'].tolist()
names_list = fips_df_ca['Name'].tolist()
state_list = fips_df_ca['State'].tolist()

In [49]:
# Create Database Connection
# ----------------------------------
# create engine to postgres connection
engine = create_engine(postgres_str)
conn = engine.connect()

In [50]:
# Create a "Metadata" Layer That Abstracts the SQL Database
# ----------------------------------
Base.metadata.create_all(engine)

In [51]:
# Create a Session Object to Connect to DB
# ----------------------------------
session = Session(bind=engine)

Loop through both list created. Assign value to be added individually to specified column

In [52]:
try:
    for fip, name, states in zip(fips_list, names_list, state_list):
#         print(f'{fip},{name},{state}')
        row = FipsCountyNamesNumber(fips=fip, county_name=name, state=states)
        session.add(row)
        session.commit()
    print('completed upload to db')
    
except Exception as e:
        print(f'error during upload. check db for partial information: {e}')
        print('===============================')
        
print('completed upload to db')

completed upload to db
completed upload to db


----------

### Election Table

In [53]:
# Specify the URL
elections_url = "https://api.open.fec.gov/v1/elections/?"

In [54]:
# Create districts list
districts=[]

# Loop through integers to append into the list
for n in range(1, 54):
    i=str(n).zfill(2)
    districts.append(i)
    n+=1
    
# Print the list
print(districts)

['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53']


In [55]:
# Set params in URL
sort_null_only="true"
office="house"
state="CA"
per_page="34"
sort_nulls_last="true"
sort_hide_null="true"
cycle="2016"
sort="-total_receipts"
hide_null="true"
page=1
election_full="true"

In [56]:
# Create an empty dataframe
combined_df=pd.DataFrame()

# Create empty lists
committee_ids=[]
cash_on_hand_end_period=[]
candidate_pcc_id=[]
total_receipts=[]
coverage_end_date=[]
candidate_election_year=[]
candidate_name=[]
total_disbursements=[]
party_full=[]
candidate_id=[]

In [57]:
# Loop through all districts ID in districts
for district in districts:
    
    # Define params
    params= {
        "sort_null_only":sort_null_only,
        "office":office,
        "state":state,
        "per_page":per_page,
        "sort_nulls_last":sort_nulls_last,
        "sort_hide_null":sort_hide_null,
        "cycle":cycle,
        "sort":sort,
        "hide_null":hide_null,
        "page":page,
        "election_full":election_full,
        "api_key":api_key,
        "district":district
    }

    # Run request in JSON
    response=requests.get(elections_url,params=params).json()
    
    # Set variable to represent only results from the API
    results=response["results"]
    
    # Loop through every result in results
    for result in results:

        # Try to add information from results to the empty list
        try:
            committee_ids.append(result["committee_ids"][0])
            cash_on_hand_end_period.append(result["cash_on_hand_end_period"])
            candidate_pcc_id.append(result["candidate_pcc_id"])
            total_receipts.append(result["total_receipts"])
            coverage_end_date.append(result["coverage_end_date"])
            candidate_election_year.append(result["candidate_election_year"])
            candidate_name.append(result["candidate_name"])
            total_disbursements.append(result["total_disbursements"])
            party_full.append(result["party_full"])
            candidate_id.append(result["candidate_id"])

        # Otherwise return none
        except:
            None

    # Create a dataframe to show all results
    election_df = pd.DataFrame({
        "Committee ID": committee_ids,
        "Cash on Hand End Period": cash_on_hand_end_period,
        "Candidate PCC ID":candidate_pcc_id,
        "Total Receipts": total_receipts,
        "End Date": coverage_end_date,
        "Election Year": candidate_election_year,
        "Candidate Name": candidate_name,
        "Total Disbursements": total_disbursements,
        "Party": party_full,
        "Candidate ID": candidate_id,
        "district":district
    })
    
    # Concat each election_df for one district into a combined dataframe with all districts
    combined_df=pd.concat([combined_df, election_df], ignore_index=True)

In [58]:
district_no= combined_df['district'].to_list()

### Save query to json

In [59]:
# Save query to json
with open('../datasets/election_api_query.json', 'w') as fp:
      json.dump(election_df.to_dict(), fp)

In [60]:
# Print the dataframe
combined_df

Unnamed: 0,Committee ID,Cash on Hand End Period,Candidate PCC ID,Total Receipts,End Date,Election Year,Candidate Name,Total Disbursements,Party,Candidate ID,district
0,C00509422,89475.60,C00509422,800277.02,2016-12-31T00:00:00,2016,"LAMALFA, DOUG",808869.52,REPUBLICAN PARTY,H2CA02142,01
1,C00608265,0.00,C00608265,183682.00,2016-12-31T00:00:00,2016,"MONTES, JOSEPH",183682.00,REPUBLICAN PARTY,H6CA01186,01
2,C00609958,3592.42,C00609958,138508.99,2016-12-31T00:00:00,2016,"REED, JAMES E",134916.57,DEMOCRATIC PARTY,H6CA01194,01
3,C00605535,0.00,C00605535,27709.03,2016-05-10T00:00:00,2016,"WRIGHT, DOUGLAS A.",27709.03,REPUBLICAN PARTY,H6CA01178,01
4,C00615104,0.00,C00615104,0.00,,2016,"OXLEY, GARY ALLEN",0.00,REPUBLICAN PARTY,H2CA01151,01
...,...,...,...,...,...,...,...,...,...,...,...
4874,C00546861,0.00,C00610576,517.93,2015-04-05T00:00:00,2016,"SIMON, FRED J JR MD",790.44,REPUBLICAN PARTY,H4CA52077,53
4875,C00344671,255984.17,C00344671,485051.35,2016-12-31T00:00:00,2016,"DAVIS, SUSAN",470870.68,DEMOCRATIC PARTY,H0CA49055,53
4876,C00575860,1646.88,C00575860,122886.07,2016-12-31T00:00:00,2016,"VELTMEYER, JAMES",121239.19,REPUBLICAN PARTY,H6CA53054,53
4877,C00573915,0.00,C00573915,35278.00,2016-07-11T00:00:00,2016,"ASH, JAMES",35278.00,REPUBLICAN PARTY,H6CA53047,53


## Upload to PostgreSQL

In [61]:
# TEMPLATE: ElectionTable template to upload to specific table in db
# Create ElectionTable Classes
# Creates table with column names
# ----------------------------------
class ElectionTable(Base):
    _id = Column(Integer, primary_key=True)
    __tablename__ = 'election_table'
    Committee_ID = Column(String(30))
    Cash_on_Hand_End_Period = Column(String(30))
    Candidate_PCC_ID = Column(String(30))
    Total_Receipts = Column(String(30))
    End_Date = Column(String(30))
    Election_Year = Column(Integer)
    Candidate_Name = Column(String(30))
    Total_Disbursements = Column(String(30))
    Party = Column(String(30))
    Candidate_ID = Column(String(30))
    district = Column(Integer)

In [62]:
# Create Database Connection
# ----------------------------------
# create engine to postgres connection
engine = create_engine(postgres_str)
conn = engine.connect()

# Create a "Metadata" Layer That Abstracts the SQL Database
# ----------------------------------
Base.metadata.create_all(engine)

# Create a Session Object to Connect to DB
# ----------------------------------
session = Session(bind=engine)

In [63]:
# Send date to postgresql

for committee, cash, candidate, total_r, coverage_end, candidate_election, candidate_n, total_dis, party_f, candidate_i, dis_no in zip(committee_ids, cash_on_hand_end_period, candidate_pcc_id, total_receipts, coverage_end_date, candidate_election_year, candidate_name, total_disbursements, party_full, candidate_id, district_no):
    try:
#     print(f'{key},{value}')
        row = ElectionTable(
            Committee_ID=committee, 
            Cash_on_Hand_End_Period=cash, 
            Candidate_PCC_ID = candidate,
            Total_Receipts = total_r,
            End_Date = coverage_end,
            Election_Year = candidate_election,
            Candidate_Name = candidate_n,
            Total_Disbursements = total_dis,
            Party = party_f,
            Candidate_ID = candidate_i,
            district = dis_no,
            )
        session.add(row)
        session.commit()
        

    except Exception as e:
        print(f'error during upload. check db for partial information: {e}')
        print('===============================')
        
print('completed upload to db')

completed upload to db


---------

### Statewide DB 
Clean and import data to postgresql

Import csv data. 
Create table for csv in postgresql.
Import data to table.

In [64]:
# Output File (CSV)
output_data_file = "../datasets/statewide_db.csv"

In [65]:
# Create DataFrame from csv
statewide_df = pd.read_csv('../datasets/statewide_db.csv', encoding='utf-8')
statewide_df.head()

Unnamed: 0,COUNTY,FIPS,SVPREC_KEY,SVPREC,ADDIST,CDDIST,SDDIST,BEDIST,TOTREG,DEMREG,...,USSREP03,USSREP04,USSREP05,USSREP06,USSREP07,USSREP08,USSREP09,USSREP10,USSREP11,USSREP12
0,49,6097,060971001,1001,2,5,2,2,230,0,...,0,0,0,0,0,0,0,0,0,0
1,49,6097,060971001A,1001A,2,5,2,2,0,0,...,0,3,1,0,10,3,0,0,10,2
2,49,6097,060971002,1002,2,5,2,2,24,0,...,0,0,0,0,0,0,0,0,0,0
3,49,6097,060971002A,1002A,2,5,2,2,0,0,...,0,0,0,0,0,0,0,0,1,0
4,49,6097,060971006,1006,2,5,2,2,2,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
len(statewide_df)

44297

In [67]:
# Select county and cddist columns
test_df = statewide_df[['COUNTY', 'FIPS', 'CDDIST']]
# test_df

In [68]:
# Create list of values for COUNTY
county_list = test_df['COUNTY'].tolist()
fips_list = test_df['FIPS'].tolist()
CDDIST_list = test_df['CDDIST'].tolist()


In [69]:
# Create list of values for CDDIST
CDDIST_list = test_df['CDDIST'].tolist()

## Upload to PostgreSQL

In [70]:
# TEMPLATE: CongressTable template to upload to specific table in db
# Create CongressTable Classes
# Creates table with column names
# ----------------------------------
class CongressTable(Base):
    __tablename__ = 'statewide_db'
    _id = Column(Integer, primary_key=True)
    county_num = Column(Integer)
    fips = Column(Integer,ForeignKey('district_name_num.fips') )
    cddist = Column(Integer) 

In [71]:
# Create Database Connection
# ----------------------------------
# create engine to postgres connection
engine = create_engine(postgres_str)
conn = engine.connect()

# Create a "Metadata" Layer That Abstracts the SQL Database
# ----------------------------------
Base.metadata.create_all(engine)

# Create a Session Object to Connect to DB
# ----------------------------------
session = Session(bind=engine)

In [72]:
# Send date to postgresql

for county, fip, cddists in zip(county_list, fips_list, CDDIST_list):
    try:
#     print(f'{key},{value}')
        row = CongressTable(county_num=county, fips=fip, cddist=cddists)
        session.add(row)
        session.commit()
        

    except Exception as e:
        print(f'error during upload. check db for partial information: {e}')
        print('===============================')
        
print('completed upload to db')

completed upload to db


----------------