# Assignment 2
**Data Science for Quantitative Finance**  
**Bailey Griswold**  
**September 17, 2017**  

Using BeautifulSoup, scrape the state-level results for both parties and output them in a `df_results`
DataFrame indexed by  
- electiontype: one of ['Caucus', 'Primary'] 
- state: state of the election  
- date: date at which the election took place
- party: one of ['GOP', 'DEM']  

and containing the following columns
- candidate: candidate name
- votes: votes the candidate won
- %votes: %votes the candidate won
- delegates: delegates that candidate won 

In [1]:
# import packages
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from fake_useragent import UserAgent

In [2]:
# website that contains all the state level primary election information
# (POLITICO's 2016 primary results page; it is downloaded once in a later cell)
url = 'http://www.politico.com/mapdata-2016/2016-election/primary/results/map/president/'

In [3]:
# install fake-useragent https://pypi.python.org/pypi/fake-useragent
# (third-party package that provides the UserAgent class imported above)

# UserAgent instance used below to obtain a browser-like User-Agent string
ua = UserAgent()

In [4]:
# preview one Chrome user-agent string produced by fake-useragent
ua.chrome

'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36'

In [5]:
# Request headers carrying a Chrome user-agent string from fake-useragent.
# Note: the value displayed for this cell differs from the one displayed for the
# preceding `ua.chrome` cell, so ua.chrome does not return the same string on
# every access; `header` freezes whichever string was drawn here.
header = {'User-Agent':str(ua.chrome)}
header

{'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17'}

In [6]:
# get the webpage using the fake-useragent header above
# timeout: requests waits indefinitely by default, which would hang the notebook
# if the server never answers
htmlContent = requests.get(url, headers=header, timeout=30)
# fail loudly on a 4xx/5xx response instead of parsing an error page further down
htmlContent.raise_for_status()
# preview only the first 1000 characters of the downloaded HTML
htmlContent.text[:1000]

'<!DOCTYPE html>\n<!--[if lt IE 7]><html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->\n<!--[if IE 7]><html class="no-js lt-ie9 lt-ie8"> <![endif]-->\n<!--[if IE 8]><html class="no-js lt-ie9"> <![endif]-->\n<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->\n\n<head>\n    <meta charset="utf-8" />\n    \n<link rel="stylesheet" type="text/css" href="http://static.politico.com/resource/assets/css/style-core.min.9aafe18e3e5e0097e2a0a0187d47818f.gz.css">\n<link rel="stylesheet" type="text/css" href="http://static.politico.com/resource/assets/css/style-election-2016.min.fe48e4432f8bf5e6002b91e519d2e55d.gz.css">\n<title>2016 Primary Election Results: President Live Map by State, Real-Time Voting Updates - POLITICO</title>\n<meta property="og:title" content="2016 Primary Election Results: President Live Map by State, Real-Time Voting Updates">\n<meta name="description" content="POLITICO&#39;s Live 2016 Primary Election Results and Maps by State, County and District. Includes 2016 

### make a beautiful soup object of downloaded webpage content

In [7]:
# create a BeautifulSoup object that parses the html
soup = BeautifulSoup(htmlContent.text, 'html.parser')
# Show only the beginning of the prettified document: printing the whole page
# floods the notebook output without adding information.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<!--[if lt IE 7]><html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]><html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]><html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <link href="http://static.politico.com/resource/assets/css/style-core.min.9aafe18e3e5e0097e2a0a0187d47818f.gz.css" rel="stylesheet" type="text/css">
   <link href="http://static.politico.com/resource/assets/css/style-election-2016.min.fe48e4432f8bf5e6002b91e519d2e55d.gz.css" rel="stylesheet" type="text/css">
    <title>
     2016 Primary Election Results: President Live Map by State, Real-Time Voting Updates - POLITICO
    </title>
    <meta content="2016 Primary Election Results: President Live Map by State, Real-Time Voting Updates" property="og:title">
     <meta content="POLITICO's Live 2016 Primary Election Results and Maps by State, County and District. Includes 2016 Primary Races f

### parse beautiful soup object for presidential caucus results

In [8]:
# Build df_results_cauc: one row per candidate for every timeline entry whose
# second link reads "Presidential Caucus". Within such an entry each party's
# results are matched to the heading that precedes the party label, and the
# election type and date are read from that heading.
#
# Resulting columns: 0..n (scraped cell texts), party, electiontype, state, date.

# (label text on the page, party code stored in the DataFrame); checked in this order
party_codes = (('Democratic', 'DEM'), ('Republican', 'GOP'))

# Look the articles up once instead of re-scanning the whole document on every
# iteration. Only the first 51 entries are examined, as before
# (presumably 50 states + DC -- TODO confirm against the page).
articles = soup.find_all('article', "timeline-group")[:51]

cauc_frames = []
for article in articles:
    state = article.a.text.strip("\n").strip(" ")
    electiontype = article.find_all("a")[1].text.strip("\n").strip(" ")
    if electiontype != "Presidential Caucus":
        continue

    # One list of rows per results table; each row is
    # [text of the row's first <span>, text of every <td> ...].
    # NOTE: for a winning row the span text carries a "Winner" prefix
    # (e.g. "Winner T. Cruz"); it is kept as scraped.
    out = [
        [
            [row.find('span').get_text()] + [cell.get_text() for cell in row.find_all('td')]
            for row in table.find_all('tr')
        ]
        for table in article.find_all('table')
    ]

    details = article.find_all('div', "title")
    # Start at 1: a party label is paired with the heading just before it, and
    # index 0 has no predecessor (details[-1] would silently wrap to the last one).
    for j in range(1, len(details)):
        for label, party in party_codes:
            if details[j].find_all(text=label) != [label]:
                continue
            heading = details[j - 1]
            # (j - 1) // 2 picks the table for this label; this assumes headings
            # and party labels alternate, one pair per table -- verify against
            # the page markup if the layout changes.
            results = pd.DataFrame(out[(j - 1) // 2])
            results['party'] = party
            results['electiontype'] = heading.find("a").text.strip("\n").strip(" ")
            results['state'] = state
            results['date'] = heading.time['datetime']
            cauc_frames.append(results)

# Concatenate once at the end: growing a DataFrame with .append() in a loop is
# quadratic, and DataFrame.append no longer exists in pandas 2.x.
df_results_cauc = pd.concat(cauc_frames, ignore_index=True) if cauc_frames else pd.DataFrame()

In [9]:
# display the scraped single-party caucus results
df_results_cauc

Unnamed: 0,0,1,2,3,party,electiontype,state,date
0,Winner T. Cruz,36.4%,7973,12,GOP,Presidential Caucus,Alaska,2016-03-01
1,D. Trump,33.5%,7346,11,GOP,Presidential Caucus,Alaska,2016-03-01
2,M. Rubio,15.1%,3318,5,GOP,Presidential Caucus,Alaska,2016-03-01
3,B. Carson,10.9%,2401,,GOP,Presidential Caucus,Alaska,2016-03-01
4,J. Kasich,4.1%,892,,GOP,Presidential Caucus,Alaska,2016-03-01
5,Uncommitted,0.0%,0,,GOP,Presidential Caucus,Alaska,2016-03-01
6,Winner B. Sanders,81.6%,440,14,DEM,Presidential Caucus,Alaska,2016-03-26
7,H. Clinton,18.4%,99,4,DEM,Presidential Caucus,Alaska,2016-03-26
8,R. De La Fuente,0.0%,0,,DEM,Presidential Caucus,Alaska,2016-03-26
9,Uncommitted,0.0%,0,2,DEM,Presidential Caucus,Alaska,2016-03-26


### parse election results labeled "presidential primaries" and "presidential caucuses"
These entries are parsed separately because results for presidential primaries, and for Democratic and Republican caucuses that were held together, are arranged differently on the page than the single-party caucus results parsed above.

In [10]:
# Build df_results_prim: one row per candidate for every timeline entry whose
# second link reads "Presidential Primaries" or "Presidential Caucuses".
# These entries carry a single date and two results tables.
#
# Resulting columns: 0..n (scraped cell texts), party, electiontype, state, date.

combined_types = ("Presidential Primaries", "Presidential Caucuses")

# Look the articles up once instead of re-scanning the whole document on every
# iteration. Only the first 51 entries are examined, as before
# (presumably 50 states + DC -- TODO confirm against the page).
articles = soup.find_all('article', "timeline-group")[:51]

prim_frames = []
for article in articles:
    state = article.a.text.strip("\n").strip(" ")
    electiontype = article.find_all("a")[1].text.strip("\n").strip(" ")
    if electiontype not in combined_types:
        continue

    # Read the date only for entries that are actually used.
    date = article.time['datetime']

    # One list of rows per results table; each row is
    # [text of the row's first <span>, text of every <td> ...].
    # NOTE: for a winning row the span text carries a "Winner" prefix
    # (e.g. "Winner H. Clinton"); it is kept as scraped.
    out = [
        [
            [row.find('span').get_text()] + [cell.get_text() for cell in row.find_all('td')]
            for row in table.find_all('tr')
        ]
        for table in article.find_all('table')
    ]

    # The first table is treated as the Democratic results and the second as the
    # Republican results. Indexing is deliberately strict: an entry with fewer
    # than two tables raises IndexError rather than being mislabelled.
    for party, rows in (('DEM', out[0]), ('GOP', out[1])):
        results = pd.DataFrame(rows)
        results['party'] = party
        results['electiontype'] = electiontype
        results['state'] = state
        results['date'] = date
        prim_frames.append(results)

# Concatenate once at the end: growing a DataFrame with .append() in a loop is
# quadratic, and DataFrame.append no longer exists in pandas 2.x.
df_results_prim = pd.concat(prim_frames, ignore_index=True) if prim_frames else pd.DataFrame()

In [11]:
# display the scraped primaries / joint-caucuses results
df_results_prim

Unnamed: 0,0,1,2,3,party,electiontype,state,date
0,Winner H. Clinton,77.8%,309928,51,DEM,Presidential Primaries,Alabama,2016-03-01
1,B. Sanders,19.2%,76399,9,DEM,Presidential Primaries,Alabama,2016-03-01
2,Uncommitted,2.4%,9534,,DEM,Presidential Primaries,Alabama,2016-03-01
3,M. O'Malley,0.4%,1489,,DEM,Presidential Primaries,Alabama,2016-03-01
4,R. De La Fuente,0.2%,814,,DEM,Presidential Primaries,Alabama,2016-03-01
5,Winner D. Trump,43.4%,371735,36,GOP,Presidential Primaries,Alabama,2016-03-01
6,T. Cruz,21.1%,180608,13,GOP,Presidential Primaries,Alabama,2016-03-01
7,M. Rubio,18.7%,159802,1,GOP,Presidential Primaries,Alabama,2016-03-01
8,B. Carson,10.2%,87517,,GOP,Presidential Primaries,Alabama,2016-03-01
9,J. Kasich,4.4%,37970,,GOP,Presidential Primaries,Alabama,2016-03-01


# Append dataframe of presidential caucuses to dataframe of presidential primaries

In [12]:
# Stack the primaries frame and the caucus frame (primaries first) into df_results.
print("length of primaries dataframe: {}".format(len(df_results_prim)))
print("length of caucus dataframe: {}".format(len(df_results_cauc)))
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
df_results = pd.concat([df_results_prim, df_results_cauc], ignore_index=True)
print("length of results dataframe: {}".format(len(df_results)))
# sanity check: no rows lost or duplicated while combining
assert len(df_results) == len(df_results_prim) + len(df_results_cauc)

length of primaries dataframe: 590
length of caucus dataframe: 134
length of results dataframe: 724


In [13]:
# Give the positional scrape columns (0-3) their descriptive names.
column_names = {0: 'candidate', 1: '%votes', 2: 'votes', 3: 'delegates'}
df_results = df_results.rename(columns=column_names)

In [14]:
# Order rows by the columns that become the index in the next step.
# DataFrame.sort was deprecated (hence the warning this cell used to emit) and
# later removed from pandas; sort_values is its replacement.
df_results = df_results.sort_values(['electiontype', 'state', 'date', 'party'])

  if __name__ == '__main__':


In [15]:
# Index the results by election type, state, date and party.
index_keys = ['electiontype', 'state', 'date', 'party']
df_results = df_results.set_index(index_keys)

In [16]:
# display the final results, indexed by (electiontype, state, date, party)
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,candidate,%votes,votes,delegates
electiontype,state,date,party,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Presidential Caucus,Alaska,2016-03-01,GOP,Winner T. Cruz,36.4%,7973,12
Presidential Caucus,Alaska,2016-03-01,GOP,D. Trump,33.5%,7346,11
Presidential Caucus,Alaska,2016-03-01,GOP,M. Rubio,15.1%,3318,5
Presidential Caucus,Alaska,2016-03-01,GOP,B. Carson,10.9%,2401,
Presidential Caucus,Alaska,2016-03-01,GOP,J. Kasich,4.1%,892,
Presidential Caucus,Alaska,2016-03-01,GOP,Uncommitted,0.0%,0,
Presidential Caucus,Alaska,2016-03-26,DEM,Winner B. Sanders,81.6%,440,14
Presidential Caucus,Alaska,2016-03-26,DEM,H. Clinton,18.4%,99,4
Presidential Caucus,Alaska,2016-03-26,DEM,R. De La Fuente,0.0%,0,
Presidential Caucus,Alaska,2016-03-26,DEM,Uncommitted,0.0%,0,2
