# Scraping whatever's in that barrel
This notebook scrapes the data from http://results.thecaucuses.org

Scraping is a little difficult because the data is not in a standard HTML table but in a bunch of nested divs.  Nonetheless, BeautifulSoup made quick work of it.

The result is a table with a two-level (nested) index on each axis.  The columns are nested by candidate and the type of vote tally, and the rows are nested by county and precinct.

Hope you find this useful!

In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from requests import get

In [2]:
# Grab HTML
url = 'https://results.thecaucuses.org/'
# A timeout keeps the cell from hanging forever if the server stops responding.
response = get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page downstream.
response.raise_for_status()

In [3]:
# Turn the downloaded page text into a searchable parse tree
page_source = response.text
html_soup = BeautifulSoup(page_source, features='html.parser')
type(html_soup)

bs4.BeautifulSoup

In [4]:
# Get the header columns indicating the candidate and the type of vote tally.
# Iterating a bs4 tag walks its direct children, so each list holds one text
# entry per child of the matching <ul>.
header = [h.text for h in html_soup.find('ul', class_ = 'thead')]
subhead = [h.text for h in html_soup.find('ul', class_ = 'sub-head')]

# Forward-fill the top-level labels: a blank header cell inherits the most
# recent non-empty label, so every sub-head column is paired with a candidate.
new_header = []
ch = ''  # defined up front so a blank first cell cannot raise NameError (or reuse a stale value on re-run)
for h, _ in zip(header, subhead):
    if h:
        ch = h
    new_header.append(ch)

# Drop the two leading columns (presumably the county / precinct label
# columns -- confirm against the page) so only vote-tally columns remain.
header = new_header[2:]
subhead = subhead[2:]

col_index = pd.MultiIndex.from_arrays([header, subhead], names=('candidate', 'subhead'))
col_index

MultiIndex(levels=[['Bennet', 'Biden', 'Bloomberg', 'Buttigieg', 'Delaney', 'Gabbard', 'Klobuchar', 'Other', 'Patrick', 'Sanders', 'Steyer', 'Uncommitted', 'Warren', 'Yang'], ['Final Expression', 'First Expression', 'SDE']],
           codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 8, 8, 8, 9, 9, 9, 10, 10, 10, 12, 12, 12, 13, 13, 13, 7, 7, 7, 11, 11, 11], [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2]],
           names=['candidate', 'subhead'])

In [5]:
# Collect every div holding one county's block of precinct rows
county_selector = {'class': 'precinct-rows'}
counties = html_soup.find_all('div', attrs=county_selector)

In [6]:
# Pull data for each precinct out, assembling the multiindex as it goes
def _parse_tally(text):
    """Convert one scraped cell to a float; thousands separators are removed and blank cells become NaN."""
    cleaned = text.replace(',', '').strip()
    return float(cleaned) if cleaned else np.nan


ind_county = []
ind_precinct = []
votes = []

for county_div in counties:
    # Each county div must have exactly two children (the name node, then the
    # container of precinct rows); the unpacking raises ValueError otherwise.
    county_name, data = county_div.children
    county_name = county_name.text

    # Distinct loop variable: the original reused `row` here, shadowing the outer loop.
    for precinct_row in data.children:
        dat = [c.text for c in precinct_row]

        ind_county.append(county_name)
        ind_precinct.append(dat[0])  # first cell is the precinct label

        # Remaining cells are the numeric tallies for this precinct.
        votes.append([_parse_tally(d) for d in dat[1:]])

row_index = pd.MultiIndex.from_arrays([ind_county, ind_precinct], names=('county', 'precinct'))


In [7]:
# Assemble the tallies into a frame indexed by (county, precinct) rows
# and (candidate, subhead) columns
results = pd.DataFrame(
    data=votes,
    index=row_index,
    columns=col_index,
)
results

Unnamed: 0_level_0,candidate,Bennet,Bennet,Bennet,Biden,Biden,Biden,Bloomberg,Bloomberg,Bloomberg,Buttigieg,...,Warren,Yang,Yang,Yang,Other,Other,Other,Uncommitted,Uncommitted,Uncommitted
Unnamed: 0_level_1,subhead,First Expression,Final Expression,SDE,First Expression,Final Expression,SDE,First Expression,Final Expression,SDE,First Expression,...,SDE,First Expression,Final Expression,SDE,First Expression,Final Expression,SDE,First Expression,Final Expression,SDE
county,precinct,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Adair,4SE ORIENT,0.0,0.0,0.0,7.0,7.0,0.1569,0.0,0.0,0.0000,6.0,...,0.2353,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000
Adair,1NW ADAIR,0.0,0.0,0.0,6.0,6.0,0.0784,0.0,0.0,0.0000,8.0,...,0.1569,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000
Adair,2NE STUART,0.0,0.0,0.0,6.0,0.0,0.0000,0.0,0.0,0.0000,10.0,...,0.0000,1.0,1.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000
Adair,5GF GREENFIELD,0.0,0.0,0.0,8.0,0.0,0.0000,0.0,0.0,0.0000,10.0,...,0.0000,12.0,13.0,0.1569,0.0,0.0,0.0,0.0,0.0,0.0000
Adair,3SW FONTANELLE,0.0,0.0,0.0,9.0,9.0,0.1569,0.0,0.0,0.0000,0.0,...,0.0000,15.0,15.0,0.3922,0.0,0.0,0.0,0.0,0.0,0.0000
Adair,Total,0.0,0.0,0.0,36.0,22.0,0.3922,0.0,0.0,0.0000,34.0,...,0.3922,28.0,29.0,0.5491,0.0,0.0,0.0,0.0,0.0,0.0000
Adams,Adams 5,0.0,0.0,0.0,4.0,4.0,0.0857,0.0,0.0,0.0000,5.0,...,0.0000,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000
Adams,Adams 2,0.0,0.0,0.0,4.0,4.0,0.0857,0.0,0.0,0.0000,3.0,...,0.1714,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000
Adams,Adams 1,0.0,0.0,0.0,5.0,5.0,0.0857,0.0,0.0,0.0000,7.0,...,0.0857,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000
Adams,Adams 3,0.0,0.0,0.0,11.0,12.0,0.3429,0.0,0.0,0.0000,0.0,...,0.0000,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000


In [8]:
# Write the table to disk with its nested row index intact
nested_csv = 'iowadems_resumts_0205_8PM.csv'
results.to_csv(nested_csv)

# Also write a copy whose county/precinct index levels are ordinary columns
# (in case the nested version is not readable)
flat_csv = 'iowadems_resumts_0205_8PM_rowunwrapped.csv'
results_flat = results.reset_index()
results_flat.to_csv(flat_csv)