# Importing Packages

In [153]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

# Collecting County Names

In [90]:
# using the dates of the 2016 and 2020 elections for the house of representatives
# november 3rd, 2020
url_2020 = 'https://vt.ncsbe.gov/RegStat/Results/?date=11%2F03%2F2020'
# novembr 8th, 2020
url_2016 = 'https://vt.ncsbe.gov/RegStat/Results/?date=11%2F08%2F2016'
def get_county_names(url):
    # empty list to put names in 
    names = []
    driver = webdriver.Chrome()
    driver.get(url)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'html.parser')
    # this gets all the elements with the county name
    for county in soup.find_all('a', target="_blank"):
        names.append(county.text)
    driver.close()
    return(names)

counties_2020 = get_county_names(url_2020)
counties_2016 = get_county_names(url_2016)
# checking if counties have changed since 
counties_2016==counties_2020

True

Counties are idenical in the provided vote registration data for 2016 and 2020 on the election dates, so the same list can be used in the data frame later.

In [91]:
len(counties_2016)

101

There 100 counties in North Carolina, with an extra measure added for the total numbers for the whole state.

# Collecting Voter Registration Information for Each County
Following information will be collected for each county: population of registered voters in the Constitution, Democratic, Green, Justice for All, Libertarian, No Labels, Republican, Unaffiliated, and We the People political parties. The number of people in White, Black, American Indian/Alaska Native, Native Hawaiian/Pacific Islander, Hispanic, and Other racial groups will also be colleted. The number of female and male 

In [150]:
def get_registration_info(url, county_list):
    # making empty dictionary with keys to ensure the counties allign with the right name
    registration_info = {county: None for county in county_list}
    # list of categories we are collecting , used for scraping
    categories = ['Total', 'Female', 'Male', 'White', 'Black', 'AmericanIndian', 'NativeHawaiian', 'Hispanic', 'Other', 'Democrats', 'Republicans', 'Unaffiliated', 'Green', 'Libertarians', 'Constitution', 'JusticeForAll', 'NoLabels', 'WeThePeople']
    # list of category labels for the dictionary
    category_labels = ['total', 'female', 'male', 'white', 'black', 'american_indian/alaska_native', 'native_hawaiian/pacific_islander', 'hispanic', 'other_race', 'democrat', 'republican', 'unaffiliated', 'green', 'libertarian', 'constitution', 'justice_for_all', 'no_labels', 'we_the_people'] 
    
    # getting all information from webpage to parse through 
    driver = webdriver.Chrome()
    driver.get(url)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'html.parser')
    
    # iterating through all counties again, but this time collecting all of the information
    #for county in soup.find_all('tr', class_="ui-ig-altrecord ui-iggrid-altrecord"):
    for county in soup.find_all('tr', attrs={'role':"row"})[1:]:
        county_name = county.find('a', attrs={'target': '_blank'}).text
        # empty dict for registration info 
        county_info = {}
        for index, category in enumerate(categories):
            if county_name == 'Totals':
                attr = 'gridGrandTotal_' + category
            else:
                attr = 'gridRegStat_' + category
            stat = county.find('td', attrs={'aria-describedby': attr}).text
            category_label = category_labels[index]
            county_info[category_label] = stat
        registration_info[county_name] = county_info
    driver.close()
    return(registration_info)


info_2020 = get_registration_info(url_2020, counties_2016)
info_2016  = get_registration_info(url_2016, counties_2016)

# Compiling Data into a Dataframe
Data will be organized so that each row will represent a county and each column will represent the provided demographics statistics on North Carolina voter registration, with an additional row representing the date the data was reported.

In [171]:
registration_2020 = pd.DataFrame.from_dict(info_2020, orient='index').map(lambda x: int(x.replace(',', ''))).reset_index()
registration_2016 = pd.DataFrame.from_dict(info_2020, orient='index').map(lambda x: int(x.replace(',', ''))).reset_index()
# adding date columns
registration_2020['date'] = '2020-11-03'
registration_2016['date'] = '2016-11-08'
registration_combined = pd.concat([registration_2016, registration_2020], ignore_index=True)
registration_combined.rename(columns={'index': 'county'}, inplace=True)
registration_combined

Unnamed: 0,county,total,female,male,white,black,american_indian/alaska_native,native_hawaiian/pacific_islander,hispanic,other_race,democrat,republican,unaffiliated,green,libertarian,constitution,justice_for_all,no_labels,we_the_people,date
0,ALAMANCE,110926,55668,45447,71870,21888,274,7,4398,16887,39209,36271,34695,31,654,66,0,0,0,2016-11-08
1,ALEXANDER,24953,12071,11217,21634,938,37,1,352,2343,5105,11775,7940,3,108,22,0,0,0,2016-11-08
2,ALLEGHANY,7609,3647,3359,6739,76,8,0,188,786,2156,2984,2429,4,34,2,0,0,0,2016-11-08
3,ANSON,16732,7542,6292,6709,6817,28,0,84,3178,10125,2867,3673,8,35,24,0,0,0,2016-11-08
4,ASHE,19545,9727,8723,17822,110,24,0,237,1589,4502,8875,6055,5,92,16,0,0,0,2016-11-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,WILKES,43934,21442,19339,38306,1619,48,0,748,3961,8229,23617,11875,11,180,22,0,0,0,2020-11-03
198,WILSON,57018,29789,23888,27397,23100,106,0,1435,6415,27794,14066,14889,15,219,35,0,0,0,2020-11-03
199,YADKIN,24725,12030,11031,21405,742,29,0,764,2549,3532,14061,6981,11,123,17,0,0,0,2020-11-03
200,YANCEY,14170,6927,6407,12879,79,21,0,91,1191,4435,5283,4379,5,61,7,0,0,0,2020-11-03


In [172]:
# writing to csvs
registration_2016.to_csv('2016_registration.csv', index=False) 
registration_2020.to_csv('2020_registration.csv', index=False) 
registration_combined.to_csv('registration_combined.csv', index=False) 

In [167]:
registration_combined.loc[:, registration_combined.columns != 'county']

Unnamed: 0,total,female,male,white,black,american_indian/alaska_native,native_hawaiian/pacific_islander,hispanic,other_race,democrat,republican,unaffiliated,green,libertarian,constitution,justice_for_all,no_labels,we_the_people,date
0,110926,55668,45447,71870,21888,274,7,4398,16887,39209,36271,34695,31,654,66,0,0,0,2016-11-08
1,24953,12071,11217,21634,938,37,1,352,2343,5105,11775,7940,3,108,22,0,0,0,2016-11-08
2,7609,3647,3359,6739,76,8,0,188,786,2156,2984,2429,4,34,2,0,0,0,2016-11-08
3,16732,7542,6292,6709,6817,28,0,84,3178,10125,2867,3673,8,35,24,0,0,0,2016-11-08
4,19545,9727,8723,17822,110,24,0,237,1589,4502,8875,6055,5,92,16,0,0,0,2016-11-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,43934,21442,19339,38306,1619,48,0,748,3961,8229,23617,11875,11,180,22,0,0,0,2020-11-03
198,57018,29789,23888,27397,23100,106,0,1435,6415,27794,14066,14889,15,219,35,0,0,0,2020-11-03
199,24725,12030,11031,21405,742,29,0,764,2549,3532,14061,6981,11,123,17,0,0,0,2020-11-03
200,14170,6927,6407,12879,79,21,0,91,1191,4435,5283,4379,5,61,7,0,0,0,2020-11-03


# Cross Checking Data with Individual Records 
The data provided through the 

In [174]:
df = pd.read_csv('voter_stats_20161108.txt', delimiter='\t')
df

Unnamed: 0,county_desc,election_date,stats_type,precinct_abbrv,vtd_abbrv,party_cd,race_code,ethnic_code,sex_code,age,total_voters,update_date
0,DAVIDSON,11/08/2016,voter,56,56,DEM,W,NL,F,Age 41 - 65,36,
1,CUMBERLAND,11/08/2016,voter,CC34,CC34,DEM,B,UN,F,Age 26 - 40,13,
2,CURRITUCK,11/08/2016,voter,KI,KI,DEM,W,UN,M,Age 41 - 65,15,
3,PASQUOTANK,11/08/2016,voter,EAST,1-B,DEM,B,UN,M,Age Over 66,16,
4,WAKE,11/08/2016,voter,20-06B,20-06,UNA,O,HL,F,Age 18 - 25,1,
...,...,...,...,...,...,...,...,...,...,...,...,...
514843,IREDELL,11/08/2016,voter,DV2-B,DV2-B,REP,W,NL,U,Age Over 66,1,
514844,CUMBERLAND,11/08/2016,voter,G6A,G6,REP,O,NL,M,Age 26 - 40,3,
514845,COLUMBUS,11/08/2016,voter,P06,P06,REP,W,UN,F,Age 41 - 65,4,
514846,PITT,11/08/2016,voter,1506,1506,UNA,B,UN,U,Age 18 - 25,11,
