### Read turnout data

In [1]:
import pandas as pd

In [2]:
!ls ../00_source_data

2008 Primary Elections - Turnout Rates.csv
2008_USStates_by_Race.csv
2012 Primary Elections - Turnout Rates.csv
2012_USStates_by_Race.csv
2016 Primary Elections - Turnout Rates.csv
2016_USStates_by_Race.csv
2020 Primary Elections - Turnout Rates.csv
2020_USStates_by_Race.csv
README.md


In [3]:
!ls ../20_intermediate_files

README.md      turnout.csv    turnout_v2.csv


In [4]:
df_to = pd.read_csv("../20_intermediate_files/turnout.csv")

In [5]:
df_to.head()

Unnamed: 0.1,Unnamed: 0,State,VEP_Counted,VEP,Minor,Cast,VAP,Democrat,Republican,Year,Counted,Type,Treatment
0,1,Iowa,16.1%,2196724,,354696.0,2284730,236000.0,118696,2008,,Caucus,
1,2,Wyoming,,387058,,,401306,,1200,2008,,Caucus,
2,3,New Hampshire,53.6%,988708,,529711.0,1021849,288672.0,241039,2008,,Primary,
3,4,Michigan,20.0%,7304120,,1463567.0,7627954,594398.0,869169,2008,,Primary,
4,5,Nevada,9.7%,1669337,,161874.0,1931679,117559.0,44315,2008,,Caucus,


### Read state population data by race from 2008 to 2016
* dataset from https://www.kff.org/other/state-indicator/distribution-by-raceethnicity/?dataView=1&currentTimeframe=10&sortModel=%7B%22colId%22:%22Location%22,%22sort%22:%22asc%22%7D based on the Census Bureau's American Community Survey, 2008-2018.

In [6]:
df_race_2008 = pd.read_csv("../00_source_data/2008_USStates_by_Race.csv", skiprows = 2)
df_race_2008['Year'] = 2008
df_race_2008.rename(columns={'Location':'State'}, inplace=True)
df_race_2008.head()

Unnamed: 0,State,White,Black,Hispanic,American Indian/Alaska Native,Asian,Native Hawaiian/Other Pacific Islander,Two Or More Races,Total,Footnotes,Year
0,United States,193573200.0,35033200.0,45753600.0,1919700.0,12947100.0,384400.0,5658400.0,295269800.0,1.0,2008
1,Alabama,3108700.0,1169700.0,124800.0,22200.0,43900.0,,56600.0,4526900.0,,2008
2,Alaska,433100.0,18700.0,34600.0,86900.0,30400.0,5600.0,45300.0,654700.0,,2008
3,Arizona,3704300.0,214500.0,1922100.0,261200.0,148600.0,6300.0,106000.0,6363000.0,,2008
4,Arkansas,2096900.0,422400.0,152500.0,15900.0,29100.0,1800.0,51200.0,2769700.0,,2008


In [7]:
df_race_2012 = pd.read_csv("../00_source_data/2012_USStates_by_Race.csv", skiprows = 2)
df_race_2012['Year'] = 2012
df_race_2012.rename(columns={'Location':'State'}, inplace=True)
df_race_2012.head()

Unnamed: 0,State,White,Black,Hispanic,American Indian/Alaska Native,Asian,Native Hawaiian/Other Pacific Islander,Two Or More Races,Total,Footnotes,Year
0,United States,192305000.0,36797500.0,51773400.0,2022800.0,15067900.0,472200.0,6937700.0,305376600.0,1.0,2012
1,Alabama,3145700.0,1220200.0,179400.0,23800.0,56700.0,,69100.0,4696000.0,,2012
2,Alaska,446300.0,23600.0,40300.0,100000.0,42000.0,6800.0,47200.0,706200.0,,2012
3,Arizona,3651000.0,247400.0,1920600.0,253800.0,185200.0,8400.0,120000.0,6386300.0,,2012
4,Arkansas,2122600.0,438300.0,192100.0,13700.0,35800.0,5000.0,52800.0,2860200.0,,2012


In [8]:
df_race_2016 = pd.read_csv("../00_source_data/2016_USStates_by_Race.csv", skiprows = 2)
df_race_2016['Year'] = 2016
df_race_2016.rename(columns={'Location':'State'}, inplace=True)
df_race_2016.head()

Unnamed: 0,State,White,Black,Hispanic,American Indian/Alaska Native,Asian,Native Hawaiian/Other Pacific Islander,Two Or More Races,Total,Footnotes,Year
0,United States,192537500.0,38081700.0,56144400.0,2041500.0,17004500.0,514600.0,8142200.0,314466400.0,1.0,2016
1,Alabama,3121800.0,1252500.0,194500.0,21400.0,57600.0,,81100.0,4730100.0,,2016
2,Alaska,439200.0,20700.0,47800.0,107200.0,42600.0,8800.0,48600.0,714900.0,,2016
3,Arizona,3754900.0,270000.0,2087500.0,266700.0,213600.0,11600.0,156600.0,6760900.0,,2016
4,Arkansas,2114900.0,441600.0,210000.0,17300.0,39500.0,7700.0,64500.0,2895400.0,,2016


### Read state population data by race of 2020
* this data set from another website, https://worldpopulationreview.com/states/states-by-race/, that is based on an estimate preformed in 2017 (https://www.census.gov/newsroom/press-releases/2017/estimates-idaho.html)
* so, this format is slightly different from those of 2008 - 2016
* especially, this dataset does not have Hispanic categories. It might be assigned to other races, such as white, black, and other races.

In [9]:
3317453+1293186+25576+64609+2182+70055+91619

4864680

In [10]:
df_race_2016.columns

Index(['State', 'White', 'Black', 'Hispanic', 'American Indian/Alaska Native',
       'Asian', 'Native Hawaiian/Other Pacific Islander', 'Two Or More Races',
       'Total', 'Footnotes', 'Year'],
      dtype='object')

In [11]:
df_race_2020 = pd.read_csv("../00_source_data/2020_USStates_by_Race.csv")
df_race_2020['Year'] = 2020
df_race_2020.rename(
    columns={'Native':'American Indian/Alaska Native',
             'Islander':'Native Hawaiian/Other Pacific Islander', 'TwoOrMoreRaces': 'Two Or More Races'}, inplace=True)
df_race_2020.head()

Unnamed: 0,State,Total,White,Black,American Indian/Alaska Native,Asian,Native Hawaiian/Other Pacific Islander,OtherRace,Two Or More Races,Year
0,Alabama,4864680,3317453,1293186,25576,64609,2182,70055,91619,2020
1,Alaska,738516,478834,24129,106660,46556,8849,11027,62461,2020
2,Arizona,6946685,5364141,305259,309580,228887,14112,471823,252883,2020
3,Arkansas,2990671,2302874,460970,20037,43988,7969,78981,75852,2020
4,California,39148760,23529068,2267875,296475,5604339,153366,5415410,1882227,2020


### Merge datasets of state population by race from 2008 to 2020

In [12]:
df_race_2008_2020 = pd.concat([df_race_2008, df_race_2012, df_race_2016, df_race_2020], sort = False)
df_race_2008_2020.head()

Unnamed: 0,State,White,Black,Hispanic,American Indian/Alaska Native,Asian,Native Hawaiian/Other Pacific Islander,Two Or More Races,Total,Footnotes,Year,OtherRace
0,United States,193573200.0,35033200.0,45753600.0,1919700.0,12947100.0,384400.0,5658400.0,295269800.0,1.0,2008,
1,Alabama,3108700.0,1169700.0,124800.0,22200.0,43900.0,,56600.0,4526900.0,,2008,
2,Alaska,433100.0,18700.0,34600.0,86900.0,30400.0,5600.0,45300.0,654700.0,,2008,
3,Arizona,3704300.0,214500.0,1922100.0,261200.0,148600.0,6300.0,106000.0,6363000.0,,2008,
4,Arkansas,2096900.0,422400.0,152500.0,15900.0,29100.0,1800.0,51200.0,2769700.0,,2008,


In [13]:
df_race_2008_2020.tail()

Unnamed: 0,State,White,Black,Hispanic,American Indian/Alaska Native,Asian,Native Hawaiian/Other Pacific Islander,Two Or More Races,Total,Footnotes,Year,OtherRace
46,Virginia,5722660.0,1613285.0,,22972.0,531503.0,5677.0,309213.0,8413774.0,,2020,208464.0
47,Washington,5545997.0,269854.0,,95048.0,607429.0,48043.0,416795.0,7294336.0,,2020,311170.0
48,West Virginia,1704345.0,66728.0,,3668.0,14534.0,350.0,32139.0,1829054.0,,2020,7290.0
49,Wisconsin,4945966.0,368744.0,,50422.0,159356.0,1975.0,135990.0,5778394.0,,2020,115941.0
50,Wyoming,532008.0,5540.0,,14053.0,4756.0,539.0,15579.0,581836.0,,2020,9361.0


### Merge turnout dataset and state population data by race from 2008 to 2020

In [14]:
turnout_v2 = pd.merge(df_to, df_race_2008_2020, how = 'left', on = ['State', 'Year'])

In [15]:
turnout_v2.drop(columns=['Unnamed: 0'], inplace=True)
turnout_v2

Unnamed: 0,State,VEP_Counted,VEP,Minor,Cast,VAP,Democrat,Republican,Year,Counted,...,White,Black,Hispanic,American Indian/Alaska Native,Asian,Native Hawaiian/Other Pacific Islander,Two Or More Races,Total,Footnotes,OtherRace
0,Iowa,16.1%,2196724,,354696,2284730,236000,118696,2008,,...,2609000.0,77500.0,118700.0,4300.0,43700.0,,39000.0,2894000.0,,
1,Wyoming,,387058,,,401306,,1200,2008,,...,446500.0,4500.0,41600.0,8900.0,2900.0,,11700.0,516100.0,,
2,New Hampshire,53.6%,988708,,529711,1021849,288672,241039,2008,,...,1185300.0,8000.0,37700.0,,26100.0,,15600.0,1275300.0,,
3,Michigan,20.0%,7304120,,1463567,7627954,594398,869169,2008,,...,7594800.0,1317400.0,396100.0,45900.0,217500.0,2400.0,190400.0,9764400.0,,
4,Nevada,9.7%,1669337,,161874,1931679,117559,44315,2008,,...,1447500.0,181000.0,664100.0,28400.0,153500.0,10900.0,71400.0,2556800.0,,
5,South Carolina,,3230163,,,3380941,,445499,2008,,...,2819300.0,1205500.0,167900.0,9600.0,45400.0,,69400.0,4318400.0,,
6,South Carolina,30.3%,3230163,,977650,3380941,532151,,2008,,...,2819300.0,1205500.0,167900.0,9600.0,45400.0,,69400.0,4318400.0,,
7,Florida,34.0%,12553134,,4268602,14299134,1749920,1949498,2008,,...,10772300.0,2598100.0,3771200.0,35500.0,410100.0,6300.0,280600.0,17874300.0,,
8,Maine,,1023959,,,1041335,,5338,2008,,...,1215200.0,9700.0,12000.0,6100.0,9200.0,,21500.0,1274100.0,,
9,Alabama,32.2%,3379070,,1088835,3523994,536626,552209,2008,,...,3108700.0,1169700.0,124800.0,22200.0,43900.0,,56600.0,4526900.0,,


### Write final merged data as turnout_v2.csv

In [16]:
turnout_v2.to_csv("../20_intermediate_files/turnout_v2.csv")

In [17]:
!ls ../20_intermediate_files/turnout_v2.csv

../20_intermediate_files/turnout_v2.csv


In [18]:
!cat ../20_intermediate_files/turnout_v2.csv

,State,VEP_Counted,VEP,Minor,Cast,VAP,Democrat,Republican,Year,Counted,Type,Treatment,White,Black,Hispanic,American Indian/Alaska Native,Asian,Native Hawaiian/Other Pacific Islander,Two Or More Races,Total,Footnotes,OtherRace
0,Iowa,16.1%,"2,196,724",,"354,696","2,284,730","236,000","118,696",2008,,Caucus,,2609000.0,77500.0,118700.0,4300.0,43700.0,,39000.0,2894000.0,,
1,Wyoming,,"387,058",,,"401,306",,"1,200",2008,,Caucus,,446500.0,4500.0,41600.0,8900.0,2900.0,,11700.0,516100.0,,
2,New Hampshire,53.6%,"988,708",,"529,711","1,021,849","288,672","241,039",2008,,Primary,,1185300.0,8000.0,37700.0,,26100.0,,15600.0,1275300.0,,
3,Michigan,20.0%,"7,304,120",,"1,463,567","7,627,954","594,398","869,169",2008,,Primary,,7594800.0,1317400.0,396100.0,45900.0,217500.0,2400.0,190400.0,9764400.0,,
4,Nevada,9.7%,"1,669,337",,"161,874","1,931,679","117,559","44,315",2008,,Caucus,,1447500.0,181000.0,664100.0,28400.0,153500.0,10900.0,71400.0,2556800.0,,
5,South Carolina,,"3,230,163",,,"3,380,941",,"