In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
# Load the raw-counts dataset; the cells below operate on this frame.
df = pd.read_csv(filepath_or_buffer='../data/dataset.csv')

# Alternative input: the dataset with probabilities. Uncomment to load it instead.
# df = pd.read_csv('../data/probabilities.csv')

Below, you'll find a dictionary explaining all the variables in `../data/probabilities.csv`. The last four columns comprise (at least for me!) the targets for the classification task; together they form a probability distribution over the possible voting outcomes given the county.

In [19]:
# Data dictionary for the columns of ../data/probabilities.csv.
# The repetitive column families (marital status, education, race) are generated
# from the small lookup tables below; every other entry is spelled out by hand.
# Insertion order: identifiers, P(C), households, sex, marital status, education,
# race, nativity, labor force, other demographics, voting.

_DICT_SEXES = ("male", "female")

# marital status -> phrase template; "{}" is replaced by the sex
_DICT_MARITAL = {
    "never_married": "{}s who never married",
    "married": "married {}s",
    "separated": "separated {}s",
    "widowed": "widowed {}s",
    "divorced": "divorced {}s",
}

# age token used in the column name -> label used in the description
_DICT_AGES = {
    "18_24": "18-24",
    "25_34": "25-34",
    "35_44": "35-44",
    "45_64": "45-64",
    "65plus": "65+",
}

# education token used in the column name -> phrase used in the description
_DICT_EDUS = {
    "less_than_9th": "with less than 9th grade education",
    "some_hs": "with some high school education",
    "hs_grad": "who are high school graduates",
    "some_college": "with some college education",
    "associates": "with associate's degree",
    "bachelors": "with bachelor's degree",
    "graduate": "with graduate degree",
}

# race token -> phrase template; "{}" is replaced by the sex
_DICT_RACES = {
    "white": "white {}s",
    "black": "Black/African American {}s",
    "aian": "American Indian/Alaska Native {}s",
    "asian": "Asian {}s",
    "nhpi": "Native Hawaiian/Pacific Islander {}s",
    "other": "{}s of other races",
    "multi": "multiracial {}s",
}

# column name (without the P(...|C) wrapper) -> the event it describes
_DICT_EVENTS = {
    # Household data
    "households_income_under_10k": "households with income under $10,000",
    "households_income_10k_15k": "households with income between $10,000-$15,000",
    "households_income_15k_25k": "households with income between $15,000-$25,000",
    "households_income_25k_plus": "households with income $25,000 or more",

    # Gender distribution
    "persons_male": "being male",
    "persons_female": "being female",

    # Marital status by gender (all male entries, then all female entries)
    **{
        f"{sex}_{status}": phrase.format(sex)
        for sex in _DICT_SEXES
        for status, phrase in _DICT_MARITAL.items()
    },

    # Education by gender and age group (education level varies fastest)
    **{
        f"{sex}_{age}_{edu}": f"{sex}s {age_label} {edu_phrase}"
        for sex in _DICT_SEXES
        for age, age_label in _DICT_AGES.items()
        for edu, edu_phrase in _DICT_EDUS.items()
    },

    # Race/Ethnicity by gender (male and female interleaved per race)
    **{
        f"{sex}_{race}": phrase.format(sex)
        for race, phrase in _DICT_RACES.items()
        for sex in _DICT_SEXES
    },

    # Nativity
    "persons_native": "native-born persons",
    "persons_foreign_born": "foreign-born persons",

    # Labor force
    "labor_force_total": "total labor force",
    "labor_force_armed": "armed forces participation",
    "labor_force_civilian": "civilian labor force participation",
    "labor_force_employed": "being employed",
    "labor_force_unemployed": "being unemployed",
    "not_in_labor_force": "not being in the labor force",

    # Other demographics
    "persons_hispanic": "being Hispanic/Latino",
    "persons_below_poverty": "living below poverty line",

    # Voting patterns
    "democrat": "voting Democrat",
    "other": "voting for third party",
    "republican": "voting Republican",
    "non_voter": "not voting",
}

variable_dictionary = {
    # Basic identifiers
    "year": "Year of the data collection",
    "gisjoin": "Geographic identification code for joining with GIS data",
    "state": "US state name",
    "county": "US county name",

    # Base probability
    "P(C)": "Probability of living in county C",
}

# Every remaining column is a conditional probability given the county.
variable_dictionary.update(
    (f"P({column}|C)", f"Conditional probability of {event} given county C")
    for column, event in _DICT_EVENTS.items()
)

The columns of `../data/dataset.csv` are given below. For each column `col` apart from an id column, the corresponding column in `../data/probabilities.csv` is named `f'P({col}|C)'`.

In [20]:
# Column groups of ../data/dataset.csv.
# Families that follow a regular naming scheme are generated from their name
# fragments; the remaining groups are listed literally.

# identifier columns
idx = ['year', 'gisjoin', 'state', 'county']

_sex_prefixes = ('male', 'female')

# sex x marital status: all male columns first, then all female columns
sex_maritals = [
    f'{sex}_{status}'
    for sex in _sex_prefixes
    for status in ('never_married', 'married', 'separated', 'widowed', 'divorced')
]

nativities = ['persons_native', 'persons_foreign_born']

# sex x age bracket x education level; the education level varies fastest
sex_age_edus = [
    f'{sex}_{age}_{edu}'
    for sex in _sex_prefixes
    for age in ('18_24', '25_34', '35_44', '45_64', '65plus')
    for edu in ('less_than_9th', 'some_hs', 'hs_grad', 'some_college',
                'associates', 'bachelors', 'graduate')
]

labors = ['labor_force_total', 'labor_force_armed', 'labor_force_civilian',
          'labor_force_employed', 'labor_force_unemployed', 'not_in_labor_force']

households = ['households_total', 'households_income_under_10k', 'households_income_10k_15k',
              'households_income_15k_25k', 'households_income_25k_plus']

sexes = ['persons_male', 'persons_female']

incomes = ['median_household_income', 'per_capita_income']

# race x sex: male and female columns interleaved race by race
sex_races = [
    f'{sex}_{race}'
    for race in ('white', 'black', 'aian', 'asian', 'nhpi', 'other', 'multi')
    for sex in _sex_prefixes
]

targets = ['democrat', 'other', 'republican', 'non_voter']

misc_persons = ['persons_hispanic', 'persons_below_poverty']

land = ['land_area_sqkm']

# all columns which contain features that count persons, group after group
person_cols = [
    col
    for group in (sexes, sex_maritals, sex_age_edus, sex_races,
                  nativities, labors, misc_persons, targets)
    for col in group
]

len(person_cols)

110

In [21]:
# Example: summary statistics for one group of columns (here the household columns)
df.loc[:, households].describe()

Unnamed: 0,households_total,households_income_under_10k,households_income_10k_15k,households_income_15k_25k,households_income_25k_plus
count,12360.0,12360.0,12360.0,12360.0,12360.0
mean,37994.38,2436.611812,1819.751214,3578.36246,30159.66
std,115355.7,7589.645197,5452.44759,10362.197306,93265.82
min,80.0,0.0,0.0,0.0,62.0
25%,4340.25,305.0,275.0,533.0,3088.75
50%,10020.5,740.0,630.0,1199.0,7319.0
75%,25941.75,1802.25,1445.0,2807.75,19622.75
max,3363093.0,212332.0,189719.0,345315.0,2846712.0


In [22]:
# Income-bracket columns, i.e. `households` without households_total.
households2 = ['households_income_under_10k',
               'households_income_10k_15k',
               'households_income_15k_25k',
               'households_income_25k_plus']

# Normalize the bracket counts by total households (row-wise division).
# The result overwrites the count columns, so re-running this cell used to divide
# the shares by households_total a second time. Raw counts contain values above 1,
# whereas shares of the total stay at or below 1 (assuming each bracket is a subset
# of households_total), so the division is applied only while counts are present.
if (df[households2] > 1).to_numpy().any():
    df[households2] = df[households2].div(df['households_total'], axis=0)

df[households]

Unnamed: 0,households_total,households_income_under_10k,households_income_10k_15k,households_income_15k_25k,households_income_25k_plus
0,19718,0.066741,0.050512,0.087433,0.795314
1,69476,0.059675,0.048261,0.118775,0.773289
2,9795,0.173047,0.098418,0.139867,0.588668
3,7441,0.073512,0.073512,0.151458,0.701519
4,20605,0.084543,0.060180,0.130114,0.725164
...,...,...,...,...,...
12355,16335,0.052158,0.028344,0.066728,0.852770
12356,9645,0.018766,0.020943,0.058994,0.901296
12357,7586,0.020301,0.033087,0.052597,0.894015
12358,3414,0.037493,0.032220,0.106913,0.823374


In [23]:
# Votes cast per county and year: the three party columns added up
# (non_voter is not part of this sum)
party_cols = ['democrat', 'other', 'republican']
df['total_votes'] = df[party_cols].sum(axis=1)

# Each county's share of all votes cast in the same year
votes_in_year = df.groupby('year')['total_votes'].transform('sum')
df['county_vote_share'] = df['total_votes'].div(votes_in_year)

# Sanity check: within every year the county shares must add up to 1
df.groupby('year')['county_vote_share'].sum()

year
2008    1.0
2012    1.0
2016    1.0
2020    1.0
Name: county_vote_share, dtype: float64

In [24]:
population = df['persons_total']

# population density: persons per unit of land area (land_area_sqkm)
df['population_density'] = population.div(df['land_area_sqkm'])

# average household size: persons per household
df['persons_per_household'] = population.div(df['households_total'])

In [25]:
# Express each party's votes as a fraction of the county's total_votes,
# stored in a new '<party>_prob' column
for party in ('democrat', 'republican', 'other'):
    df[f'{party}_prob'] = df[party].div(df['total_votes'])

In [26]:
# Inspect the three vote-share columns side by side
vote_share_cols = ['democrat_prob', 'republican_prob', 'other_prob']
df[vote_share_cols]

Unnamed: 0,democrat_prob,republican_prob,other_prob
0,0.257730,0.736136,0.006133
1,0.238119,0.752595,0.009286
2,0.489854,0.504385,0.005761
3,0.265965,0.724433,0.009602
4,0.145135,0.840195,0.014670
...,...,...,...
12355,0.228950,0.732363,0.038687
12356,0.665990,0.293569,0.040441
12357,0.168200,0.792473,0.039328
12358,0.161458,0.804812,0.033730


In [27]:
# Population-by-sex count columns, spelled out instead of read from `sexes`:
# a later cell rebinds `sexes` to ['male', 'female'], which made this cell raise a
# KeyError when it was re-run after that point.
sex_count_cols = ['persons_male', 'persons_female']

# Normalize the counts by persons_total (row-wise division).
# The result overwrites the count columns, so the division is applied only while
# raw counts (values above 1) are still present; re-running must not divide the
# shares by persons_total again.
if (df[sex_count_cols] > 1).to_numpy().any():
    df[sex_count_cols] = df[sex_count_cols].div(df['persons_total'], axis=0)
df[sex_count_cols]

Unnamed: 0,persons_male,persons_female
0,0.484997,0.515003
1,0.488660,0.511340
2,0.528972,0.471028
3,0.537904,0.462096
4,0.495308,0.504692
...,...,...
12355,0.517907,0.482093
12356,0.518890,0.481110
12357,0.508420,0.491580
12358,0.523625,0.476375


In [28]:
# Name fragments from which the demographic column names are composed.

# NOTE: this rebinds `sexes`, which previously held ['persons_male', 'persons_female'].
sexes = ['male', 'female']

# Age tokens exactly as they are spelled in the column names
# (e.g. 'male_65plus_graduate'): the oldest bracket is '65plus', not '65_plus'.
ages = ['18_24',
        '25_34',
        '35_44',
        '45_64',
        '65plus']

# education levels
edus = ['less_than_9th',
        'some_hs',
        'hs_grad',
        'some_college',
        'associates',
        'bachelors',
        'graduate']

# race tokens
races = ['black',
         'white',
         'aian',
         'asian',
         'nhpi',
         'multi',
         'other']

In [29]:
# Sex-by-education breakdown (for ages 18 and over) for each county: for every sex
# and education level, sum the sex_age_edu columns over the age brackets.
for sex in sexes:
    for edu in edus:
        # name of the new sex x education column
        col_name = f'{sex}_{edu}'

        # Take the source columns from the fixed `sex_age_edus` list rather than
        # from df.columns: the new column itself starts with `sex` and ends with
        # `edu`, so scanning df.columns would fold it into its own sum (doubling
        # the values) whenever this cell is run again.
        cols = [col for col in sex_age_edus
                if col.startswith(f'{sex}_') and col.endswith(f'_{edu}')]

        # sum the columns over the ages
        df[col_name] = df[cols].sum(axis=1)

# check that the new columns are correct
sex_edu = [f'{sex}_{edu}' for sex in sexes for edu in edus]
df[sex_edu]

Unnamed: 0,male_less_than_9th,male_some_hs,male_hs_grad,male_some_college,male_associates,male_bachelors,male_graduate,female_less_than_9th,female_some_hs,female_hs_grad,female_some_college,female_associates,female_bachelors,female_graduate
0,849,2118,6370,4284,834,2527,1319,883,2181,7067,4716,1466,2702,1114
1,2709,7052,19885,14701,3633,11922,5192,2287,5833,21394,17611,6194,10929,5475
2,1039,2437,4264,1878,573,706,569,890,1782,3548,1639,845,734,565
3,825,1826,4071,1188,457,696,146,648,1134,3241,1967,376,497,240
4,1874,3520,7789,4182,987,1710,700,1806,3471,7860,4627,1710,1520,888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12355,342,820,6311,4345,1798,1573,1111,332,827,4417,3850,2603,2036,990
12356,233,255,1418,2130,151,3968,1824,191,237,1922,1390,261,3244,2007
12357,144,379,3165,2037,573,787,448,139,358,2315,2116,1000,1035,312
12358,45,220,958,1078,181,283,289,34,139,756,984,431,491,146


In [None]:
# One dataframe per year, collected in a dictionary keyed by the year as a string
dfs = {str(year): df[df['year'] == year] for year in (2008, 2012, 2016, 2020)}