In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the data dictionary (its 'feature' and 'category' columns are used below
# to group the dataset's columns) and the election dataset itself.
df_description = pd.read_csv('../data/data_dictionary.csv')
df = pd.read_csv('../data/presidential_election_dataset.csv')

In [None]:
# Preview the data dictionary. A notebook cell only renders its last
# expression automatically, so the preview needs an explicit display() call;
# as a bare statement in the middle of the cell its output would be dropped.
display(df_description.head())

# Number of features listed under each category of the data dictionary.
df_description.groupby('category').size()

In [None]:
# group features by category for easier access
def _features_in(category):
    """Return the 'feature' values of the data-dictionary rows whose 'category' equals `category`."""
    matches = df_description['category'] == category
    return df_description.loc[matches, 'feature'].values.tolist()


idx = _features_in('id')
sex_age_edus = _features_in('sex ~ age ~ education')
sex_age_races = _features_in('sex ~ age ~ race')
sex_maritals = _features_in('sex ~ marital status')
households = _features_in('household')
labors = _features_in('labor force')
nativities = _features_in('nativity')
sexes = _features_in('sex')
incomes = _features_in('income')
targets = _features_in('target')

# combine all the lists into one
combined_list = (
    sex_age_edus + sex_age_races + sex_maritals + targets + households
    + labors + nativities + sexes + incomes + idx
)

# all other columns, kept in df's own column order. A plain set difference
# would produce an arbitrary order that can change between kernel sessions,
# which makes anything displayed from `misc` non-reproducible.
categorized = set(combined_list)
misc = [col for col in df.columns if col not in categorized]

In [None]:
# Summary statistics for one feature group (here: the household columns);
# any of the category lists built above can be used the same way.
df.loc[:, households].describe()

In [None]:
# Household income-bracket columns; households_total is left out on purpose
# because it serves as the denominator below.
households2 = [
    'households_income_under_10k',
    'households_income_10k_15k',
    'households_income_15k_25k',
    'households_income_25k_plus',
]

# Normalize the households2 columns by each row's total households.
# NOTE: the columns are overwritten in place, so running this cell a second
# time divides by households_total again.
household_totals = df['households_total']
df[households2] = df[households2].div(household_totals, axis=0)

df[households]

In [None]:
# Share of each year's total votes contributed by each row (county):
# total_votes divided by the sum of total_votes over all rows of the same year.
votes_in_year = df.groupby(['year'])['total_votes'].transform('sum')
df['county_vote_share'] = df['total_votes'] / votes_in_year

# Sanity check: the shares should add up to 1 within every year.
df.groupby(['year'])['county_vote_share'].sum()

In [None]:
# Two ratio features that share the same numerator, persons_total.
population = df['persons_total']

# population density: persons per unit of land_area_sqkm
df['population_density'] = population / df['land_area_sqkm']

# average household size: persons per household
df['persons_per_household'] = population / df['households_total']

In [None]:
# Convert each party's vote count into a share of total_votes, stored as
# '<party>_prob' next to the raw count.
for party in ('democrat', 'republican', 'other'):
    df[f'{party}_prob'] = df[party] / df['total_votes']

In [None]:
# Inspect the three party vote-share columns.
df.loc[:, ['democrat_prob', 'republican_prob', 'other_prob']]

In [None]:
# Express every column in the data dictionary's 'sex' category as a fraction
# of persons_total. The columns are overwritten in place.
persons = df['persons_total']
df[sexes] = df[sexes].div(persons, axis=0)

df[sexes]

In [None]:
# Label fragments for the demographic breakdowns. `sexes` and `races` are used
# below as column-name prefixes and suffixes.
# NOTE: this rebinds `sexes`, which earlier in the notebook held the feature
# names of the data dictionary's 'sex' category.
sexes = ['male', 'female']

# age brackets
ages = ['18_24', '25_34', '35_44', '45_64', '65_plus']

# education levels
edus = [
    'less_than_9th',
    'some_hs',
    'hs_grad',
    'some_college',
    'associates',
    'bachelors',
    'graduate',
]

# race groups
races = [
    'black',
    'white',
    'aian',
    'asian',
    'nhpi',
    'multi',
    'other',
]

In [None]:
# Compute a sex-by-race breakdown (age over 18) for each row by summing the
# sex ~ age ~ race columns over the ages.
#
# Source columns are taken from `sex_age_races` (the data dictionary's
# 'sex ~ age ~ race' features) instead of from every column of `df`. Scanning
# df.columns would also match the derived '<sex>_<race>' columns created here
# once they exist, so re-running the cell would add each total into itself.
sex_race = []
for sex in sexes:
    for race in races:
        # name of the new sex ~ race column
        col_name = f'{sex}_{race}'

        # sex ~ age ~ race columns that begin with this sex and end with this
        # race (presumably one per age bracket)
        cols = [col for col in sex_age_races
                if col.startswith(sex) and col.endswith(race)]

        # sum the columns over the ages
        df[col_name] = df[cols].sum(axis=1)
        sex_race.append(col_name)

# check that the new columns are correct
df[sex_race]

In [None]:
# One dataframe per election year, stored in a dict keyed by the year as a
# string ('2008', '2012', '2016', '2020').
dfs = {
    str(year): df[df['year'] == year]
    for year in (2008, 2012, 2016, 2020)
}