# Visualizing State by State Representation Ratios in Presidential Elections

In this notebook, I investigate how the number of citizens per presidential elector and number of registered voters per presidential elector vary by state and by political party.

Population data comes from the US census. https://www2.census.gov/programs-surveys/popest/datasets/

Electoral vote per state historical data comes from Wikipedia: https://en.wikipedia.org/wiki/United_States_Electoral_College#Chronological_table


In [82]:
import pandas as pd 

pd.set_option("styler.format.thousands", ",")
pd.options.display.float_format = '{:,.0f}'.format
# pd.set_option("styler.float_format", '{:,.0f}')

In [83]:
state_lookup = states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'ME*': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NE*': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

## Clean and combine by-state data including electoral votes, population, and results

### Electoral Votes per State per Year

In [81]:
## FIRST VERSION

# read in and clean electoral votes per state .csv
electoral_votes = pd.read_csv('electoral_votes.csv')

column = pd.concat([
    electoral_votes.loc[:, 'Unnamed: 1'],
    electoral_votes.loc[:, 'Unnamed: 2'],
    electoral_votes.loc[:, 'Unnamed: 3']
], ignore_index = True)

split_string = column.str.split(' - ')
states = [s[0] for s in split_string]
votes = [eval(s[1].split()[0]) for s in split_string]

# create df of above
votes_per_state = pd.DataFrame({'State': states, 'Votes': votes})
votes_per_state['Votes_without_Sens'] = votes_per_state['Votes'] - 2

In [145]:
year_prefix_map = {'17': range(0, 4), '18': range(4, 25), '19': range(25, 50), '20': range(50, 70)}
year_prefix_map_df = pd.DataFrame(year_prefix_map)
# year_prefix_map_inv = {range(0, 4): '17'}
electors.columns

ValueError: All arrays must be of the same length

In [163]:
def get_century_prefix(i: int) -> str:
    # takes i representing ith presidential election
    # returns corresponding century prefix as str
    
    if i < 4:
        return '17'
    elif i < 29:
        return '18'
    elif i < 54:
        return '19'
    else:
        return '20'

In [178]:
# read in and clean historical elector data
electors = pd.read_csv('historical_elector_data.csv', skiprows = 1).fillna(0)

# the objective is to create new columns so that every year has a column
# currently there is only one column per decade since redistricting only happens once per decade (per census)

electors = electors.rename(columns = {'Unnamed: 0': 'State_order', 'Unnamed: 1': 'State'})

# I feel like this is gonna be messy and suboptimal but the goal is to first get something that works!
# Update: it works!

# iterate over the column names (str)
# to_drop stores columns that are split into multiple cols, so we can drop the originals
to_drop = []
# keep track of ith election year
i = 0
# skip first two columns: State_founded and State
for col in electors.columns[2:]:
    years = col.split('\n')
    # if col_name contains more than one year
    if len(years) > 1:
        # create a new column for each year
        for year in years:
            i += 1
            year_clean = get_century_prefix(i) + year.split('.')[0].replace("'", "")
            electors[year_clean] = electors[col].copy()
        # add to list of columns to drop
        to_drop.append(col)
    else:
        i += 1
        year_clean = get_century_prefix(i) + years[0].split('.')[0].replace("'", "")
        electors = electors.rename({col: year_clean}, axis='columns')

electors = electors.drop(columns = to_drop)
electors = electors.drop([0,1])



### Population by State by Year


In [180]:
# read in and clean 2020s population per state .xlsx
pop_file = pd.read_excel(
    'NST-EST2023-POP.xlsx',
    skiprows = 8,
    names = ['State', 'Apr2020', 'Jul2020', 'Jul2021', 'Jul2022', 'Jul2023']
)
pop_file['State'] = pop_file['State'].str[1:]

pop_file = pop_file.drop(labels = range(51,59))

In [181]:
# join votes and pop dfs
by_state = votes_per_state.join(pop_file.iloc[:,1:])

# create pop per vote ratios
by_state['2020pop_per_vote'] = by_state['Jul2020'] / by_state['Votes']
by_state['2020pop_per_adj_vote'] = by_state['Jul2020'] / by_state['Votes_without_Sens']

In [8]:
by_state.sort_values('2020pop_per_vote', ascending = True)

Unnamed: 0,State,Votes,Votes_without_Sens,Apr2020,Jul2020,Jul2021,Jul2022,Jul2023,2020pop_per_vote,2020pop_per_adj_vote
50,Wyoming,3,1,576850,577664,579548,581629,584057,192555,577664
45,Vermont,3,1,643077,642936,647093,647110,647464,214312,642936
8,District of Columbia,3,1,689548,670839,669037,670949,678972,223613,670839
1,Alaska,3,1,733374,732964,734923,733276,733406,244321,732964
34,North Dakota,3,1,779079,779563,777982,778912,783926,259854,779563
26,Montana,4,2,1084244,1087211,1106366,1122878,1132812,271803,543606
39,Rhode Island,4,2,1097371,1096444,1097092,1093842,1095962,274111,548222
41,South Dakota,3,1,886668,887852,896299,909869,919318,295951,887852
7,Delaware,3,1,989946,991862,1004881,1019459,1031890,330621,991862
19,Maine,4,2,1363177,1364517,1378787,1389338,1395722,341129,682258


In [74]:
def clean_results(file_path, ref):
    """
    Loads and formats .xlsx that contains the election results for one year
    
    Args:
        file_path (str): absolute or relative path to .xlsx file
        ref (pd.DataFrame): look-up table of states and their abbreviations
    
    Returns:
        pd.DataFrame: cleaned data
    """
    
    results = pd.read_excel(
        file_path,
        sheet_name = 2,
        skiprows = 3,
        names = ['State_abbr', 'Electors_D', 'Electors_R', 'Pop_Vote_D', 'Pop_Vote_R', 'Pop_Vote_Other', 'Total_Vote'],
        skipfooter = 4,
        thousands = ','
    )
    
    # find the 0th index where State_abbr LIKE 'WY'
    end_idx = results.index[results['State_abbr'].str.match('WY', na=False)].tolist()[0]
    # drop all rows after WY line
    results = results.drop(range(end_idx+1, len(results)))
    
    # add State column with abbreviation mapping in ref
    results['State'] = results['State_abbr'].map(ref)
    
    #reorder columns
    results = results[['State', 'Electors_D', 'Electors_R', 'Pop_Vote_D', 'Pop_Vote_R', 'Pop_Vote_Other', 'Total_Vote']]
    
    # remove ** from Electors cols which correspond to faithless electors
    for col in ['Electors_D', 'Electors_R']:
        results.loc[:, col] = results.loc[:, col].astype(str).str.strip('*').str.replace('nan', '0').astype(int)
    
    # fill na values with 0
    results.loc[:, ['Electors_D', 'Electors_R']] = results.loc[:, ['Electors_D', 'Electors_R']].fillna(0)
    
    return results
    

In [75]:
results_2016 = clean_results('federalelections2016.xlsx', state_lookup)

### Questions to Answer

*What would election results look like if states didn't get an additional 2 electoral votes across the board?

*How does the rounding error in number of congressional reps vary by political party?

### Handling Copy-On-Write warning above

results_2020.loc[:, ['Electors_D', 'Electors_R']] creates a view/slice.

results_2020[['Electors_D', 'Electors_R']] creates a **new** DataFrame containing only those columns. This can be confirmed by checking their memory location with id().

In [78]:
# can't figure out how to properly format both ints and floats
# so, cast all to float (to get comma-separted thousands)
#results_2020.iloc[:, 3:] = results_2020.iloc[:, 3:].astype(float)