Script: clean_presidential_elections.ipynb
Author: Revekka Gershovich reworking Michel Gutmann's code
Purpose: Original code clean_presidential_elections.py aims to generate a state-election year dataset with presidential election results by party from David Leip's dataset. This code fixes problems with cleaning dataset for years 2008 and 1944, and adds data for 2016 and 2020 from Harvard Dataverse dataset. 
The additional dataset can be found here: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/42MVDX and in '/StateLaws/2_data/1_raw/political_data/dataverse_files' folder on Dropbox

In [145]:
import pandas as pd
import us
import os
import os.path as path
import numpy as np

In [146]:
parent_dir = os.path.abspath("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
os.chdir(parent_dir)
assert os.path.exists(parent_dir), "parent_dir does not exist"
data_dir = "./2_data/2_intermediate/political_data"
assert os.path.exists(data_dir), "Data directory does not exist"
input_file = "./2_data/1_raw/political_data/StateLevelData.xlsx"
assert os.path.exists(input_file), "Input file does not exist"
output_file = os.path.join(data_dir, "./statelevel_preselection_results.csv")
assert os.path.exists(output_file), "Output file does not exist"
raw_data_dir = "./2_data/1_raw/political_data"
assert os.path.exists(raw_data_dir), "Raw data directory does not exist"

In [147]:
##### DEFINE MAIN CLEANING FUNCTION #####
def clean_election(df, key):
    print(key)
    if key == "2008":
        # Find the index of "Unnamed: 60"
        cutoff_index = df.columns.get_loc('Unnamed: 60')

        # Keep all columns up to and including "Unnamed: 60"
        df = df.iloc[:, :cutoff_index + 1]
    if key == "1944":
        cutoff_index = df.columns.get_loc('Unnamed: 28')
        df = df.iloc[:, :cutoff_index + 1]
    # Standardize format by keeping only raw result columns
    i = 0
    cols_to_keep = ["Unnamed: 0"]
    for col in df.columns:
        if i == 1:
            cols_to_keep.append(col)
        elif col == "% Total Vote":
            i = 1
    cols_to_keep = cols_to_keep[:-2]
    df_filtered = df[cols_to_keep].copy()
    df_filtered.rename({"Unnamed: 0": "state"}, axis=1, inplace=True)
    print(df_filtered.head())

    # Rename columns to appropriate names
    col_names = ["state"]
    for col in df_filtered.columns:
        col_str = str(col)
        if col_str != "state" and not col_str.startswith("Unnamed: "):
            last_name = "".join(col_str.lower().split(" "))
            if last_name not in ["democratic", "republican"]:
                last_name = "thirdparties"
            elif last_name == "democratic":
                last_name = "dem"
            elif last_name == "republican":
                last_name = "rep"
            col_names.append("votes_" + last_name)
        elif col_str != "state":
            col_names.append("pct_" + last_name)
    df_filtered.columns = col_names
    
    # Sum over third party results
    cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
    return cleaned_df


In [148]:
##### LOAD DATA AND PUT IN FULL DATASET #####
# Load the Excel file with multiple sheets
dfs = pd.read_excel(input_file, header=1, sheet_name=None)

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [149]:
df_2012 = dfs["2012"]
df_2012_cleaned = clean_election(df_2012, "2012")

2012
        state  Democratic  Unnamed: 10  Republican  Unnamed: 12  Libertarian  \
0     Alabama    795696.0     0.383590   1255925.0     0.605458      12328.0   
1      Alaska    122640.0     0.408127    164676.0     0.548016       7392.0   
2     Arizona   1025232.0     0.444485   1233654.0     0.534846      32100.0   
3    Arkansas    394409.0     0.368790    647744.0     0.605669      16276.0   
4  California   7854285.0     0.602390   4839958.0     0.371204     143221.0   

   Unnamed: 14    Green  Unnamed: 16  Constitution  ...  We the People  \
0     0.005943   3397.0     0.001638        2981.0  ...            0.0   
1     0.024599   2917.0     0.009707           0.0  ...            0.0   
2     0.013917   7816.0     0.003389         289.0  ...            6.0   
3     0.015219   9305.0     0.008701           0.0  ...            0.0   
4     0.010984  85638.0     0.006568         503.0  ...            6.0   

    Unnamed: 56  Independent.3  Unnamed: 58  NSA Did 911  Unnamed: 60

  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()


In [150]:
df_2012_cleaned.head()

Unnamed: 0,pct_dem,pct_rep,pct_thirdparties,state,votes_dem,votes_rep,votes_thirdparties
0,0.38359,0.605458,0.010951,Alabama,795696.0,1255925.0,22717.0
1,0.408127,0.548016,0.043858,Alaska,122640.0,164676.0,13179.0
2,0.444485,0.534846,0.020662,Arizona,1025232.0,1233654.0,47659.0
3,0.36879,0.605669,0.025541,Arkansas,394409.0,647744.0,27315.0
4,0.60239,0.371204,0.026407,California,7854285.0,4839958.0,344304.0


In [151]:
# Process each sheet (year) and combine them into a single DataFrame
i = 0
for key in dfs.keys():
    if key != "Copyright" and i == 0:
        print(f"Processing {key}")
        new_df = clean_election(dfs[key], key)
        new_df["year"] = int(key)
        full_df = new_df
        i += 1
    elif key != "Copyright":
        print(f"Processing {key}")
        new_df = clean_election(dfs[key], key)
        new_df["year"] = int(key)
        full_df = pd.concat([full_df, new_df], verify_integrity=True, axis=0, ignore_index=True)


  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_fi

Processing 2012
2012
        state  Democratic  Unnamed: 10  Republican  Unnamed: 12  Libertarian  \
0     Alabama    795696.0     0.383590   1255925.0     0.605458      12328.0   
1      Alaska    122640.0     0.408127    164676.0     0.548016       7392.0   
2     Arizona   1025232.0     0.444485   1233654.0     0.534846      32100.0   
3    Arkansas    394409.0     0.368790    647744.0     0.605669      16276.0   
4  California   7854285.0     0.602390   4839958.0     0.371204     143221.0   

   Unnamed: 14    Green  Unnamed: 16  Constitution  ...  We the People  \
0     0.005943   3397.0     0.001638        2981.0  ...            0.0   
1     0.024599   2917.0     0.009707           0.0  ...            0.0   
2     0.013917   7816.0     0.003389         289.0  ...            6.0   
3     0.015219   9305.0     0.008701           0.0  ...            0.0   
4     0.010984  85638.0     0.006568         503.0  ...            6.0   

    Unnamed: 56  Independent.3  Unnamed: 58  NSA Did 

  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()


Processing 1908
1908
         state  Democratic  Unnamed: 12  Republican  Unnamed: 14  Socialist  \
0      Alabama       74374     0.708783       25372     0.241795       1450   
1     Arkansas       87015     0.573138       56624     0.372963       5842   
2   California      127492     0.329780      214398     0.554578      28659   
3     Colorado      126644     0.479970      123693     0.468786       7960   
4  Connecticut       68255     0.359231      112915     0.594280       5113   

   Unnamed: 16  Prohibition  Unnamed: 18  Independence  ...  Populist  \
0     0.013818          677     0.006452           497  ...      1575   
1     0.038479         1026     0.006758           289  ...      1026   
2     0.074131        11770     0.030445          4278  ...         0   
3     0.030168         5559     0.021068             0  ...         0   
4     0.026910         2380     0.012526           728  ...         0   

   Unnamed: 22  Socialist Labor  Unnamed: 24  United Christian  U

  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_fi

Processing 1876
1876
         state  Democratic  Unnamed: 12  Republican  Unnamed: 14  Greenback  \
0      Alabama    102989.0     0.599823     68708.0     0.400165        0.0   
1     Arkansas     58086.0     0.599158     38649.0     0.398665      211.0   
2   California     76460.0     0.490808     79258.0     0.508769       47.0   
3    Colorado*         0.0          NaN         0.0          NaN        0.0   
4  Connecticut     61927.0     0.507041     59033.0     0.483346      774.0   

   Unnamed: 16  Prohibition  Unnamed: 18  American  Unnamed: 20  Communist  \
0     0.000000          0.0     0.000000       0.0          0.0        0.0   
1     0.002176          0.0     0.000000       0.0          0.0        0.0   
2     0.000302          0.0     0.000000       0.0          0.0        0.0   
3          NaN          0.0          NaN       0.0          NaN        0.0   
4     0.006337        374.0     0.003062       0.0          0.0        0.0   

   Unnamed: 22     -  Unnamed: 24  

  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()


In [152]:
##### PREPARE AND OUTPUT FULL DATASET #####
# Remove asterisks from state names
full_df["state"] = full_df["state"].apply(lambda x: str(x)[:-1] if str(x).endswith("*") else str(x)) 

# Only keep rows corresponding to actual U.S. states
full_df = full_df.loc[
    full_df["state"].isin([state.name for state in us.STATES])
].copy()

# Add FIPS codes to the dataset
state_map = us.states.mapping("name", "fips")
full_df["statefips"] = full_df["state"].apply(lambda x: state_map[x])

In [169]:
dataverse = pd.read_csv(os.path.join(raw_data_dir, "dataverse_files/1976-2020-president.csv"))
dataverse = dataverse[dataverse['year'] > 2008]
dataverse = dataverse.drop(columns=['state_po', 'state_cen', 'state_ic', 'office', 'candidate', 'party_detailed', 'writein', 'version', 'notes'])
dataverse['state'] = dataverse['state'].str.title()

dataverse = dataverse.groupby(['year', 'state']).apply(lambda group: pd.Series({
    'votes_dem': group.loc[group['party_simplified'] == 'DEMOCRAT', 'candidatevotes'].sum(),
    'votes_rep': group.loc[group['party_simplified'] == 'REPUBLICAN', 'candidatevotes'].sum(),
    'votes_thirdparties': group.loc[group['party_simplified']== 'OTHER', 'candidatevotes'].sum(),
    'totalvotes': group['totalvotes'].max(),  # Total votes should be the same for all rows in a group
    'statefips': group['state_fips'].max()  # State FIPS should be the same for all rows in a group
})).reset_index()

dataverse['pct_dem'] = dataverse['votes_dem'] / dataverse['totalvotes']
dataverse['pct_rep'] = dataverse['votes_rep'] / dataverse['totalvotes']
dataverse['pct_thirdparties'] = dataverse['votes_thirdparties'] / dataverse['totalvotes']

dataverse['statefips'] = dataverse['statefips'].astype(str).str.zfill(2)

  dataverse = dataverse.groupby(['year', 'state']).apply(lambda group: pd.Series({


In [170]:
dataverse.head()

Unnamed: 0,year,state,votes_dem,votes_rep,votes_thirdparties,totalvotes,statefips,pct_dem,pct_rep,pct_thirdparties
0,2012,Alabama,795696,1255925,22717,2074338,1,0.38359,0.605458,0.010951
1,2012,Alaska,122640,164676,5787,300495,2,0.408127,0.548016,0.019258
2,2012,Arizona,1025232,1233654,8268,2299254,4,0.445898,0.536545,0.003596
3,2012,Arkansas,394409,647744,11039,1069468,5,0.36879,0.605669,0.010322
4,2012,California,7854285,4839958,201083,13038547,6,0.60239,0.371204,0.015422


In [171]:
full_df.head()

Unnamed: 0,pct_dem,pct_rep,pct_thirdparties,state,votes_dem,votes_rep,votes_thirdparties,year,statefips
0,0.263879,0.694998,0.020413,Wyoming,73491.0,193559.0,5685,2020,56
1,0.414088,0.567997,0.004298,Missouri,1253014.0,1718736.0,13007,2020,29
2,0.523951,0.452849,0.012527,Minnesota,1717077.0,1484065.0,41053,2020,27
3,0.506208,0.478373,0.004518,Michigan,2804040.0,2649852.0,25029,2020,26
4,0.65123,0.319082,0.016836,Massachusetts,2382202.0,1167202.0,61588,2020,25


In [172]:
full_df = pd.concat([full_df, dataverse], ignore_index=True)

In [173]:
full_df = full_df.sort_values(by='year', ascending=False)

# Reset the index for clean indexing (optional)
full_df = full_df.reset_index(drop=True)

full_df = full_df.drop(columns=['totalvotes'])

In [174]:
full_df.sample(10)

Unnamed: 0,pct_dem,pct_rep,pct_thirdparties,state,votes_dem,votes_rep,votes_thirdparties,year,statefips
409,0.487067,0.508141,0.004793,Ohio,2741167.0,2859768.0,27012.0,2004,39
1126,0.469172,0.50927,0.021559,Pennsylvania,1752426.0,1902197.0,80525.0,1948,42
2272,0.499948,,0.500052,Maryland,19156.0,,19160.0,1832,24
773,0.552692,0.442194,0.005114,North Carolina,927365.0,741960.0,8581.0,1976,37
1157,0.543758,0.452669,0.003574,Delaware,68166.0,56747.0,448.0,1944,10
1494,0.57169,0.353734,0.074576,Arizona,33170.0,20524.0,4327.0,1916,4
1937,0.444921,0.524104,0.030975,Michigan,141685.0,166901.0,9864.0,1876,26
702,0.438159,0.550997,0.010844,Hawaii,147154.0,185050.0,3642.0,1984,15
1362,0.820999,0.179001,0.0,Mississippi,124539.0,27153.0,0.0,1928,28
944,0.561508,0.438492,0.0,North Carolina,800139.0,624844.0,0.0,1964,37


In [175]:
# Save the cleaned dataset to the output file
full_df.to_csv(output_file, index=False)
print(f"Dataset saved to {output_file}")

Dataset saved to ./2_data/2_intermediate/political_data/./statelevel_preselection_results.csv
