Script: clean_presidential_elections.ipynb
Author: Michel Gutmann
Purpose: Generates a state-election year dataset with presidential election results by party from David Leip's dataset.

In [6]:
import pandas as pd
import us
import os
import os.path as path
import numpy as np

In [7]:
parent_dir = os.path.abspath("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
os.chdir(parent_dir)
assert os.path.exists(parent_dir), "parent_dir does not exist"
data_dir = "./2_data/2_intermediate/political_data"
assert os.path.exists(data_dir), "Data directory does not exist"

In [8]:
input_file = "./2_data/1_raw/political_data/StateLevelData.xlsx"
assert os.path.exists(input_file), "Input file does not exist"
output_file = os.path.join(data_dir, "./statelevel_preselection_results.csv")
assert os.path.exists(output_file), "Output file does not exist"



In [94]:
##### DEFINE MAIN CLEANING FUNCTION #####
def clean_election(df):
    # Standardize format by keeping only raw result columns
    i = 0
    cols_to_keep = ["Unnamed: 0"]
    for col in df.columns:
        if i == 1:
            cols_to_keep.append(col)
        elif col == "% Total Vote":
            i = 1
    cols_to_keep = cols_to_keep[:-2]
    df_filtered = df[cols_to_keep].copy()
    df_filtered.rename({"Unnamed: 0": "state"}, axis=1, inplace=True)

    # Rename columns to appropriate names
    col_names = ["state"]
    for col in df_filtered.columns:
        col_str = str(col)
        if col_str != "state" and not col_str.startswith("Unnamed: "):
            last_name = "".join(col_str.lower().split(" "))
            if last_name not in ["democratic", "republican"]:
                last_name = "thirdparties"
            elif last_name == "democratic":
                last_name = "dem"
            elif last_name == "republican":
                last_name = "rep"
            col_names.append("votes_" + last_name)
        elif col_str != "state":
            col_names.append("pct_" + last_name)
    df_filtered.columns = col_names
    # Keep only float type columns and 'state' column
    df_filtered = df_filtered.loc[:, (df_filtered.dtypes == 'float64') | (df_filtered.columns == 'state')]
    with pd.option_context('display.max_rows', None):
        print(df_filtered.dtypes)
    
    # Sum over third party results
    cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
    return cleaned_df


In [92]:
##### LOAD DATA AND PUT IN FULL DATASET #####
# Load the Excel file with multiple sheets
dfs = pd.read_excel(input_file, header=1, sheet_name=None)

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [95]:
df_2008 = dfs["2008"]
df_2008_clean = clean_election(df_2008)

state                  object
votes_dem             float64
pct_dem               float64
votes_rep             float64
pct_rep               float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thir

  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()


In [82]:
df_2012 = dfs["2012"]
df_2012_clean = clean_election(df_2012)

state                  object
votes_dem             float64
pct_dem               float64
votes_rep             float64
pct_rep               float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thirdparties    float64
pct_thirdparties      float64
votes_thir

  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()


In [20]:
# Process each sheet (year) and combine them into a single DataFrame
i = 0
for key in dfs.keys():
    if key != "Copyright" and i == 0:
        print(f"Processing {key}")
        new_df = clean_election(dfs[key])
        new_df["year"] = int(key)
        full_df = new_df
        i += 1
    elif key != "Copyright":
        print(f"Processing {key}")
        new_df = clean_election(dfs[key])
        new_df["year"] = int(key)
        full_df = pd.concat([full_df, new_df], verify_integrity=True, axis=0, ignore_index=True)


Processing 2012
Processing 2008


  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()


TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [None]:
##### PREPARE AND OUTPUT FULL DATASET #####
# Remove asterisks from state names
full_df["state"] = full_df["state"].apply(lambda x: str(x)[:-1] if str(x).endswith("*") else str(x)) 

# Only keep rows corresponding to actual U.S. states
full_df = full_df.loc[
    full_df["state"].isin([state.name for state in us.STATES])
].copy()

# Add FIPS codes to the dataset
state_map = us.states.mapping("name", "fips")
full_df["statefips"] = full_df["state"].apply(lambda x: state_map[x])

# Save the cleaned dataset to the output file
# full_df.to_csv(output_file, index=False)
print(f"Dataset saved to {output_file}")