Script: clean_presidential_elections.ipynb
Author: Michel Gutmann
Purpose: Generates a state-election year dataset with presidential election results by party from David Leip's dataset.

In [5]:
import pandas as pd
import us
import os
import os.path as path
import numpy as np

In [6]:
parent_dir = os.path.abspath("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
os.chdir(parent_dir)
assert os.path.exists(parent_dir), "parent_dir does not exist"
data_dir = "./2_data/2_intermediate/political_data"
assert os.path.exists(data_dir), "Data directory does not exist"

In [7]:
input_file = "./2_data/1_raw/political_data/StateLevelData.xlsx"
assert os.path.exists(input_file), "Input file does not exist"
output_file = os.path.join(data_dir, "./statelevel_preselection_results.csv")
assert os.path.exists(output_file), "Output file does not exist"



In [118]:
##### DEFINE MAIN CLEANING FUNCTION #####
def clean_election(df, key):
    print(key)
    if key == "2008":
        df = df.drop(['State', 'Unnamed: 63', 'EV', 'Meth', 'Blanks', 'State Code'], axis=1)
        # df = df.drop(['State', 'Unnamed: 63', 'Blanks'], axis=1)
    if key == "1944":
        df = df.drop(['State', 'Unnamed: 51'], axis=1)
    # Standardize format by keeping only raw result columns
    i = 0
    cols_to_keep = ["Unnamed: 0"]
    for col in df.columns:
        if i == 1:
            cols_to_keep.append(col)
        elif col == "% Total Vote":
            i = 1
    cols_to_keep = cols_to_keep[:-2]
    df_filtered = df[cols_to_keep].copy()
    df_filtered.rename({"Unnamed: 0": "state"}, axis=1, inplace=True)
    print(df_filtered.head())

    # Rename columns to appropriate names
    col_names = ["state"]
    for col in df_filtered.columns:
        col_str = str(col)
        if col_str != "state" and not col_str.startswith("Unnamed: "):
            last_name = "".join(col_str.lower().split(" "))
            if last_name not in ["democratic", "republican"]:
                last_name = "thirdparties"
            elif last_name == "democratic":
                last_name = "dem"
            elif last_name == "republican":
                last_name = "rep"
            col_names.append("votes_" + last_name)
        elif col_str != "state":
            col_names.append("pct_" + last_name)
    df_filtered.columns = col_names
    
    # Sum over third party results
    cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
    return cleaned_df


In [109]:
##### LOAD DATA AND PUT IN FULL DATASET #####
# Load the Excel file with multiple sheets
dfs = pd.read_excel(input_file, header=1, sheet_name=None)

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [108]:
df_1944 = dfs["1944"]
df_1944 = clean_election(df_1944, "1944")

1944
Index(['state', 'Democratic', 'Unnamed: 12', 'Republican', 'Unnamed: 14',
       'Texas Regulars', 'Unnamed: 16', 'Socialist', 'Unnamed: 18',
       'Prohibition', 'Unnamed: 20', 'Socialist Labor', 'Unnamed: 22',
       'Southern Democrat', 'Unnamed: 24', '-', 'Unnamed: 26', 'Democratic.1',
       'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31',
       'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35',
       'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39',
       'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43',
       'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47',
       'Unnamed: 48', 'Unnamed: 49', 'EV', 'Unnamed: 53', 'Unnamed: 54',
       'Unnamed: 55', 'Unnamed: 56'],
      dtype='object')


  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()


In [123]:
df_2008 = dfs["2008"]
print(df_2008['Unnamed: 61'].head())
if df_2008['Unnamed: 61'].notna().any():
    print("The column contains values other than NA.")
else:
    print("The column contains only NA values.")
print(df_2008.columns)

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: Unnamed: 61, dtype: float64
The column contains only NA values.
Index(['Unnamed: 0', 'Elec Vote', 'Popular Vote', 'D', 'R', 'O', 'D.1', 'R.1',
       'I', 'Votes', '% Total Vote', 'Democratic', 'Unnamed: 12', 'Republican',
       'Unnamed: 14', 'Independent', 'Unnamed: 16', 'Libertarian',
       'Unnamed: 18', 'Green', 'Unnamed: 20', 'Constitution', 'Unnamed: 22',
       'Socialist Workers', 'Unnamed: 24', 'Socialist', 'Unnamed: 26',
       'Socialism and Liberation', 'Unnamed: 28', '-', 'Unnamed: 30',
       'Boston Tea Party', 'Unnamed: 32', 'Independent.1', 'Unnamed: 34',
       '-.1', 'Unnamed: 36', 'Independent.2', 'Unnamed: 38', 'New',
       'Unnamed: 40', 'US Pacifist', 'Unnamed: 42', 'New American Indpendent',
       'Unnamed: 44', 'Prohibition', 'Unnamed: 46', 'Objectivist',
       'Unnamed: 48', 'LA Taxpayers', 'Unnamed: 50', 'Reform', 'Unnamed: 52',
       'Libertarian.1', 'Unnamed: 54', 'Vote Here', 'Unnamed: 56',
       'Indep

In [119]:
df_2008 = dfs["2008"]
# df_2008 = df_2008.loc[:, ['State', 'Unnamed: 63']]
# print(df_2008.head())
# df_2008 = df_2008.drop(['State', 'Unnamed: 63'], axis=1)
df_2008 = clean_election(df_2008, "2008")

2008
        state  Democratic  Unnamed: 12  Republican  Unnamed: 14  Independent  \
0     Alabama    813479.0     0.387404   1266546.0     0.603169       6788.0   
1      Alaska    123594.0     0.378894    193841.0     0.594245       3783.0   
2     Arizona   1034707.0     0.449123   1230111.0     0.533940      11301.0   
3    Arkansas    422310.0     0.388647    638017.0     0.587159      12882.0   
4  California   8274473.0     0.609436   5011781.0     0.369130     108381.0   

   Unnamed: 16  Libertarian  Unnamed: 18    Green  ...  Unnamed: 58  \
0     0.003233       4991.0     0.002377      2.0  ...          0.0   
1     0.011597       1589.0     0.004871      0.0  ...          0.0   
2     0.004905      12555.0     0.005450   3406.0  ...          0.0   
3     0.011855       4776.0     0.004395   3470.0  ...          0.0   
4     0.007983      67582.0     0.004978  38774.0  ...          0.0   

   We, the People  Unnamed: 60  Unnamed: 61  Unnamed: 65  Unnamed: 66  \
0             

  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()


In [104]:
df_2012 = dfs["2012"]
df_2012 = clean_election(df_2012, "2012")

2012
Index(['state', 'Democratic', 'Unnamed: 10', 'Republican', 'Unnamed: 12',
       'Libertarian', 'Unnamed: 14', 'Green', 'Unnamed: 16', 'Constitution',
       'Unnamed: 18', 'Socialist Workers', 'Unnamed: 20', 'Socialist',
       'Unnamed: 22', 'Socialism and Liberation', 'Unnamed: 24', '-',
       'Unnamed: 26', 'Justice', 'Unnamed: 28', 'American Ind.', 'Unnamed: 30',
       'Peace & Freedom', 'Unnamed: 32', '-.1', 'Unnamed: 34', 'Grassroots',
       'Unnamed: 36', 'Const. Government', 'Unnamed: 38',
       'American Third Position', 'Unnamed: 40', 'Prohibition', 'Unnamed: 42',
       'Objectivist', 'Unnamed: 44', 'Socialist Equality', 'Unnamed: 46',
       'Reform', 'Unnamed: 48', 'Independent', 'Unnamed: 50', 'Independent.1',
       'Unnamed: 52', 'Independent.2', 'Unnamed: 54', 'We the People',
       'Unnamed: 56', 'Independent.3', 'Unnamed: 58', 'NSA Did 911',
       'Unnamed: 60', 'Reform.1', 'Unnamed: 62', 'Reform.2', 'Unnamed: 64'],
      dtype='object')


  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()


In [117]:
df_2008.head()

Unnamed: 0,pct_dem,pct_rep,pct_thirdparties,state,votes_dem,votes_rep,votes_thirdparties
0,0.387404,0.603169,34.009427,Alabama,813479.0,1266546.0,19794.0
1,0.378894,0.594245,37.026861,Alaska,123594.0,193841.0,8762.0
2,0.449123,0.53394,35.016937,Arizona,1034707.0,1230111.0,39020.0
3,0.388647,0.587159,28.024194,Arkansas,422310.0,638017.0,26290.0
4,0.609436,0.36913,27.021434,California,8274473.0,5011781.0,291011.0


In [88]:
df_1944 = dfs["1944"]
df_1944 = clean_election(df_1944, "1944")

1944


  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()


In [89]:
print(df_1944.head())

    pct_dem   pct_rep  pct_thirdparties       state  votes_dem  votes_rep  \
0  0.812763  0.181987         25.005250     Alabama     198918      44540   
1  0.587980  0.408961         24.003059     Arizona      80926      56287   
2  0.699517  0.298426         24.002057    Arkansas     148965      63551   
3  0.564793  0.429713         23.005495  California    1988564    1512965   
4  0.463986  0.532100         24.003915    Colorado     234331     268731   

   votes_thirdparties  
0                1296  
1                 425  
2                 447  
3               19371  
4                1983  


In [36]:
print(19804/(813479+1266546))

0.009521039410584008


In [69]:
df_2012 = dfs["2012"]
df_2012_clean = clean_election(dfs["2012"], 2012)

2012


  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()


In [34]:
df_2012_clean.head()

Unnamed: 0,pct_dem,pct_rep,pct_thirdparties,state,votes_dem,votes_rep,votes_thirdparties
0,0.38359,0.605458,0.010951,Alabama,795696.0,1255925.0,22717.0
1,0.408127,0.548016,0.043858,Alaska,122640.0,164676.0,13179.0
2,0.444485,0.534846,0.020662,Arizona,1025232.0,1233654.0,47659.0
3,0.36879,0.605669,0.025541,Arkansas,394409.0,647744.0,27315.0
4,0.60239,0.371204,0.026407,California,7854285.0,4839958.0,344304.0


In [84]:
# Process each sheet (year) and combine them into a single DataFrame
i = 0
for key in dfs.keys():
    if key != "Copyright" and i == 0:
        print(f"Processing {key}")
        new_df = clean_election(dfs[key], key)
        new_df["year"] = int(key)
        full_df = new_df
        i += 1
    elif key != "Copyright":
        print(f"Processing {key}")
        new_df = clean_election(dfs[key], key)
        new_df["year"] = int(key)
        full_df = pd.concat([full_df, new_df], verify_integrity=True, axis=0, ignore_index=True)


Processing 2012
2012
Processing 2008
2008
Processing 2004
2004
Processing 2000
2000
Processing 1996
1996
Processing 1992
1992
Processing 1988
1988
Processing 1984
1984
Processing 1980
1980
Processing 1976
1976
Processing 1972
1972
Processing 1968
1968
Processing 1964
1964
Processing 1960
1960
Processing 1956
1956
Processing 1952
1952
Processing 1948
1948
Processing 1944
1944
Processing 1940
1940
Processing 1936
1936
Processing 1932
1932
Processing 1928
1928
Processing 1924
1924
Processing 1920
1920
Processing 1916
1916
Processing 1912
1912
Processing 1908
1908
Processing 1904
1904
Processing 1900
1900
Processing 1896
1896
Processing 1892
1892
Processing 1888
1888
Processing 1884
1884
Processing 1880
1880
Processing 1876
1876
Processing 1872
1872
Processing 1868
1868
Processing 1864
1864
Processing 1860
1860
Processing 1856
1856
Processing 1852
1852
Processing 1848
1848
Processing 1844
1844
Processing 1840
1840
Processing 1836
1836
Processing 1832
1832
Processing 1828
1828
Processing 18

  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_fi

In [13]:
##### PREPARE AND OUTPUT FULL DATASET #####
# Remove asterisks from state names
full_df["state"] = full_df["state"].apply(lambda x: str(x)[:-1] if str(x).endswith("*") else str(x)) 

# Only keep rows corresponding to actual U.S. states
full_df = full_df.loc[
    full_df["state"].isin([state.name for state in us.STATES])
].copy()

# Add FIPS codes to the dataset
state_map = us.states.mapping("name", "fips")
full_df["statefips"] = full_df["state"].apply(lambda x: state_map[x])

# Save the cleaned dataset to the output file
# full_df.to_csv(output_file, index=False)
print(f"Dataset saved to {output_file}")

Dataset saved to ./2_data/2_intermediate/political_data/./statelevel_preselection_results.csv
