**Script:** clean_presidential_elections.ipynb
**Author:** Revekka Gershovich reworking Michel Gutmann's code
**Purpose:** Original code clean_presidential_elections.py aims to generate a state-election year dataset with presidential election results by party from David Leip's dataset. This code fixes problems with cleaning dataset for years 2008 and 1944, and adds data for 2016 and 2020 from Harvard Dataverse dataset. 
**The additional dataset can be found here:** https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/42MVDX and in '/StateLaws/2_data/1_raw/political_data/dataverse_files' folder on Dropbox

In [200]:
import pandas as pd
import us
import os
import os.path as path
import numpy as np

In [201]:
parent_dir = os.path.abspath("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
os.chdir(parent_dir)
assert os.path.exists(parent_dir), "parent_dir does not exist"
data_dir = "./2_data/2_intermediate/political_data"
assert os.path.exists(data_dir), "Data directory does not exist"
input_file = "./2_data/1_raw/political_data/StateLevelData.xlsx"
assert os.path.exists(input_file), "Input file does not exist"
output_file = os.path.join(data_dir, "./statelevel_preselection_results.csv")
assert os.path.exists(output_file), "Output file does not exist"
raw_data_dir = "./2_data/1_raw/political_data"
assert os.path.exists(raw_data_dir), "Raw data directory does not exist"

I will address the issue of democrats not existing until 1828, and republicans not existing until 1854 by coding parties commonly considered as precursors to democrats or republicans as democrats or republicans respectively. 

I am mapping in the next cell the following parties to republicans and democrats: 
Democratic: Democratic_Republicans (1824)
Republican: Federalists (1824), National Republican (Anti_Jacksonian) (1828), Whigs (1832, 1836, 1840, 1844, 1848, 1852), Opposition (Whigs-American + Republican) (1856)

In [202]:
##### DEFINE MAIN CLEANING FUNCTION #####
def clean_election(df, key):
    print(key)
    # Special cases handling

    # In 1824, the Democratic-Republican party was the only party with 4 candidates and I mapped it to democrats; and there was one unpledged republican
    if key == "1824":
        df = df.rename(columns={'Democratic-Republican': 'Democratic', 'Democratic-Republican.1': "Democratic", 'Democratic-Republican.2': "Democratic", 'Democratic-Republican.3': "Democratic", 'Unpledged Rep': "Republican"})
    
    # In 1828 Democratic party was founded so I don't need to handle it but Republican party is not: I coded the Democratic-Republican party as Republican
    if key == "1828":
        df = df.rename(columns={'National Republican':'Republican'})

    # In 1832, 1836, 1840, 1844, 1848, 1852, I mapped the Whig party to Republicans as is the case in the literature
    if key == "1832" or key == "1836" or key == "1840" or key == "1844" or key == "1848" or key == "1852":
        df = df.rename(columns={'Whig': 'Republican'})

    # In 1856, was the time when whigs were almost replaced by republicans and formed a coalition so it is considered the same party, and I coded as such
    if key == "1856":
        df = df.rename(columns={'Whig-American':'Republican'})
        
    if key == "2008":
        # Find the index of "Unnamed: 60"
        cutoff_index = df.columns.get_loc('Unnamed: 60')

        # Keep all columns up to and including "Unnamed: 60"
        df = df.iloc[:, :cutoff_index + 1]
    if key == "1944":
        cutoff_index = df.columns.get_loc('Unnamed: 28')
        df = df.iloc[:, :cutoff_index + 1]
    # Standardize format by keeping only raw result columns
    i = 0
    cols_to_keep = ["Unnamed: 0"]
    for col in df.columns:
        if i == 1:
            cols_to_keep.append(col)
        elif col == "% Total Vote":
            i = 1
    cols_to_keep = cols_to_keep[:-2]
    df_filtered = df[cols_to_keep].copy()
    df_filtered.rename({"Unnamed: 0": "state"}, axis=1, inplace=True)
    # print(df_filtered.head())

    # Rename columns to appropriate names
    col_names = ["state"]
    for col in df_filtered.columns:
        col_str = str(col)
        if col_str != "state" and not col_str.startswith("Unnamed: "):
            last_name = "".join(col_str.lower().split(" "))
            if last_name not in ["democratic", "republican"]:
                last_name = "thirdparties"
            elif last_name == "democratic":
                last_name = "dem"
            elif last_name == "republican":
                last_name = "rep"
            col_names.append("votes_" + last_name)
        elif col_str != "state":
            col_names.append("pct_" + last_name)
    df_filtered.columns = col_names
    
    # Sum over third party results
    cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
    return cleaned_df


In [203]:
##### LOAD DATA AND PUT IN FULL DATASET #####
# Load the Excel file with multiple sheets
dfs = pd.read_excel(input_file, header=1, sheet_name=None)

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [204]:
df_2012 = dfs["2012"]
df_2012_cleaned = clean_election(df_2012, "2012")

2012


  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()


In [205]:
df_2012_cleaned.head()

Unnamed: 0,pct_dem,pct_rep,pct_thirdparties,state,votes_dem,votes_rep,votes_thirdparties
0,0.38359,0.605458,0.010951,Alabama,795696.0,1255925.0,22717.0
1,0.408127,0.548016,0.043858,Alaska,122640.0,164676.0,13179.0
2,0.444485,0.534846,0.020662,Arizona,1025232.0,1233654.0,47659.0
3,0.36879,0.605669,0.025541,Arkansas,394409.0,647744.0,27315.0
4,0.60239,0.371204,0.026407,California,7854285.0,4839958.0,344304.0


In [206]:
# Process each sheet (year) and combine them into a single DataFrame
i = 0
for key in dfs.keys():
    if key != "Copyright" and i == 0:
        print(f"Processing {key}")
        new_df = clean_election(dfs[key], key)
        new_df["year"] = int(key)
        full_df = new_df
        i += 1
    elif key != "Copyright":
        print(f"Processing {key}")
        new_df = clean_election(dfs[key], key)
        new_df["year"] = int(key)
        full_df = pd.concat([full_df, new_df], verify_integrity=True, axis=0, ignore_index=True)


  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_fi

Processing 2012
2012
Processing 2008
2008
Processing 2004
2004
Processing 2000
2000
Processing 1996
1996
Processing 1992
1992
Processing 1988
1988
Processing 1984
1984
Processing 1980
1980
Processing 1976
1976
Processing 1972
1972
Processing 1968
1968
Processing 1964
1964
Processing 1960
1960
Processing 1956
1956
Processing 1952
1952
Processing 1948
1948
Processing 1944
1944
Processing 1940
1940
Processing 1936
1936
Processing 1932
1932
Processing 1928
1928
Processing 1924
1924
Processing 1920
1920
Processing 1916
1916
Processing 1912
1912
Processing 1908
1908
Processing 1904
1904
Processing 1900
1900
Processing 1896
1896
Processing 1892
1892
Processing 1888
1888
Processing 1884
1884
Processing 1880
1880
Processing 1876
1876
Processing 1872
1872
Processing 1868
1868
Processing 1864
1864
Processing 1860
1860
Processing 1856
1856
Processing 1852
1852
Processing 1848
1848
Processing 1844
1844
Processing 1840
1840
Processing 1836
1836
Processing 1832
1832
Processing 1828
1828
Processing 18

  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_filtered.groupby(df_filtered.columns, axis=1).sum()
  cleaned_df = df_fi

In [207]:
##### PREPARE AND OUTPUT FULL DATASET #####
# Remove asterisks from state names
full_df["state"] = full_df["state"].apply(lambda x: str(x)[:-1] if str(x).endswith("*") else str(x)) 

# Only keep rows corresponding to actual U.S. states
full_df = full_df.loc[
    full_df["state"].isin([state.name for state in us.STATES])
].copy()

# Add FIPS codes to the dataset
state_map = us.states.mapping("name", "fips")
full_df["statefips"] = full_df["state"].apply(lambda x: state_map[x])

In [208]:
dataverse = pd.read_csv(os.path.join(raw_data_dir, "dataverse_files/1976-2020-president.csv"))
dataverse = dataverse[dataverse['year'] > 2008]
dataverse = dataverse.drop(columns=['state_po', 'state_cen', 'state_ic', 'office', 'candidate', 'party_detailed', 'writein', 'version', 'notes'])
dataverse['state'] = dataverse['state'].str.title()

dataverse = dataverse.groupby(['year', 'state']).apply(lambda group: pd.Series({
    'votes_dem': group.loc[group['party_simplified'] == 'DEMOCRAT', 'candidatevotes'].sum(),
    'votes_rep': group.loc[group['party_simplified'] == 'REPUBLICAN', 'candidatevotes'].sum(),
    'votes_thirdparties': group.loc[group['party_simplified']== 'OTHER', 'candidatevotes'].sum(),
    'totalvotes': group['totalvotes'].max(),  # Total votes should be the same for all rows in a group
    'statefips': group['state_fips'].max()  # State FIPS should be the same for all rows in a group
})).reset_index()

dataverse['pct_dem'] = dataverse['votes_dem'] / dataverse['totalvotes']
dataverse['pct_rep'] = dataverse['votes_rep'] / dataverse['totalvotes']
dataverse['pct_thirdparties'] = dataverse['votes_thirdparties'] / dataverse['totalvotes']

dataverse['statefips'] = dataverse['statefips'].astype(str).str.zfill(2)

  dataverse = dataverse.groupby(['year', 'state']).apply(lambda group: pd.Series({


In [209]:
dataverse.head()

Unnamed: 0,year,state,votes_dem,votes_rep,votes_thirdparties,totalvotes,statefips,pct_dem,pct_rep,pct_thirdparties
0,2012,Alabama,795696,1255925,22717,2074338,1,0.38359,0.605458,0.010951
1,2012,Alaska,122640,164676,5787,300495,2,0.408127,0.548016,0.019258
2,2012,Arizona,1025232,1233654,8268,2299254,4,0.445898,0.536545,0.003596
3,2012,Arkansas,394409,647744,11039,1069468,5,0.36879,0.605669,0.010322
4,2012,California,7854285,4839958,201083,13038547,6,0.60239,0.371204,0.015422


In [210]:
full_df.head()

Unnamed: 0,pct_dem,pct_rep,pct_thirdparties,state,votes_dem,votes_rep,votes_thirdparties,year,statefips
0,0.38359,0.605458,0.010951,Alabama,795696.0,1255925.0,22717.0,2012,1
1,0.408127,0.548016,0.043858,Alaska,122640.0,164676.0,13179.0,2012,2
2,0.444485,0.534846,0.020662,Arizona,1025232.0,1233654.0,47659.0,2012,4
3,0.36879,0.605669,0.025541,Arkansas,394409.0,647744.0,27315.0,2012,5
4,0.60239,0.371204,0.026407,California,7854285.0,4839958.0,344304.0,2012,6


In [211]:
full_df = pd.concat([full_df, dataverse], ignore_index=True)

In [212]:
full_df = full_df.sort_values(by='year', ascending=False)

# Reset the index for clean indexing (optional)
full_df = full_df.reset_index(drop=True)

full_df = full_df.drop(columns=['totalvotes'])

In [213]:
full_df.sample(10)

Unnamed: 0,pct_dem,pct_rep,pct_thirdparties,state,votes_dem,votes_rep,votes_thirdparties,year,statefips
1202,0.4604,0.537558,0.002041,Tennessee,167343.0,195388.0,742.0,1928,47
411,0.389118,0.376914,0.233968,New Hampshire,209040.0,202484.0,125691.0,1992,33
825,0.527329,0.472671,0.0,West Virginia,441786.0,395995.0,0.0,1960,54
684,0.260411,0.642389,0.097201,Idaho,80826.0,199384.0,30169.0,1972,16
1935,0.294229,0.622396,0.083375,Maine,29693.0,62811.0,8414.0,1860,23
1180,0.544687,0.444666,0.010647,West Virginia,405124.0,330731.0,7919.0,1932,54
831,0.46215,0.53785,0.0,Idaho,138853.0,161597.0,0.0,1960,16
937,0.344208,0.654188,0.001604,Idaho,95081.0,180707.0,443.0,1952,16
2018,0.505571,0.494364,6.5e-05,Alabama,31173.0,30482.0,4.0,1848,1
650,0.371185,0.598803,0.030012,Idaho,126549.0,204151.0,10232.0,1976,16


In [214]:
# Save the cleaned dataset to the output file
full_df.to_csv(output_file, index=False)
print(f"Dataset saved to {output_file}")

Dataset saved to ./2_data/2_intermediate/political_data/./statelevel_preselection_results.csv
