**Created by:** Revekka Gershovich 
**When:** Nov 20, 2024 
**Purpose:** To explore the missing data patterns in statelevel_preselections_results.csv produced by clean_presidential_elections.py script

In [79]:
import os
import os.path as path
import pandas as pd
import numpy as np

In [80]:
# parent_dir = os.path.abspath("/Users/rivka666/Dropbox (MIT)/StateLaws")
parent_dir = os.path.abspath("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
os.chdir(parent_dir)
assert os.path.exists(parent_dir), "parent_dir does not exist"
data_dir = "./2_data/2_intermediate/political_data"
assert os.path.exists(data_dir), "Data directory does not exist"

In [81]:
df = pd.read_csv(os.path.join(data_dir, "statelevel_preselection_results.csv"))

In [82]:
all_even_years = set(range(1824, 2013, 4))
unique_years = set(df["year"].unique())
missing_years = sorted(all_even_years - unique_years)
print("Missing years:", missing_years)

Missing years: []


In [83]:
# Check for missing years because Nicolas said that 2008 is missing, 
# and that data stops in 2012
print(df["year"].unique())

[2012 2008 2004 2000 1996 1992 1988 1984 1980 1976 1972 1968 1964 1960
 1956 1952 1948 1944 1940 1936 1932 1928 1924 1920 1916 1912 1908 1904
 1900 1896 1892 1888 1884 1880 1876 1872 1868 1864 1860 1856 1852 1848
 1844 1840 1836 1832 1828 1824]


In [84]:
columns_order = ["year", "state"] + [col for col in df.columns if col not in ["year", "state"]]
df = df[columns_order]

In [85]:
df.columns

Index(['year', 'state', 'pct_dem', 'pct_rep', 'pct_thirdparties', 'votes_dem',
       'votes_rep', 'votes_thirdparties', 'statefips'],
      dtype='object')

In [86]:
dataverse = pd.read_csv(os.path.join(raw_data_dir, "dataverse_files/1976-2020-president.csv"))

In [87]:
# Drop rows where the 'year' column is greater than 2008
dataverse = dataverse[dataverse['year'] > 2008]

In [88]:
print(dataverse['year'].unique())
print(dataverse['office'].unique())
print(dataverse['writein'].unique())

[2012 2016 2020]
['US PRESIDENT']
[False True nan]


In [89]:
dataverse = dataverse.drop(columns=['state_po', 'state_cen', 'state_ic', 'office', 'candidate', 'party_detailed', 'writein', 'version', 'notes'])

In [90]:
dataverse['state'] = dataverse['state'].str.title()

In [91]:
dataverse.head()

Unnamed: 0,year,state,state_fips,candidatevotes,totalvotes,party_simplified
3079,2012,Alabama,1,1255925,2074338,REPUBLICAN
3080,2012,Alabama,1,795696,2074338,DEMOCRAT
3081,2012,Alabama,1,18706,2074338,OTHER
3082,2012,Alabama,1,4011,2074338,OTHER
3083,2012,Alaska,2,164676,300495,REPUBLICAN


In [92]:
dataverse = dataverse.groupby(['year', 'state']).apply(lambda group: pd.Series({
    'votes_dem': group.loc[group['party_simplified'] == 'DEMOCRAT', 'candidatevotes'].sum(),
    'votes_rep': group.loc[group['party_simplified'] == 'REPUBLICAN', 'candidatevotes'].sum(),
    'votes_thirdparties': group.loc[group['party_simplified']== 'OTHER', 'candidatevotes'].sum(),
    'totalvotes': group['totalvotes'].max(),  # Total votes should be the same for all rows in a group
    'statefips': group['state_fips'].max()  # State FIPS should be the same for all rows in a group
})).reset_index()

  dataverse = dataverse.groupby(['year', 'state']).apply(lambda group: pd.Series({


In [93]:
dataverse['pct_dem'] = dataverse['votes_dem'] / dataverse['totalvotes']
dataverse['pct_rep'] = dataverse['votes_rep'] / dataverse['totalvotes']
dataverse['pct_thirdparties'] = dataverse['votes_thirdparties'] / dataverse['totalvotes']

In [94]:
dataverse.head()

Unnamed: 0,year,state,votes_dem,votes_rep,votes_thirdparties,totalvotes,statefips,pct_dem,pct_rep,pct_thirdparties
0,2012,Alabama,795696,1255925,22717,2074338,1,0.38359,0.605458,0.010951
1,2012,Alaska,122640,164676,5787,300495,2,0.408127,0.548016,0.019258
2,2012,Arizona,1025232,1233654,8268,2299254,4,0.445898,0.536545,0.003596
3,2012,Arkansas,394409,647744,11039,1069468,5,0.36879,0.605669,0.010322
4,2012,California,7854285,4839958,201083,13038547,6,0.60239,0.371204,0.015422


In [95]:
df.head()

Unnamed: 0,year,state,pct_dem,pct_rep,pct_thirdparties,votes_dem,votes_rep,votes_thirdparties,statefips
0,2012,Alabama,0.38359,0.605458,0.010951,795696.0,1255925.0,22717.0,1
1,2012,Alaska,0.408127,0.548016,0.043858,122640.0,164676.0,13179.0,2
2,2012,Arizona,0.444485,0.534846,0.020662,1025232.0,1233654.0,47659.0,4
3,2012,Arkansas,0.36879,0.605669,0.025541,394409.0,647744.0,27315.0,5
4,2012,California,0.60239,0.371204,0.026407,7854285.0,4839958.0,344304.0,6


In [96]:
dataverse = dataverse[dataverse['year'] > 2012]

In [97]:
df = pd.concat([df, dataverse], ignore_index=True)

In [101]:
df = df.sort_values(by='year', ascending=False)

# Reset the index for clean indexing (optional)
df = df.reset_index(drop=True)

df = df.drop(columns=['totalvotes'])

In [102]:
df.sample(10)

Unnamed: 0,year,state,pct_dem,pct_rep,pct_thirdparties,votes_dem,votes_rep,votes_thirdparties,statefips
1807,1868,Texas,0.0,0.0,0.0,0.0,0.0,0.0,48
38,2020,Indiana,0.409617,0.570212,0.000643,1242416.0,1729519.0,1951.0,18
510,1980,Alaska,0.264079,0.543482,0.192439,41842.0,86112.0,30491.0,2
371,1992,Missouri,0.440717,0.339217,0.220066,1053873.0,811159.0,526238.0,29
1690,1880,Minnesota,0.353618,0.622816,0.023566,53315.0,93902.0,3553.0,27
2123,1824,Maine,,,1.0,,,12625.0,23
384,1992,Alabama,0.408801,0.476454,0.114745,690080.0,804283.0,193697.0,1
913,1948,New Jersey,0.459313,0.503255,0.037432,895455.0,981124.0,72976.0,34
66,2016,Ohio,0.435581,0.516877,0.047543,2394164.0,2841005.0,261318.0,39
804,1956,New Hampshire,0.33845,0.661135,0.000416,90364.0,176519.0,111.0,33


In [103]:
df.head()

Unnamed: 0,year,state,pct_dem,pct_rep,pct_thirdparties,votes_dem,votes_rep,votes_thirdparties,statefips
0,2020,Wyoming,0.263879,0.694998,0.020413,73491.0,193559.0,5685.0,56
1,2020,Alabama,0.3657,0.620316,0.003147,849624.0,1441170.0,7312.0,1
2,2020,Rhode Island,0.594762,0.386704,0.00876,307486.0,199922.0,4529.0,44
3,2020,Nebraska,0.391666,0.582242,0.004884,374583.0,556846.0,4671.0,31
4,2020,Wisconsin,0.494495,0.488224,0.01728,1630866.0,1610184.0,56991.0,55
