In [None]:
import os
import os.path as path
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import glob

In [None]:
# Project root. Set the STATELAWS_DIR environment variable to run this notebook on
# another machine; otherwise the original author's Dropbox location is used.
parent_dir = os.path.abspath(
    os.environ.get("STATELAWS_DIR", "/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
)
# Validate BEFORE changing directory: os.chdir raises FileNotFoundError on a missing
# path, which would otherwise pre-empt this (more descriptive) assertion.
assert os.path.isdir(parent_dir), f"parent_dir does not exist: {parent_dir}"
os.chdir(parent_dir)

# The data directories below are relative to parent_dir (the working directory set above).
intermed_data_dir = "./2_data/2_intermediate/political_data"
assert os.path.isdir(intermed_data_dir), f"Data directory does not exist: {intermed_data_dir}"
raw_data_dir = "./2_data/1_raw/political_data/governor_returns"
assert os.path.isdir(raw_data_dir), f"Data directory does not exist: {raw_data_dir}"

In [None]:
# Find all raw export files; sorting makes the concatenation order deterministic.
files = sorted(glob.glob(os.path.join(raw_data_dir, "export*.csv")))

# Fail early with a clear message: pd.concat on an empty sequence raises an
# unhelpful "No objects to concatenate" ValueError.
assert files, f"No files matching 'export*.csv' found in {raw_data_dir}"

# Load and concatenate them. skiprows=2 skips the first two lines of every file
# (presumably a preamble above the header row -- confirm against the raw exports).
df = pd.concat((pd.read_csv(f, skiprows=2) for f in files), ignore_index=True)

In [None]:
# Party labels whose votes are treated as Republican-aligned when a race has no
# Republican vote count (used further down to fill `republican_votes`).
republican_third_parties = [
    'Republican, Prohibition, and Progressive',
    'Lily-White Republican',
    'Republican-Greenback-Fusion',
    'Republican and Temperance',
    'Conservative Republican',
    'Conservative',
    'Union',
    'Whig',
    'Opposition',
    'Distributionist Candidate',
    'American',
    'Fremont American',
    'Fusion',
    'American and Republican',
    'Know-Nothing',
    'Whig Anti-Rent',
    'Liberation Whig',
    'Law and Order',
    'Independent Whig',
    'Anti-Masonic',
    'State Rights',
    'Constitution',
    'Anti-Mason',
    'National Republican',
    'Anti-Jackson',
    'Clay Republican',
    'Democrat-Republican',
    'Adams Republican',
    'Federalist',
    'Clinton Republican',
    'Republican-Federalist Fusion',
    'Old Republican',
]

# Party labels whose votes are treated as Democratic-aligned when a race has no
# Democratic vote count (used further down to fill `democratic_votes`).
democratic_third_parties = [
    'Democrat',
    'Independent Democrat',
    'Democrat-Labor-Peoples',
    'Democrat-Fusion',
    'Democrat-Peoples',
    "People's Party",
    'Populist',
    'Liberal',
    'Democrat-National Green Labor',
    'Low Tax Democrat',
    'State Credit Democrat',
    'Butler Democrat and Greenback',
    'Douglas Democrat',
    'Anti-Redemption Democrat',
    'Jackson',
    'Democrat Farmer-Labor',
    'Farmer-Labor',
]

In [None]:
# Keep gubernatorial races only. The explicit .copy() makes `df` an independent frame,
# so the column assignments below do not raise SettingWithCopyWarning or write into a
# temporary view of the concatenated raw data.
df = df[df['Office'] == "Governor"].copy()

# Dictionary mapping state names to abbreviations
state_to_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
    'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA',
    'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM',
    'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'}

# Add a new column with the abbreviations; any 'Area' value that is not a key of
# state_to_abbrev maps to NaN.
df['state_abbrev'] = df['Area'].map(state_to_abbrev)

# Drop the columns that are not needed downstream ...
df = df.drop(columns=['Area', 'CensusPop', 'Office', 'RaceTypeName', 'RepStatus', 'RepCandidate',
                      'DemStatus', 'DemCandidate', 'ThirdStatus', 'ThirdCandidate', 'RaceNotes'])

# ... and rename the remaining ones for clarity (no inplace=True: rebinding keeps the
# cell free of in-place mutation on a possibly shared frame).
df = df.rename(columns={'raceYear': 'year', 'RepVotes': 'republican_votes', 'DemVotes': 'democratic_votes',
                        'ThirdVotes': 'third_party_votes', 'OtherVotes': 'other_votes',
                        'ThirdParty': 'third_party_name', 'PluralityParty': 'plurality_party_abbrev',
                        'PluralityVotes': 'plurality_votes', 'ThirdVotesTotalPercent': 'third_party_total_vote_share',
                        'RepVotesMajorPercent': 'republican_major_vote_share', 'DemVotesMajorPercent': 'democratic_major_vote_share'})

# Keep only the first 4 characters of the year value and store it as an integer.
df['year'] = df['year'].astype(str).str[:4].astype(int)

cols_to_convert = ['republican_votes', 'democratic_votes', 'third_party_votes', 'other_votes', 'plurality_votes',
                   'republican_major_vote_share', 'democratic_major_vote_share', 'third_party_total_vote_share']

# Everything except digits, '.' and '-' is stripped from string cells before parsing.
_non_numeric_chars = re.compile(r'[^\d.-]')


def _strip_non_numeric(value):
    """Remove non-numeric characters from string cells; pass other values through unchanged."""
    return _non_numeric_chars.sub('', value) if isinstance(value, str) else value


for col in cols_to_convert:
    # Clean element-wise instead of via the .str accessor: .str raises on columns that
    # read_csv already parsed as numeric, and yields NaN for non-string cells in mixed
    # object columns (possible after concatenating several exports).
    df[col] = pd.to_numeric(df[col].map(_strip_non_numeric), errors='coerce')

# Total votes per race; missing components are skipped in the sum.
df['total_votes'] = df[['republican_votes', 'democratic_votes', 'third_party_votes', 'other_votes']].sum(axis=1, skipna=True)
# Each major party's share of the total vote, in percent, rounded to 2 decimals.
df['republican_total_vote_share'] = ((df['republican_votes'] / df['total_votes'])*100).round(2)
df['democratic_total_vote_share'] = ((df['democratic_votes'] / df['total_votes'])*100).round(2)

In [None]:
# Fold aligned third-party results into the major-party columns.
# For each major party: where that party's vote count is missing and the third party's
# name is in the corresponding aligned list, move the third-party votes into the
# major-party column and blank the third-party vote/share fields.
aligned_third_parties = {
    'republican': republican_third_parties,
    'democratic': democratic_third_parties,
}

for party, aligned_names in aligned_third_parties.items():
    votes_col = f'{party}_votes'
    share_col = f'{party}_total_vote_share'

    # Rows where this party's votes are NaN and the third party is aligned with it
    mask = df[votes_col].isna() & df['third_party_name'].isin(aligned_names)

    # Copy the third-party votes into the major-party column
    df.loc[mask, votes_col] = df.loc[mask, 'third_party_votes']

    # Remove the values from the third-party columns by setting them to NaN
    df.loc[mask, ['third_party_votes', 'third_party_total_vote_share']] = np.nan

    # The total-vote share was computed before the move (while the votes were NaN), so
    # refresh it for the reassigned rows. total_votes itself is unaffected: the votes
    # only changed column. The *_major_vote_share columns come from the raw export and
    # are left untouched here.
    df.loc[mask, share_col] = ((df.loc[mask, votes_col] / df.loc[mask, 'total_votes']) * 100).round(2)

# Drop rows that still lack a Republican OR a Democratic vote count while the third
# party took at least 10% of the total vote. Rows reassigned above are never dropped
# here, because their third_party_total_vote_share is now NaN.
# (The original note reported this as 4.43% of the data -- not re-verified.)
missing_major_party_votes = df["republican_votes"].isna() | df["democratic_votes"].isna()
df = df[~(missing_major_party_votes & (df["third_party_total_vote_share"] >= 10))].copy()

In [None]:
# Diagnostics on the current `df`: how many races lack a major-party vote count, how
# many have a third party with at least 10% of the total vote, and how many have both.
# NOTE(review): if the preceding cell's filter has already run, the "both" subset is
# expected to be empty.
has_missing_major_votes = df["republican_votes"].isna() | df["democratic_votes"].isna()
has_major_third_party = df['third_party_total_vote_share'] >= 10

df_missing_votes = df[has_missing_major_votes]
df_major_third_party = df[has_major_third_party]
df_missing_votes_major_third_party = df[has_missing_major_votes & has_major_third_party]

n_rows = len(df)
n_both = len(df_missing_votes_major_third_party)

print(f"Number of rows in the dataset: {n_rows}")
print(f"Number of rows with missing votes: {len(df_missing_votes)}")
print(f"Number of rows with major third party: {len(df_major_third_party)}")
print(f"Number of rows with missing votes and major third party: {n_both}")

print(f"Percent of rows with missing votes for reps or dems and a major third party: {n_both / n_rows * 100:.2f}%")

In [None]:
# Preview five random rows; the seed is fixed so the preview is reproducible.
# (Original author's note: random_state=9 selects 19th-century rows.)
df.sample(n=5, random_state=15)