**Author:** Revekka Gershovich 
**Date:** Dec 20, 2024
**Goal:** Merging Final Dataset with Governor Dataset
**Data Provenance:** https://github.com/jacobkap/governors/blob/master/README.md 

In [None]:
import os
import os.path as path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import re

In [None]:
# Project root. Verify it exists *before* changing into it: os.chdir raises
# on a missing directory, so checking afterwards would never show the message.
parent_dir = os.path.abspath("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
assert os.path.exists(parent_dir), "parent_dir does not exist"
os.chdir(parent_dir)

# The data directories are relative to the project root, so they can only be
# checked after the chdir above.
intermed_data_dir = "./2_data/2_intermediate/political_data"
assert os.path.exists(intermed_data_dir), "Data directory does not exist"
raw_data_dir = "./2_data/1_raw/political_data/all_partisanComposition"
assert os.path.exists(raw_data_dir), "Data directory does not exist"

In [None]:
# Full state name -> two-letter postal abbreviation (the 50 states only).
state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}


In [None]:
# Load the raw governors CSV.
governors_csv = os.path.join(raw_data_dir, "Governors_data", "united_states_governors_1775_2020.csv")
gov_data = pd.read_csv(governors_csv)

# Add the two-letter abbreviation; state names that are not keys of
# state_to_abbrev map to NaN.
gov_data['state_abbrev'] = gov_data['state'].map(state_to_abbrev)

# Keep only rows with a recognised state and a year from 1834 onward.
keep_rows = gov_data['state_abbrev'].notna() & (gov_data['year'] >= 1834)

# Drop the columns that are no longer needed. The explicit .copy() makes the
# filtered result an independent frame, so the in-place column assignments in
# later cells do not emit SettingWithCopyWarning.
gov_data = gov_data.loc[keep_rows].drop(columns=['time_in_office', 'state']).copy()


In [None]:
# Preview the first five rows of the filtered governors table.
gov_data.head(n=5)

In [None]:
# Normalise the free-text party labels so they can be matched exactly against
# the keys of the party lookup table.
party_labels = gov_data['party']

# Remove parentheses and everything inside them
party_labels = party_labels.str.replace(r"\s*\(.*?\)", "", regex=True)

# Keep only the text before the first ';'
party_labels = party_labels.str.split(';').str[0]

# Trim surrounding whitespace, then lowercase. The regex above only removes
# whitespace *before* a parenthetical, and the ';' split can leave a trailing
# space, either of which would prevent an exact dictionary match.
gov_data['party'] = party_labels.str.strip().str.lower()

# Lookup from a cleaned (lowercased) party label to one of the two canonical
# buckets, 'democrat' or 'republican'. Every value must be exactly one of those
# two strings, because the later numeric encoding only recognises them.
# NOTE(review): misspelled keys such as 'republician' and
# 'domecratic-farmer-labor' presumably reproduce spellings found in the raw
# data — verify before "correcting" them.
party_mapping = {
    'democrat': 'democrat',
    'democratic': 'democrat',
    'jacksonian democrat': 'democrat',
    'jeffersonian republican': 'democrat',
    'democratic-republican': 'democrat',
    'silver-democratic': 'democrat',
    'democrat, prohibition': 'democrat',
    'democrat, national': 'democrat',
    '1st term democrat': 'democrat',
    'union-democratic': 'democrat',
    'republican': 'republican',
    'independent-republican': 'republican',
    'liberal republican': 'republican',
    'union republican': 'republican',
    'republican organizing committee': 'republican',
    'republic': 'republican',
    'republician': 'republican',
    'military, republican': 'republican',
    'conservative republican': 'republican',
    'domecratic-farmer-labor': 'democrat',
    'jacksonian, democrat': 'democrat',
    'whig, republican': 'republican',
    'whig': 'republican',
    'anti-jacksonian, whig': 'republican',
    'republican/whig': 'republican',
    'republican 1st term': 'republican',
    'independent democrat': 'democrat',
    'american': 'republican',
    'farm-labor': 'democrat',
    'farmer-labor': 'democrat',
    'democratic-populist': 'democrat',
    'populist': 'democrat',
    'ap': 'republican',
    'republican, progressive': 'republican',
    'progressive, republican': 'republican',
    # Was 'democratic', which is not a canonical bucket and therefore ended
    # up as missing once the party was encoded numerically.
    'conservative': 'democrat',
    'whig/know-nothing': 'republican',
}


def map_party(party):
    """Return the canonical party bucket for a cleaned party label.

    Args:
        party: A party label, or a missing value (NaN/None).

    Returns:
        ``np.nan`` for a missing value; the mapped bucket ('democrat' or
        'republican') when the label is a key of ``party_mapping``; otherwise
        the label unchanged, so unclear or multi-party affiliations can be
        handled separately.
    """
    if pd.isna(party):
        return np.nan
    # Unknown labels fall through unchanged.
    return party_mapping.get(party, party)

# Labels that are treated as "no major-party affiliation" and blanked out.
parties_to_nan = [
    'independent',
    np.nan,
    'nonpartisan league, independent',
    'baptist',
    'methodist',
    'silver',
    'readjuster',
    'greenback',
    'union',
    'unionist',
    'military',
    'union democrat',
    'democrat/unionist',
    'provisional',
    'professional',
    'know-nothing',
    'federal military rule',
    'dfs',
    'f-r',
    'federalist',
    'anti-mason',
    'anti-masonic',
    "people's party and fusion party",
]

# First collapse recognised labels onto their canonical bucket, then replace
# the labels listed above with NaN.
gov_data['party'] = (
    gov_data['party']
    .apply(map_party)
    .replace(parties_to_nan, np.nan)
)

In [None]:
# Hand corrections for individual governors, applied on top of the automatic
# party mapping.

# Governors whose party depends on the year: (governor, first year, last year,
# party), with both year bounds inclusive.
year_window_fixes = [
    ('Forrest Hood James', 1979, 1983, 'democrat'),
    ('Forrest Hood James', 1995, 1999, 'republican'),
    ('Mills Edwin Godwin', 1970, 1970, 'democrat'),
    ('Mills Edwin Godwin', 1978, 1978, 'republican'),
]
for gov_name, first_year, last_year, fixed_party in year_window_fixes:
    in_window = gov_data['year'].between(first_year, last_year)
    gov_data.loc[(gov_data['governor'] == gov_name) & in_window, 'party'] = fixed_party

# Buddy Elson Roemer: rows still carrying the combined label become 'republican'.
roemer_combined = (
    (gov_data['governor'] == 'Buddy Elson Roemer')
    & (gov_data['party'] == 'democrat, republican')
)
gov_data.loc[roemer_combined, 'party'] = 'republican'

# Governors whose party is overridden on every one of their rows.
# np.nan marks a governor deliberately left without a party.
whole_record_fixes = {
    'John Franklin Shafroth': 'democrat',
    'George Rockingham Gilmer': 'republican',
    'Alexander Hamilton Stephens': 'democrat',
    'John Calvin Brown': 'democrat',
    'David Peter Lewis': 'republican',
    'John McAuley Palmer': np.nan,
    'Henry Huntly Haight': 'democrat',
    'William Henry Bissell': 'republican',
    'Charles Lynch': 'republican',
    'Edward Hazzard East': 'republican',
    'Levi Lincoln Jr.': 'republican',
}
for gov_name, fixed_party in whole_record_fixes.items():
    gov_data.loc[gov_data['governor'] == gov_name, 'party'] = fixed_party

# Encode the party numerically (democrat = 1, republican = 2); any other
# remaining label becomes NaN. The text column is then dropped.
party_codes = {'democrat': 1, 'republican': 2}
gov_data['gov_party'] = gov_data['party'].map(party_codes)
gov_data = gov_data.drop(columns='party')

# Shift every observation back by one year.
# NOTE(review): the reason for the shift is not stated here — confirm the
# intended alignment with the dataset this table is merged into.
gov_data['year'] = gov_data['year'] - 1

In [None]:
# A state-year should appear only once; flag rows that repeat an earlier pair.
key_columns = ['year', 'state_abbrev']
duplicate_check = gov_data.duplicated(subset=key_columns)

# Report whether any repeats were found.
status_message = (
    "There are duplicate state-year combinations in the dataset."
    if duplicate_check.any()
    else "All state-year combinations are unique."
)
print(status_message)

# Show every row involved in a clash (keep=False marks all members of a
# duplicated group, not just the later ones).
duplicates = gov_data.loc[gov_data.duplicated(subset=key_columns, keep=False)]
print(duplicates)

I dealt with duplicates in the data, i.e. transitional years in which the governor changed, by merging governors of the same party into a single row and dropping the governors of a different party.

In [None]:
# Collapse rows that share the same 'year', 'state_abbrev' and 'gov_party'
# into a single row, joining the distinct 'governor' names with ', '.
# Every other column keeps its first value within the group.
# NOTE: the grouping key is 'gov_party'. The 'party' entry in the exclusion
# list below names a column that was already dropped earlier in the notebook,
# so 'gov_party' itself is also passed to agg with 'first'.
# NOTE(review): groupby leaves out rows whose key is NaN by default
# (dropna=True), so governors without a gov_party code appear to be removed
# at this step already — confirm.
gov_data = (
    gov_data.groupby(['year', 'state_abbrev', 'gov_party'], as_index=False)
    .agg({'governor': lambda x: ', '.join(x.unique()), **{col: 'first' for col in gov_data.columns if col not in ['year', 'state_abbrev', 'party', 'governor']}})
)

# A state-year can still have one row per party code at this point; keep only
# the first of them.
# NOTE(review): groupby sorts its keys by default, so the surviving row is
# presumably the one with the lower gov_party code — confirm this tie-break is
# intended for years in which the party in office changed.
gov_data = gov_data.drop_duplicates(subset=['year', 'state_abbrev'])

# Ensure no duplicates remain
assert not gov_data.duplicated(subset=['year', 'state_abbrev']).any(), "Duplicates still exist in the gov data!"
print("All duplicates resolved.")

In [None]:
# Discard every state-year that has no party code.
required_columns = ['gov_party']
gov_data = gov_data.dropna(axis='index', how='any', subset=required_columns)

In [None]:
# Write the cleaned governors table to disk without the index column.
cleaned_csv_path = os.path.join(raw_data_dir, "gov_data_cleaned.csv")
gov_data.to_csv(cleaned_csv_path, index=False)