**Created by:** Revekka Gershovich
**When:** Dec 3, 2024
**Purpose:** Cleaning state partisan composition data

Note that the "Republican" and "Democratic" categories each merge several ancestor and closely aligned parties that political scientists commonly treat as one. That is why there is data for the Republican party before 1854. Democrats (1 in pres_gov_party) also include the Jackson (Democrats) party (1829-1854). Republicans (2 in pres_gov_party) also include the Whig party (1834-1856), the Anti-Jacksonian party (1824-1837), and the Opposition Coalition, which comprises the Whig, Republican, and Free Soil parties (1850-1856).

In [73]:
import os
import os.path as path
import pandas as pd
import numpy as np

In [74]:
# Project root; every data path below is relative to it.
# NOTE(review): this is an absolute, machine-specific path — anyone else running
# the notebook has to edit it.
parent_dir = os.path.abspath("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")

# Verify the root exists *before* changing into it, so that a missing folder is
# reported by this assertion instead of by a FileNotFoundError from os.chdir.
assert os.path.exists(parent_dir), "parent_dir does not exist"
os.chdir(parent_dir)

# Relative data folders; they resolve against parent_dir because of the chdir above.
intermed_data_dir = "./2_data/2_intermediate/political_data"
assert os.path.exists(intermed_data_dir), "Data directory does not exist"
raw_data_dir = "./2_data/1_raw/political_data"
assert os.path.exists(raw_data_dir), "Data directory does not exist"

In [75]:
# Zero-padded part numbers "0001" ... "0005"; each one appears in both the
# folder name (DS0001) and the file name (00016-0001-Data.csv).
data_codes = [f"{i:04d}" for i in range(1, 6)]

# Loaded parts, keyed as "df_0001", "df_0002", ...
dataframes = {}

for data_code in data_codes:
    data_file = f"ICPSR_00016_2/DS{data_code}/00016-{data_code}-Data.csv"
    data_path = path.join(raw_data_dir, data_file)
    print(data_path)
    # Fail early with a clear message rather than inside pd.read_csv.
    assert os.path.exists(data_path), "Data file does not exist"
    df = pd.read_csv(data_path)
    # Save the dataframe in the dictionary
    dataframes[f"df_{data_code}"] = df

    # Print confirmation
    print(f"DataFrame for {data_code} loaded successfully.")


./2_data/1_raw/political_data/ICPSR_00016_2/DS0001/00016-0001-Data.csv
DataFrame for 0001 loaded successfully.
./2_data/1_raw/political_data/ICPSR_00016_2/DS0002/00016-0002-Data.csv
DataFrame for 0002 loaded successfully.
./2_data/1_raw/political_data/ICPSR_00016_2/DS0003/00016-0003-Data.csv
DataFrame for 0003 loaded successfully.
./2_data/1_raw/political_data/ICPSR_00016_2/DS0004/00016-0004-Data.csv
DataFrame for 0004 loaded successfully.
./2_data/1_raw/political_data/ICPSR_00016_2/DS0005/00016-0005-Data.csv
DataFrame for 0005 loaded successfully.


In [76]:
# Work on the first loaded part (DS0001); the cells below clean and reshape this frame.
df = dataframes["df_0001"]

In [77]:
# Inspect the raw column names: apart from the state code they follow the
# pattern X<year digits>_<variable>.
df.columns

Index(['ICPR_STATE_CODE', 'X834_PARTY_OF_GOV', 'X834_UPP_HSE_DEM_SEATS',
       'X834_UPP_HSE_W_R_SEATS', 'X834_UPP_HSE_1_OTH_SEATS',
       'X834_UPP_HSE_2_OTH_SEATS', 'X834_UPP_HSE_TOTAL_SEATS',
       'X834_LOW_HSE_DEM_SEATS', 'X834_LOW_HSE_W_R_SEATS',
       'X834_LOW_HSE_1_OTH_SEATS',
       ...
       'X877_LOW_HSE_1_OTH_SEAT', 'X877_LOW_HSE_2_OTH_SEAT',
       'X878_UPP_HSE_DEM_SEAT', 'X878_UPP_HSE_W_R_SEAT',
       'X878_UPP_HSE_1_OTH_SEAT', 'X878_UPP_HSE_2_OTH_SEAT',
       'X878_LOW_HSE_DEM_SEAT', 'X878_LOW_HSE_W_R_SEAT',
       'X878_LOW_HSE_1_OTH_SEAT', 'X878_LOW_HSE_2_OTH_SEAT'],
      dtype='object', length=856)

In [78]:
# Replace the values 999 and 9999 with NaN everywhere in the frame
# (presumably the dataset's missing-value codes — confirm against the codebook).
df = df.replace([999, 9999], np.nan)

# Columns for the "other party" seat counts/shares: names ending in
# "OTH_SEATS" or "OTH_SEAT".
other_party_suffixes = ("OTH_SEATS", "OTH_SEAT")
columns_to_drop = [name for name in df.columns if name.endswith(other_party_suffixes)]

# Remove them; only the DEM, W_R, TOTAL and PARTY_OF_GOV columns remain.
df = df.drop(columns=columns_to_drop)

In [79]:
# Columns named '..._SEAT' (singular) hold percentages, not seat counts; they are
# kept to cross-check proportions computed from the '..._SEATS' count columns.
seat_columns = [col for col in df.columns if col.endswith('_SEAT')]

# Target names: the same columns with '_SEAT' swapped for '_PCT'.
renamed_columns = {col: col.replace('_SEAT', '_PCT') for col in seat_columns}

# Divide the stored values by 10 to fix their formatting, then apply the new names.
df[seat_columns] = df[seat_columns].div(10)
df = df.rename(columns=renamed_columns)

In [84]:
# Everything that is not a percentage column: the state code, the governor's
# party code and the seat counts.
non_pct_columns = list(filter(lambda col: not col.endswith('_PCT'), df.columns))

# Cast those columns to pandas' nullable integer dtype ('Int64'), which can
# represent missing values alongside integers.
df[non_pct_columns] = df[non_pct_columns].astype('Int64')

In [85]:
# Preview the cleaned wide frame: one row per state, one column per year/variable pair.
df.head()

Unnamed: 0,ICPR_STATE_CODE,X834_PARTY_OF_GOV,X834_UPP_HSE_DEM_SEATS,X834_UPP_HSE_W_R_SEATS,X834_UPP_HSE_TOTAL_SEATS,X834_LOW_HSE_DEM_SEATS,X834_LOW_HSE_W_R_SEATS,X834_LOW_HSE_TOTAL_SEATS,X835_PARTY_OF_GOV,X835_UPP_HSE_DEM_SEATS,...,X876_LOW_HSE_DEM_PCT,X876_LOW_HSE_W_R_PCT,X877_UPP_HSE_DEM_PCT,X877_UPP_HSE_W_R_PCT,X877_LOW_HSE_DEM_PCT,X877_LOW_HSE_W_R_PCT,X878_UPP_HSE_DEM_PCT,X878_UPP_HSE_W_R_PCT,X878_LOW_HSE_DEM_PCT,X878_LOW_HSE_W_R_PCT
0,1,29.0,4,17,21,50,156,206,100.0,16.0,...,42.7,57.3,42.9,57.1,44.7,55.3,33.3,66.7,41.0,58.2
1,2,100.0,15,10,25,99,71,170,100.0,,...,20.5,79.5,12.9,87.1,34.4,64.2,0.0,64.5,14.6,43.0
2,3,29.0,1,33,40,80,285,368,29.0,18.0,...,22.9,74.2,12.5,87.5,28.7,71.2,10.0,85.0,16.7,77.1
3,4,100.0,12,0,12,163,63,226,100.0,11.0,...,46.0,54.0,33.3,66.7,40.9,59.1,16.7,83.3,35.8,60.2
4,5,,10,0,10,28,39,72,,2.0,...,13.9,86.1,22.2,77.8,25.0,75.0,30.6,69.4,23.6,76.4


In [86]:
# Year-specific columns are the ones whose names start with "X"
# (e.g. X834_PARTY_OF_GOV); ICPR_STATE_CODE is the only identifier column.
columns_to_transform = [col for col in df.columns if col.startswith("X")]

# Melt the DataFrame to stack all relevant columns: one row per
# (state, original column name) pair.
melted_df = df.melt(id_vars=["ICPR_STATE_CODE"],
                    value_vars=columns_to_transform,
                    var_name="year_variable",
                    value_name="value")

# Split "X834_PARTY_OF_GOV" into year (834 + 1000 = 1834) and variable
# ("PARTY_OF_GOV"). expand=False makes str.extract return a Series, so the
# arithmetic is done on a Series rather than a one-column DataFrame.
melted_df["year"] = (
    melted_df["year_variable"].str.extract(r'X(\d{3,4})', expand=False).astype(int) + 1000
)
melted_df["variable"] = melted_df["year_variable"].str.split("_", n=1).str[1]

# Drop the original column with combined year and variable
melted_df = melted_df.drop(columns=["year_variable"])

# pivot_table aggregates with the mean by default, so a repeated
# state/year/variable key would be averaged silently (meaningless for party
# codes). Make sure every key is unique before pivoting.
key_columns = ["ICPR_STATE_CODE", "year", "variable"]
assert not melted_df.duplicated(subset=key_columns).any(), (
    "Duplicate state/year/variable rows would be averaged by pivot_table"
)

# Reshape the DataFrame: one row per state-year, each unique variable becomes its column
reshaped_df = melted_df.pivot_table(index=["ICPR_STATE_CODE", "year"],
                                    columns="variable",
                                    values="value").reset_index()


In [83]:
# List the distinct values found in the PARTY_OF_GOV column after reshaping.
party_of_gov_values = reshaped_df['PARTY_OF_GOV'].unique()
print(party_of_gov_values)

[  29.  100.  310.  311.  200.   nan   35.  768. 1063.  843.  553.  315.
  636.  605.  330.]


In [88]:
# Spot-check five random state-year rows; the fixed seed makes the preview
# show the same rows on every run.
reshaped_df.sample(5, random_state=0)

variable,ICPR_STATE_CODE,year,LOW_HSE_DEM_PCT,LOW_HSE_DEM_SEATS,LOW_HSE_TOTAL_SEATS,LOW_HSE_W_R_PCT,LOW_HSE_W_R_SEATS,PARTY_OF_GOV,UPP_HSE_DEM_PCT,UPP_HSE_DEM_SEATS,UPP_HSE_TOTAL_SEATS,UPP_HSE_W_R_PCT,UPP_HSE_W_R_SEATS
668,41,1878,91.0,91.0,100.0,3.0,3.0,100.0,93.9,31.0,33.0,0.0,0.0
478,23,1843,88.7,47.0,53.0,11.3,6.0,100.0,100.0,18.0,18.0,0.0,0.0
924,71,1875,80.0,64.0,80.0,15.0,12.0,100.0,50.0,20.0,40.0,15.0,6.0
865,54,1841,48.0,36.0,75.0,52.0,39.0,29.0,52.0,13.0,25.0,48.0,12.0
54,2,1843,66.4,85.0,128.0,33.6,43.0,100.0,88.0,22.0,25.0,12.0,3.0
