In [2]:
import os
import os.path as path
import pandas as pd
import numpy as np

In [3]:
# Project root; every relative path below is resolved against it after chdir.
# NOTE(review): this is a machine-specific absolute path and must be adjusted
# when the notebook is run on another computer.
parent_dir = os.path.abspath("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
# Verify the directory BEFORE changing into it: os.chdir raises
# FileNotFoundError on a missing path, so a check placed after it never fires.
assert os.path.exists(parent_dir), "parent_dir does not exist"
os.chdir(parent_dir)

# Data locations, relative to the project root set above.
intermed_data_dir = "./2_data/2_intermediate/political_data"
assert os.path.exists(intermed_data_dir), "Data directory does not exist"
raw_data_dir = "./2_data/1_raw/political_data"
assert os.path.exists(raw_data_dir), "Data directory does not exist"

In [4]:
# Zero-padded dataset part codes "0001" .. "0005"; each one names a DS<code>
# sub-directory and the CSV file inside it.
data_codes = [f"{i:04d}" for i in range(1, 6)]

# Loaded frames, keyed as "df_<code>" (e.g. "df_0001").
dataframes = {}

for data_code in data_codes:
    # Build the path of this part's CSV underneath the raw-data directory.
    data_file = f"ICPSR_00016_2/DS{data_code}/00016-{data_code}-Data.csv"
    data_path = path.join(raw_data_dir, data_file)
    print(data_path)
    # Stop early with a clear message rather than letting read_csv fail.
    assert os.path.exists(data_path), "Data file does not exist"
    df = pd.read_csv(data_path)
    # Keep the frame so later cells can pick a part by its key.
    dataframes[f"df_{data_code}"] = df

    # Print confirmation
    print(f"DataFrame for {data_code} loaded successfully.")


./2_data/1_raw/political_data/ICPSR_00016_2/DS0001/00016-0001-Data.csv
DataFrame for 0001 loaded successfully.
./2_data/1_raw/political_data/ICPSR_00016_2/DS0002/00016-0002-Data.csv
DataFrame for 0002 loaded successfully.
./2_data/1_raw/political_data/ICPSR_00016_2/DS0003/00016-0003-Data.csv
DataFrame for 0003 loaded successfully.
./2_data/1_raw/political_data/ICPSR_00016_2/DS0004/00016-0004-Data.csv
DataFrame for 0004 loaded successfully.
./2_data/1_raw/political_data/ICPSR_00016_2/DS0005/00016-0005-Data.csv
DataFrame for 0005 loaded successfully.


In [5]:
# Show the column labels of the frame stored under "df_0001".
dataframes["df_0001"].columns

Index(['ICPR_STATE_CODE', 'X834_PARTY_OF_GOV', 'X834_UPP_HSE_DEM_SEATS',
       'X834_UPP_HSE_W_R_SEATS', 'X834_UPP_HSE_1_OTH_SEATS',
       'X834_UPP_HSE_2_OTH_SEATS', 'X834_UPP_HSE_TOTAL_SEATS',
       'X834_LOW_HSE_DEM_SEATS', 'X834_LOW_HSE_W_R_SEATS',
       'X834_LOW_HSE_1_OTH_SEATS',
       ...
       'X877_LOW_HSE_1_OTH_SEAT', 'X877_LOW_HSE_2_OTH_SEAT',
       'X878_UPP_HSE_DEM_SEAT', 'X878_UPP_HSE_W_R_SEAT',
       'X878_UPP_HSE_1_OTH_SEAT', 'X878_UPP_HSE_2_OTH_SEAT',
       'X878_LOW_HSE_DEM_SEAT', 'X878_LOW_HSE_W_R_SEAT',
       'X878_LOW_HSE_1_OTH_SEAT', 'X878_LOW_HSE_2_OTH_SEAT'],
      dtype='object', length=856)

In [None]:
# Rebind `df` (until now the last frame read by the loading loop) to the frame
# stored under "df_0001"; the cells below operate on this frame.
df = dataframes["df_0001"]

In [30]:
# Keep the state identifier plus three 1878 lower-house columns so the two
# similarly named Democratic columns ("..._DEM_SEATS" and "..._DEM_SEAT") can
# be compared next to the seat total. The columns are listed explicitly; a
# prefix match on "X878_LOW_HSE" would pull in the whole group instead.
filtered_df = df.loc[
    :,
    [
        'ICPR_STATE_CODE',
        'X878_LOW_HSE_DEM_SEATS',
        'X878_LOW_HSE_TOTAL_SEATS',
        'X878_LOW_HSE_DEM_SEAT',
    ],
]

In [34]:
# Democratic share of lower-house seats, in percent, computed from the two
# "_SEATS" columns.
# NOTE(review): in the printed rows X878_LOW_HSE_DEM_SEAT looks like this share
# multiplied by ten (e.g. 410 next to 40.98) - confirm against the codebook.
expected_seat_percentage = (
    filtered_df["X878_LOW_HSE_DEM_SEATS"]
    .div(filtered_df["X878_LOW_HSE_TOTAL_SEATS"])
    .mul(100)
)

# Print the first rows of the selected columns, then of the computed share.
print(filtered_df.head(), expected_seat_percentage.head(), sep="\n")

   ICPR_STATE_CODE  X878_LOW_HSE_DEM_SEATS  X878_LOW_HSE_TOTAL_SEATS  \
0                1                     100                       244   
1                2                      22                       151   
2                3                      40                       240   
3                4                     100                       279   
4                5                      17                        72   

   X878_LOW_HSE_DEM_SEAT  
0                    410  
1                    146  
2                    167  
3                    358  
4                    236  
0    40.983607
1    14.569536
2    16.666667
3    35.842294
4    23.611111
dtype: float64


In [7]:
# Every column whose name starts with "X" carries a year code and a variable
# name (e.g. "X834_PARTY_OF_GOV"); these are the columns to reshape.
columns_to_transform = [name for name in df.columns if name.startswith("X")]

# Wide -> long: one row per (state, original column), then split the original
# column name into a calendar year and a variable name.
#   * year: the digits right after "X" plus 1000, so "X834" becomes 1834.
#     NOTE(review): the pattern also accepts four digits, for which adding
#     1000 would overshoot - confirm no such column names occur.
#   * variable: everything after the first underscore.
melted_df = (
    df.melt(
        id_vars=["ICPR_STATE_CODE"],
        value_vars=columns_to_transform,
        var_name="year_variable",
        value_name="value",
    )
    .assign(
        year=lambda long_rows: (
            long_rows["year_variable"]
            .str.extract(r'X(\d{3,4})', expand=False)
            .astype(int)
            + 1000
        ),
        variable=lambda long_rows: (
            long_rows["year_variable"].str.split("_", n=1).str[1]
        ),
    )
    # The combined name is no longer needed once it has been split.
    .drop(columns=["year_variable"])
)

# Long -> state-year table: one row per (state, year), one column per variable.
# pivot_table aggregates with its default (mean) should a state/year/variable
# combination appear more than once.
# NOTE(review): values such as 9999 / 999 show up in the result and look like
# missing-data codes - confirm against the codebook before analysing.
reshaped_df = pd.pivot_table(
    melted_df,
    index=["ICPR_STATE_CODE", "year"],
    columns="variable",
    values="value",
).reset_index()


In [8]:
# Preview the first rows of the reshaped table.
reshaped_df.head()

variable,ICPR_STATE_CODE,year,LOW_HSE_1_OTH_SEAT,LOW_HSE_1_OTH_SEATS,LOW_HSE_2_OTH_SEAT,LOW_HSE_2_OTH_SEATS,LOW_HSE_DEM_SEAT,LOW_HSE_DEM_SEATS,LOW_HSE_TOTAL_SEATS,LOW_HSE_W_R_SEAT,...,PARTY_OF_GOV,UPP_HSE_1_OTH_SEAT,UPP_HSE_1_OTH_SEATS,UPP_HSE_2_OTH_SEAT,UPP_HSE_2_OTH_SEATS,UPP_HSE_DEM_SEAT,UPP_HSE_DEM_SEATS,UPP_HSE_TOTAL_SEATS,UPP_HSE_W_R_SEAT,UPP_HSE_W_R_SEATS
0,1,1834,0.0,0.0,0.0,0.0,243.0,50.0,206.0,757.0,...,29.0,0.0,0.0,0.0,0.0,190.0,4.0,21.0,810.0,17.0
1,1,1835,0.0,0.0,0.0,0.0,612.0,126.0,206.0,388.0,...,100.0,0.0,0.0,0.0,0.0,762.0,16.0,21.0,238.0,5.0
2,1,1836,0.0,0.0,0.0,0.0,646.0,133.0,206.0,354.0,...,100.0,0.0,0.0,0.0,0.0,810.0,17.0,21.0,190.0,4.0
3,1,1837,0.0,0.0,9999.0,999.0,660.0,140.0,212.0,340.0,...,100.0,0.0,0.0,0.0,0.0,667.0,14.0,21.0,333.0,7.0
4,1,1838,0.0,0.0,0.0,0.0,274.0,40.0,146.0,726.0,...,29.0,0.0,0.0,0.0,0.0,48.0,1.0,21.0,952.0,20.0
