In [2]:
import os
import os.path as path
import pandas as pd
import numpy as np

In [3]:
# Project root. Defaults to the original Dropbox location, but can be overridden
# with the STATELAWS_DIR environment variable so the notebook also runs on
# machines where the project lives elsewhere.
parent_dir = os.path.abspath(
    os.environ.get("STATELAWS_DIR", "/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
)
# Validate *before* changing directory: os.chdir raises on a missing path, so an
# assert placed after it could never report anything.
assert os.path.exists(parent_dir), f"parent_dir does not exist: {parent_dir}"
os.chdir(parent_dir)

# Data directories, relative to the project root set above.
intermed_data_dir = "./2_data/2_intermediate/political_data"
assert os.path.exists(intermed_data_dir), (
    f"Intermediate data directory does not exist: {intermed_data_dir}"
)
raw_data_dir = "./2_data/1_raw/political_data"
assert os.path.exists(raw_data_dir), (
    f"Raw data directory does not exist: {raw_data_dir}"
)

In [58]:
# Zero-padded dataset codes "0001" .. "0005", used in both the folder name
# (DS0001) and the file name (00016-0001-Data.csv).
data_codes = [f"{i:04d}" for i in range(1, 6)]

# Maps "df_<code>" -> DataFrame read from that dataset's CSV file.
dataframes = {}

for data_code in data_codes:
    data_file = f"ICPSR_00016_2/DS{data_code}/00016-{data_code}-Data.csv"
    data_path = path.join(raw_data_dir, data_file)
    print(data_path)
    assert os.path.exists(data_path), "Data file does not exist"
    # Store the frame straight into the dictionary instead of going through a
    # notebook-level `df` name, which would stay bound to the last dataset
    # after the loop and could be picked up by later cells by accident.
    dataframes[f"df_{data_code}"] = pd.read_csv(data_path)

    # Print confirmation
    print(f"DataFrame for {data_code} loaded successfully.")


./2_data/1_raw/political_data/ICPSR_00016_2/DS0001/00016-0001-Data.csv
DataFrame for 0001 loaded successfully.
./2_data/1_raw/political_data/ICPSR_00016_2/DS0002/00016-0002-Data.csv
DataFrame for 0002 loaded successfully.
./2_data/1_raw/political_data/ICPSR_00016_2/DS0003/00016-0003-Data.csv
DataFrame for 0003 loaded successfully.
./2_data/1_raw/political_data/ICPSR_00016_2/DS0004/00016-0004-Data.csv
DataFrame for 0004 loaded successfully.
./2_data/1_raw/political_data/ICPSR_00016_2/DS0005/00016-0005-Data.csv
DataFrame for 0005 loaded successfully.


In [59]:
# Use the frame stored under key "df_0001" (read from the DS0001 CSV) as the
# working DataFrame for the cells that follow.
df = dataframes["df_0001"]

In [60]:
# Values treated as "missing" sentinels; every occurrence, in any column, is
# turned into NaN.
# NOTE(review): the replacement is applied to all columns alike — confirm
# against the codebook that neither code is a legitimate value anywhere.
missing_codes = [999, 9999]
df = df.replace(to_replace=missing_codes, value=np.nan)

# Columns whose names end in "OTH_SEATS" or "OTH_SEAT" are removed.
oth_suffixes = ("OTH_SEATS", "OTH_SEAT")
columns_to_drop = [name for name in df.columns if name.endswith(oth_suffixes)]
df = df.drop(columns=columns_to_drop)

In [62]:
# Identify columns ending with '_SEAT'
seat_columns = [col for col in df.columns if col.endswith('_SEAT')]

# Correct the formatting by dividing these columns by 10
df[seat_columns] = df[seat_columns]/10

# View the corrected DataFrame
print(df[seat_columns].head())

   X834_UPP_HSE_DEM_SEAT  X834_UPP_HSE_W_R_SEAT  X834_LOW_HSE_DEM_SEAT  \
0                   19.0                   81.0                   24.3   
1                   60.0                   40.0                   58.2   
2                    2.5                   82.5                   21.7   
3                  100.0                    0.0                   72.1   
4                  100.0                    0.0                   38.9   

   X834_LOW_HSE_W_R_SEAT  X835_UPP_HSE_DEM_SEAT  X835_UPP_HSE_W_R_SEAT  \
0                   75.7                   76.2                   23.8   
1                   41.8                    NaN                    NaN   
2                   77.4                   43.9                   53.7   
3                   27.9                   91.7                    8.3   
4                   54.2                   33.3                   66.7   

   X835_LOW_HSE_DEM_SEAT  X835_LOW_HSE_W_R_SEAT  X836_UPP_HSE_DEM_SEAT  \
0                   61.2            

In [63]:
# Preview the first rows of the working DataFrame.
df.head()

Unnamed: 0,ICPR_STATE_CODE,X834_PARTY_OF_GOV,X834_UPP_HSE_DEM_SEATS,X834_UPP_HSE_W_R_SEATS,X834_UPP_HSE_TOTAL_SEATS,X834_LOW_HSE_DEM_SEATS,X834_LOW_HSE_W_R_SEATS,X834_LOW_HSE_TOTAL_SEATS,X835_PARTY_OF_GOV,X835_UPP_HSE_DEM_SEATS,...,X876_LOW_HSE_DEM_SEAT,X876_LOW_HSE_W_R_SEAT,X877_UPP_HSE_DEM_SEAT,X877_UPP_HSE_W_R_SEAT,X877_LOW_HSE_DEM_SEAT,X877_LOW_HSE_W_R_SEAT,X878_UPP_HSE_DEM_SEAT,X878_UPP_HSE_W_R_SEAT,X878_LOW_HSE_DEM_SEAT,X878_LOW_HSE_W_R_SEAT
0,1,29.0,4.0,17.0,21.0,50.0,156.0,206.0,100.0,16.0,...,42.7,57.3,42.9,57.1,44.7,55.3,33.3,66.7,41.0,58.2
1,2,100.0,15.0,10.0,25.0,99.0,71.0,170.0,100.0,,...,20.5,79.5,12.9,87.1,34.4,64.2,0.0,64.5,14.6,43.0
2,3,29.0,1.0,33.0,40.0,80.0,285.0,368.0,29.0,18.0,...,22.9,74.2,12.5,87.5,28.7,71.2,10.0,85.0,16.7,77.1
3,4,100.0,12.0,0.0,12.0,163.0,63.0,226.0,100.0,11.0,...,46.0,54.0,33.3,66.7,40.9,59.1,16.7,83.3,35.8,60.2
4,5,,10.0,0.0,10.0,28.0,39.0,72.0,,2.0,...,13.9,86.1,22.2,77.8,25.0,75.0,30.6,69.4,23.6,76.4


In [7]:
# Wide-format measurement columns are named "X<year digits>_<variable>",
# e.g. "X834_PARTY_OF_GOV"; everything starting with "X" is reshaped.
columns_to_transform = [col for col in df.columns if col.startswith("X")]

# Melt the DataFrame to stack all relevant columns: one row per
# (state code, original column name) pair.
melted_df = df.melt(id_vars=["ICPR_STATE_CODE"],
                    value_vars=columns_to_transform,
                    var_name="year_variable",
                    value_name="value")

# Extract the year digits. expand=False yields a Series, not a one-column
# DataFrame, so the result can be assigned to a single column cleanly.
year_digits = melted_df["year_variable"].str.extract(r'X(\d{3,4})', expand=False)
year_number = year_digits.astype(int)
# Three-digit codes omit the leading "1" (X834 -> 1834), so 1000 is added.
# Four-digit codes are kept as they are; adding 1000 unconditionally would
# turn e.g. X1834 into 2834.
melted_df["year"] = year_number.where(year_digits.str.len() == 4, year_number + 1000)

# Everything after the first underscore is the variable name.
melted_df["variable"] = melted_df["year_variable"].str.split("_", n=1).str[1]

# Drop the original column with combined year and variable
melted_df = melted_df.drop(columns=["year_variable"])

# Reshape the DataFrame: each unique variable becomes its column.
# NOTE: pivot_table aggregates with the mean by default, so duplicate
# (state, year, variable) entries would be averaged silently; all-NaN
# columns are dropped by its default dropna=True.
reshaped_df = melted_df.pivot_table(index=["ICPR_STATE_CODE", "year"],
                                    columns="variable",
                                    values="value").reset_index()


In [8]:
# Preview the first rows of the reshaped frame (indexed by state code and year
# before reset_index, one column per variable).
reshaped_df.head()

variable,ICPR_STATE_CODE,year,LOW_HSE_1_OTH_SEAT,LOW_HSE_1_OTH_SEATS,LOW_HSE_2_OTH_SEAT,LOW_HSE_2_OTH_SEATS,LOW_HSE_DEM_SEAT,LOW_HSE_DEM_SEATS,LOW_HSE_TOTAL_SEATS,LOW_HSE_W_R_SEAT,...,PARTY_OF_GOV,UPP_HSE_1_OTH_SEAT,UPP_HSE_1_OTH_SEATS,UPP_HSE_2_OTH_SEAT,UPP_HSE_2_OTH_SEATS,UPP_HSE_DEM_SEAT,UPP_HSE_DEM_SEATS,UPP_HSE_TOTAL_SEATS,UPP_HSE_W_R_SEAT,UPP_HSE_W_R_SEATS
0,1,1834,0.0,0.0,0.0,0.0,243.0,50.0,206.0,757.0,...,29.0,0.0,0.0,0.0,0.0,190.0,4.0,21.0,810.0,17.0
1,1,1835,0.0,0.0,0.0,0.0,612.0,126.0,206.0,388.0,...,100.0,0.0,0.0,0.0,0.0,762.0,16.0,21.0,238.0,5.0
2,1,1836,0.0,0.0,0.0,0.0,646.0,133.0,206.0,354.0,...,100.0,0.0,0.0,0.0,0.0,810.0,17.0,21.0,190.0,4.0
3,1,1837,0.0,0.0,9999.0,999.0,660.0,140.0,212.0,340.0,...,100.0,0.0,0.0,0.0,0.0,667.0,14.0,21.0,333.0,7.0
4,1,1838,0.0,0.0,0.0,0.0,274.0,40.0,146.0,726.0,...,29.0,0.0,0.0,0.0,0.0,48.0,1.0,21.0,952.0,20.0
