In [65]:
import pandas as pd
import numpy as np
import os

In [66]:
# Project root. Defaults to the original author's Dropbox folder; set the
# STATELAWS_DIR environment variable to point the notebook at another checkout
# without editing this cell. An empty value falls back to the default.
parent_dir = os.path.abspath(
    os.environ.get("STATELAWS_DIR")
    or "/Users/revekkagershovich/Dropbox (MIT)/StateLaws"
)
assert os.path.exists(parent_dir), "parent_dir does not exist"

# Directory holding the intermediate political data, located below the project root.
data_dir = os.path.join(parent_dir, "./2_data/2_intermediate/political_data")
assert os.path.exists(data_dir), "Data directory does not exist"

In [67]:
# Load the political composition table (a CSV file) from the intermediate data directory
political_data = pd.read_csv(os.path.join(data_dir, "./political_composition.csv"))

In [68]:
# Preview the first rows of the freshly loaded table (plain-text rendering via print)
print(political_data.head())

  state_abbrev  yr_rd2  shr_dem_in_sess  shr_rep_in_sess  dem_upphse  \
0           AL    1834         0.527778         0.472222    0.555556   
1           CT    1834         0.216597         0.783403    0.190476   
2           DE    1834         0.333333         0.666667    0.333333   
3          Fed    1834         0.557325         0.321656    0.446429   
4           GA    1834         0.704487         0.295513    0.692308   

   dem_lowhse  rep_upphse  rep_lowhse  gov_party  
0    0.500000    0.444444    0.500000        1.0  
1    0.242718    0.809524    0.757282        NaN  
2    0.333333    0.666667    0.666667        1.0  
3    0.581395    0.517857    0.279070        1.0  
4    0.716667    0.307692    0.283333        NaN  


In [69]:
# From here on the time column is called 'year' (the CSV names it 'yr_rd2').
political_data = political_data.rename(columns={'yr_rd2': 'year'})

# For each state, find the earliest year in which `gov_party` is recorded.
# Years without a governor party are masked to NaN before taking the per-state
# minimum, so a state with no governor data at all ends up with NaN.
has_governor = political_data['gov_party'].notna()
first_governor_year = (
    political_data['year']
    .where(has_governor)
    .groupby(political_data['state_abbrev'])
    .transform('min')
)

# Keep only rows from that first year onward. A comparison against NaN is
# False, so rows of states without any governor data are dropped as well.
political_data = political_data[political_data['year'] >= first_governor_year].copy()




In [70]:
# Add 'yr_rd2' as 'year' rounded to two decimal places.
# NOTE(review): Series.round(2) rounds to 2 decimals, not to the nearest
# multiple of 2. For integer years the value is unchanged, so the filter below
# keeps every row with a non-missing year. If the intent was to keep only even
# (biennial) years, this needs something like `year % 2 == 0` -- confirm intent.
political_data = political_data.assign(yr_rd2=political_data['year'].round(2))

# Keep only the rows whose 'year' equals its rounded counterpart.
unchanged_by_rounding = political_data['year'].eq(political_data['yr_rd2'])
political_data = political_data.loc[unchanged_by_rounding]

In [71]:
# Spot-check five randomly drawn rows; no random_state is passed, so the rows differ between runs
print(political_data.sample(5))

     state_abbrev  year  shr_dem_in_sess  shr_rep_in_sess  dem_upphse  \
175            NY  1846         0.579700         0.420300    0.725806   
2465           IL  1956         0.428105         0.571895    0.372549   
2648           WV  1962         0.800625         0.199375    0.781250   
2637           PA  1962         0.509524         0.490476    0.500000   
1258           AL  1904         0.976099         0.023901    0.971429   

      dem_lowhse  rep_upphse  rep_lowhse  gov_party  yr_rd2  
175     0.433594    0.274194    0.566406        1.0    1846  
2465    0.483660    0.627451    0.516340        2.0    1956  
2648    0.820000    0.218750    0.180000        1.0    1962  
2637    0.519048    0.500000    0.480952        1.0    1962  
1258    0.980769    0.028571    0.019231        1.0    1904  


In [72]:
# Step 1: Order rows by state and then by year, so that forward-filling below
# carries the most recent earlier observation of the same state.
political_data = political_data.sort_values(by=['state_abbrev', 'year'])

# Step 2: Forward-fill 'state_abbrev' within its own groups.
# NOTE(review): the grouping key is the very column being filled, so each group
# holds a single non-missing value and this looks like a no-op -- confirm.
political_data['state_abbrev'] = (
    political_data.groupby('state_abbrev')['state_abbrev'].ffill()
)

# Step 3: Forward-fill gaps in the composition and governor columns, state by state.
columns_to_fill = [
    'shr_dem_in_sess', 'shr_rep_in_sess',
    'dem_upphse', 'rep_upphse',
    'dem_lowhse', 'rep_lowhse',
    'gov_party',
]
filled_by_state = political_data.groupby('state_abbrev')[columns_to_fill].ffill()
political_data[columns_to_fill] = filled_by_state

# Step 4: Drop the 'year' column ('yr_rd2' stays in the table).
political_data = political_data.drop(columns=['year'])

In [73]:
# Spot-check five random rows after the forward-fill step (unseeded, so the selection varies per run)
print(political_data.sample(5))

     state_abbrev  shr_dem_in_sess  shr_rep_in_sess  dem_upphse  dem_lowhse  \
3428           SC         0.616410         0.379558    0.652174    0.580645   
1769           PA         0.142790         0.857210    0.152917    0.132664   
826            CO         0.276295         0.723705    0.307692    0.244898   
3649           GA         0.529365         0.467857    0.464286    0.594444   
309            WI         0.693333         0.306667    0.720000    0.666667   

      rep_upphse  rep_lowhse  gov_party  yr_rd2  
3428    0.347826    0.411290        2.0    1994  
1769    0.847083    0.867336        2.0    1924  
826     0.692308    0.755102        1.0    1884  
3649    0.535714    0.400000        2.0    2004  
309     0.280000    0.333333        1.0    1854  


In [74]:
# Map a state abbreviation to a division code
def create_division(state_abbrev):
    """Return the division code for a state abbreviation.

    Parameters
    ----------
    state_abbrev : object
        Two-letter state abbreviation such as 'CT'. Matching is exact and
        case-sensitive.

    Returns
    -------
    str or None
        One of 'NE', 'MA', 'ENC', 'WNC', 'SA', 'ESC', 'WSC', 'M', 'P', or
        None when the abbreviation is not in any of the listed groups.
    """
    # (division code, member abbreviations), checked in order.
    division_members = (
        ('NE', ('CT', 'ME', 'MA', 'NH', 'RI', 'VT')),                     # New England
        ('MA', ('NJ', 'NY', 'PA')),                                       # Mid-Atlantic
        ('ENC', ('IL', 'IN', 'MI', 'OH', 'WI')),                          # East North Central
        ('WNC', ('IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD')),              # West North Central
        ('SA', ('DE', 'DC', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'WV')),   # South Atlantic
        ('ESC', ('AL', 'KY', 'MS', 'TN')),                                # East South Central
        ('WSC', ('AR', 'LA', 'OK', 'TX')),                                # West South Central
        ('M', ('AZ', 'CO', 'ID', 'MT', 'NM', 'UT', 'NV', 'WY')),          # Mountain
        ('P', ('AK', 'CA', 'HI', 'OR', 'WA')),                            # Pacific
    )
    for division_code, members in division_members:
        if state_abbrev in members:
            return division_code
    # Anything else (unknown codes, missing values) has no division.
    return None

# Apply the function to every state abbreviation to create the 'division' column;
# abbreviations outside the listed groups get None
political_data['division'] = political_data['state_abbrev'].apply(create_division)

In [75]:
# Spot-check five random rows to see the new 'division' column (unseeded, so the selection varies per run)
print(political_data.sample(5))

     state_abbrev  shr_dem_in_sess  shr_rep_in_sess  dem_upphse  dem_lowhse  \
723            ME         0.182725         0.817275    0.064516    0.300933   
3470           NM         0.650000         0.350000    0.642857    0.657143   
729            NE         0.207444         0.792556    0.233796    0.181092   
212            VA         0.582206         0.417794    0.671875    0.492537   
3948           LA         0.380259         0.619741    0.333333    0.427184   

      rep_upphse  rep_lowhse  gov_party  yr_rd2 division  
723     0.935484    0.699067        2.0    1878       NE  
3470    0.357143    0.342857        2.0    1996        M  
729     0.766204    0.818908        2.0    1878      WNC  
212     0.328125    0.507463        1.0    1848       SA  
3948    0.666667    0.572816        2.0    2016      WSC  


In [76]:
def create_region_from_division(row):
    """Return the numeric region code for a row's 'division' value.

    Parameters
    ----------
    row : mapping-like
        Anything indexable with the key 'division' (for example a DataFrame
        row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int or float
        1 (Northeast), 2 (Midwest), 3 (South) or 4 (West); ``np.nan`` when the
        division is not one of the known codes.
    """
    # (region code, divisions belonging to it), checked in order.
    region_divisions = (
        (1, ('NE', 'MA')),          # Northeast
        (2, ('ENC', 'WNC')),        # Midwest
        (3, ('SA', 'ESC', 'WSC')),  # South
        (4, ('M', 'P')),            # West
    )
    division = row['division']
    return next(
        (region for region, divisions in region_divisions if division in divisions),
        np.nan,
    )

# Evaluate the function once per row (axis=1) and store the result as 'censusregion'
political_data['censusregion'] = political_data.apply(create_region_from_division, axis=1)

In [77]:
# Keep only the listed columns (the helper 'division' column is not among them)
# and rename the state key from 'state_abbrev' to 'state'.
political_data = (
    political_data
    .loc[:, [
        'yr_rd2', 'state_abbrev',
        'shr_dem_in_sess', 'shr_rep_in_sess', 'gov_party',
        'dem_upphse', 'rep_upphse', 'dem_lowhse', 'rep_lowhse',
        'censusregion',
    ]]
    .rename(columns={'state_abbrev': 'state'})
)

In [79]:
# Spot-check five random rows of the trimmed table (unseeded, so the selection varies per run)
print(political_data.sample(5))

      yr_rd2 state  shr_dem_in_sess  shr_rep_in_sess  gov_party  dem_upphse  \
3741    2008    AR         0.760714         0.239286        1.0    0.771429   
1863    1928    NY         0.450196         0.549804        1.0    0.480392   
3501    1998    IA         0.445000         0.555000        2.0    0.430000   
2459    1956    DE         0.738655         0.261345        2.0    0.705882   
1069    1894    VA         1.000000         0.000000        1.0    1.000000   

      rep_upphse  dem_lowhse  rep_lowhse  censusregion  
3741    0.228571    0.750000    0.250000           3.0  
1863    0.519608    0.420000    0.580000           1.0  
3501    0.570000    0.460000    0.540000           2.0  
2459    0.294118    0.771429    0.228571           3.0  
1069    0.000000    1.000000    0.000000           3.0  


In [None]:
# Columns that receive a numeric suffix so two copies of the table can sit side
# by side after a merge; 'yr_rd2' is handled separately and becomes 'year'.
columns_to_rename = ['state', 'shr_dem_in_sess', 'shr_rep_in_sess', 'gov_party',
                     'dem_upphse', 'rep_upphse', 'dem_lowhse', 'rep_lowhse', 'censusregion']


def _suffixed_copy(frame, suffix, columns=columns_to_rename):
    """Return a renamed copy of `frame`: `columns` get `suffix` appended and 'yr_rd2' becomes 'year'."""
    new_names = {col: col + suffix for col in columns}
    new_names['yr_rd2'] = 'year'
    # rename() returns a new DataFrame, so `frame` itself is left untouched.
    return frame.rename(columns=new_names)


# Two independently renamed copies of the political table: suffix '1' and suffix '2'.
polstate1 = _suffixed_copy(political_data, '1')
polstate2 = _suffixed_copy(political_data, '2')

In [85]:
# Load state_fips.csv from the raw-data folder under the project root
state_fips = pd.read_csv(os.path.join(parent_dir, "2_data/1_raw/state_fips.csv"))
