In [6]:
import pandas as pd
import numpy as np
import os

In [7]:
parent_dir = os.path.abspath("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
assert os.path.exists(parent_dir), "parent_dir does not exist"
data_dir = os.path.join(parent_dir, "./2_data/2_intermediate/political_data")
assert os.path.exists(data_dir), "Data directory does not exist"

In [17]:
# Specify the path to the PDF and extract the table
political_data = pd.read_csv(os.path.join(data_dir, "./political_composition.csv"))

In [18]:
print(political_data.head())

  state_abbrev  yr_rd2  shr_dem_in_sess  shr_rep_in_sess  dem_upphse  \
0           AL    1834         0.527778         0.472222    0.555556   
1           CT    1834         0.216597         0.783403    0.190476   
2           DE    1834         0.333333         0.666667    0.333333   
3          Fed    1834         0.557325         0.321656    0.446429   
4           GA    1834         0.704487         0.295513    0.692308   

   dem_lowhse  rep_upphse  rep_lowhse  gov_party  
0    0.500000    0.444444    0.500000        1.0  
1    0.242718    0.809524    0.757282        NaN  
2    0.333333    0.666667    0.666667        1.0  
3    0.581395    0.517857    0.279070        1.0  
4    0.716667    0.307692    0.283333        NaN  


In [20]:
# Rename column 'yr_rd2' to 'year'
political_data.rename(columns={'yr_rd2': 'year'}, inplace=True)

# Remove years with missing 'gov_party'
political_data['min_year'] = political_data.groupby('state_abbrev')['year'].transform(lambda x: x.min() if x.notna().any() else np.nan)
political_data['min_min_year'] = political_data.groupby('state_abbrev')['min_year'].transform('min')
political_data = political_data[political_data['year'] >= political_data['min_min_year']]

In [21]:
# Fill missing values based on the last observation within each group 
political_data.sort_values(by=['state_abbrev', 'year'], inplace=True)
political_data['state_abbrev'].fillna(method='ffill', inplace=True)

for col in ['shr_dem_in_sess', 'shr_rep_in_sess', 'dem_upphse', 'rep_upphse', 'dem_lowhse', 'rep_lowhse', 'gov_party']:
    political_data[col].fillna(method='ffill', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  political_data['state_abbrev'].fillna(method='ffill', inplace=True)
  political_data['state_abbrev'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  political_data[col].fillna(method='ffill', inplace=True)
  political_data[col].fillna(method='

In [22]:
print(political_data.head())

     state_abbrev  year  shr_dem_in_sess  shr_rep_in_sess  dem_upphse  \
2550           AK  1960           0.8750           0.1125       0.900   
2600           AK  1962           0.6125           0.3875       0.675   
2650           AK  1964           0.6250           0.3750       0.750   
2699           AK  1966           0.8000           0.2000       0.850   
2748           AK  1968           0.3375           0.6625       0.300   

      dem_lowhse  rep_upphse  rep_lowhse  gov_party  min_year  min_min_year  
2550       0.850       0.100       0.125        1.0      1960          1960  
2600       0.550       0.325       0.450        1.0      1960          1960  
2650       0.500       0.250       0.500        1.0      1960          1960  
2699       0.750       0.150       0.250        1.0      1960          1960  
2748       0.375       0.700       0.625        1.0      1960          1960  


In [23]:
# Round the 'year' and filter only those rows
political_data['yr_rd2'] = political_data['year'].round(2)
political_data = political_data[political_data['year'] == political_data['yr_rd2']]
political_data.drop(columns=['year'], inplace=True)

In [24]:
# Create division and census region (similar to the Stata program)
def create_division(row):
    if row['state_abbrev'] in [9, 23, 25, 33, 44, 50]:
        return 'NE'
    elif row['state_abbrev'] in [34, 36, 42]:
        return 'MA'
    elif row['state_abbrev'] in [17, 18, 26, 39, 55]:
        return 'ENC'
    elif row['state_abbrev'] in [19, 20, 27, 29, 31, 38, 46]:
        return 'WNC'
    elif row['state_abbrev'] in [10, 11, 12, 13, 24, 37, 45, 51, 54]:
        return 'SA'
    elif row['state_abbrev'] in [1, 21, 28, 47]:
        return 'ESC'
    elif row['state_abbrev'] in [5, 22, 40, 48]:
        return 'WSC'
    elif row['state_abbrev'] in [4, 8, 16, 35, 30, 49, 32, 56]:
        return 'M'
    elif row['state_abbrev'] in [2, 6, 15, 41, 53]:
        return 'P'
    else:
        return np.nan

political_data['division'] = political_data.apply(create_division, axis=1)

In [25]:
def create_region_from_division(row):
    if row['division'] in ['NE', 'MA']:
        return 1  # Northeast
    elif row['division'] in ['ENC', 'WNC']:
        return 2  # Midwest
    elif row['division'] in ['SA', 'ESC', 'WSC']:
        return 3  # South
    elif row['division'] in ['M', 'P']:
        return 4  # West
    else:
        return np.nan

political_data['censusregion'] = political_data.apply(create_region_from_division, axis=1)

# Keep relevant columns
political_data = political_data[['yr_rd2', 'state_abbrev', 'shr_dem_in_sess', 'shr_rep_in_sess', 'gov_party',
                                 'dem_upphse', 'rep_upphse', 'dem_lowhse', 'rep_lowhse', 'censusregion']]

# Rename for merging later
political_data.rename(columns={'state_abbrev': 'state'}, inplace=True)
polstate1 = political_data.add_suffix('1')

# Save to temporary files (in Python, you can save as a CSV or a temporary DataFrame)
polstate1['year'] = polstate1['yr_rd21']  # Add the year back for merging
temp = political_data.copy()

# Rename for merging with second state
polstate2 = temp.add_suffix('2')
polstate2['year'] = polstate2['yr_rd22']

# Save `polstate2` if needed as a file

In [27]:
# print(political_data.head())
print(polstate1.head())
# print(polstate2.head())

      yr_rd21 state1  shr_dem_in_sess1  shr_rep_in_sess1  gov_party1  \
2550     1960     AK            0.8750            0.1125         1.0   
2600     1962     AK            0.6125            0.3875         1.0   
2650     1964     AK            0.6250            0.3750         1.0   
2699     1966     AK            0.8000            0.2000         1.0   
2748     1968     AK            0.3375            0.6625         1.0   

      dem_upphse1  rep_upphse1  dem_lowhse1  rep_lowhse1  censusregion1  year  
2550        0.900        0.100        0.850        0.125            NaN  1960  
2600        0.675        0.325        0.550        0.450            NaN  1962  
2650        0.750        0.250        0.500        0.500            NaN  1964  
2699        0.850        0.150        0.750        0.250            NaN  1966  
2748        0.300        0.700        0.375        0.625            NaN  1968  
