Created by: Revekka 
When: Nov 18
Purpose: To clean data downloaded from here: https://en.wikipedia.org/wiki/Party_divisions_of_United_States_Congresses#cite_note-14
This is Federal Composition data. 

In [2]:
import pandas as pd
import os
import os.path as path
import numpy as np
import re

In [3]:
# parent_dir = os.path.abspath("/Users/rivka666/Dropbox (MIT)/StateLaws")
parent_dir = os.path.abspath("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
os.chdir(parent_dir)
assert os.path.exists(parent_dir), "parent_dir does not exist"
intermediate_political_data_dir = ("./2_data/2_intermediate/political_data")
raw_political_data_dir = ("./2_data/1_raw/political_data")
assert os.path.exists(intermediate_political_data_dir), "intermediate_political_data_dir does not exist"


I am mapping in the next cell the following parties to republicans and democrats: 
Democratic: Anti_Admin (1-3 congress), Democratic_Republicans (4-18), Jacksonian (19-24)
Republican: Pro_Admin (1-3 congress), Federalists (4-18), Anti_Jacksonian (19-24), Whigs (25-33), Opposition (Whigs + nascent Republicans) (34)


In [4]:
df = pd.read_csv(os.path.join(raw_political_data_dir , "Wiki_fed_compostion.csv"))
# Set the second row as the header
df.columns = df.iloc[0]  # Set the second row as the header
df = df[1:]  # Drop the row that is now the header

# Reset the index if needed
df.reset_index(drop=True, inplace=True)

df.columns.values[0] = "Congress"
df.columns.values[1] = "Years"
df.columns.values[2] = "Total_Upphouse"
df.columns.values[3] = "Dem_Upphouse"
df.columns.values[4] = "Rep_Upphouse"
df.columns.values[5] = "Other_Upphouse"
df.columns.values[6] = "Vacancies_Upphouse"
df.columns.values[8] = "Total_Lowhouse"
df.columns.values[9] = "Dem_Lowhouse"
df.columns.values[10] = "Rep_Lowhouse"
df.columns.values[11] = "Other_Lowhouse"
df.columns.values[12] = "Vacancies_Lowhouse"
df.columns.values[13] = "President"
df.columns.values[14] = "Trifecta"

df = df.dropna(axis=1, how='all')
df = df.dropna(how='all')
df = df[df['Congress'] != 'Congress']

df.to_csv(os.path.join(intermediate_political_data_dir, "Cleaned_Wiki_fed_compostion.csv"), index=False)

In [5]:
df = df.drop(["Other_Upphouse", "Vacancies_Upphouse", "Other_Lowhouse", "Vacancies_Lowhouse", "Trifecta"], axis=1)
df['Congress'] = df['Congress'].str.extract('(\d+)').astype(int)
# Create a dictionary that maps congress number to the years it spans
congress_years = {i: 1789 + 2 * (i - 1) + 1 for i in range(1, 120)} 
# I want the years to be even (i.e. second (middle) year of congress) to keep the dataset in line with state partisan composition data
df['yr_rd2'] = df['Congress'].map(congress_years)
df['yr_rd2'] = df['yr_rd2'].astype(int) 
df = df.drop(["Years"], axis=1)
df['President'] = df['President'].fillna(method='ffill')
df = df.applymap(lambda x: re.sub(r'\[.*?\]', '', str(x)) if pd.notnull(x) else x)

  df['President'] = df['President'].fillna(method='ffill')
  df = df.applymap(lambda x: re.sub(r'\[.*?\]', '', str(x)) if pd.notnull(x) else x)


In [6]:
# Handle only rows with '/' in 'Dem_Upphouse'
df.loc[df['Dem_Upphouse'].str.contains('/', na=False), 'Dem_Upphouse'] = \
    df.loc[df['Dem_Upphouse'].str.contains('/', na=False), 'Dem_Upphouse'].str.extract(r'/(\d+)', expand=False)

# Handle only rows with '/' in 'Rep_Upphouse'
df.loc[df['Rep_Upphouse'].str.contains('/', na=False), 'Rep_Upphouse'] = \
    df.loc[df['Rep_Upphouse'].str.contains('/', na=False), 'Rep_Upphouse'].str.extract(r'/(\d+)', expand=False)

In [7]:
# Handle only rows with '–' in 'Dem_Upphouse'
df.loc[df['Dem_Upphouse'].str.contains('–', na=False), 'Dem_Upphouse'] = \
    df.loc[df['Dem_Upphouse'].str.contains('–', na=False), 'Dem_Upphouse'].str.extract(r'–(\d+)', expand=False)

# Handle only rows with '–' in 'Rep_Upphouse'
df.loc[df['Rep_Upphouse'].str.contains('–', na=False), 'Rep_Upphouse'] = \
    df.loc[df['Rep_Upphouse'].str.contains('–', na=False), 'Rep_Upphouse'].str.extract(r'–(\d+)', expand=False)

In [8]:
# List of columns to convert to int type
columns_to_convert = ['Congress', 'Total_Upphouse', 'Dem_Upphouse', 'Rep_Upphouse',
                      'Total_Lowhouse', 'Dem_Lowhouse', 'Rep_Lowhouse']

# Convert columns to int type
df[columns_to_convert] = df[columns_to_convert].astype(int)

In [9]:
df.tail(10)

Unnamed: 0,Congress,Total_Upphouse,Dem_Upphouse,Rep_Upphouse,Total_Lowhouse,Dem_Lowhouse,Rep_Lowhouse,President,yr_rd2
114,110,100,49,49,435,233,202,George W. Bush,2008
115,111,100,58,42,435,257,178,Barack Obama,2010
116,112,100,51,47,435,193,242,Barack Obama,2012
117,113,100,53,45,435,201,234,Barack Obama,2014
118,114,100,44,54,435,188,247,Barack Obama,2016
119,115,100,47,52,435,194,241,Donald Trump,2018
120,116,100,46,52,435,235,200,Donald Trump,2020
121,117,100,48,50,435,222,213,Joe Biden,2022
122,118,100,48,49,435,213,221,Joe Biden,2024
123,119,100,45,53,435,212,218,Donald Trump,2026


In [10]:
df['Total_Session'] = df['Total_Upphouse'] + df['Total_Lowhouse']
df['Dem_Session'] = df['Dem_Upphouse'] + df['Dem_Lowhouse']
df['Rep_Session'] = df['Rep_Upphouse'] + df['Rep_Lowhouse']

In [11]:
president_party_mapping = {
    'George Washington': None,  # No party
    'John Adams': 2,         # Federalist
    'Thomas Jefferson': 1,   # Democratic-Republican
    'James Madison': 1,      # Democratic-Republican
    'James Monroe': 1,       # Democratic-Republican
    'John Quincy Adams': 1,  # Democratic-Republican
    'Andrew Jackson': 1,        # Democratic
    'Martin Van Buren': 1,      # Democratic
    'William Henry Harrison': 2, # Whig (Republican in this case)
    'John Tyler': 2,         # No formal party after leaving Whigs
    'James K. Polk': 1,         # Democratic
    'Zachary Taylor': 2,        # Whig (Republican in this case)
    'Millard Fillmore': 2,   # Whig
    'Franklin Pierce': 1,       # Democratic
    'James Buchanan': 1,        # Democratic
    'Abraham Lincoln': 2,       # Republican
    'Andrew Johnson': 1,        # Democratic (when not affiliated with Lincoln)
    'Ulysses S. Grant': 2,      # Republican
    'Rutherford B. Hayes': 2,   # Republican
    'James A. Garfield': 2,     # Republican
    'Chester A. Arthur': 2,     # Republican
    'Grover Cleveland': 1,      # Democratic
    'Benjamin Harrison': 2,     # Republican
    'William McKinley': 2,      # Republican
    'Theodore Roosevelt': 2,    # Republican
    'William Howard Taft': 2,   # Republican
    'Woodrow Wilson': 1,        # Democratic
    'Warren G. Harding': 2,     # Republican
    'Calvin Coolidge': 2,       # Republican
    'Herbert Hoover': 2,        # Republican
    'Franklin D. Roosevelt': 1, # Democratic
    'Harry S. Truman': 1,       # Democratic
    'Dwight D. Eisenhower': 2,  # Republican
    'John F. Kennedy': 1,       # Democratic
    'Lyndon B. Johnson': 1,     # Democratic
    'Richard Nixon': 2,         # Republican
    'Gerald Ford': 2,           # Republican
    'Jimmy Carter': 1,          # Democratic
    'Ronald Reagan': 2,         # Republican
    'George H. W. Bush': 2,     # Republican
    'Bill Clinton': 1,          # Democratic
    'George W. Bush': 2,        # Republican
    'Barack Obama': 1,          # Democratic
    'Donald Trump': 2,          # Republican
    'Joe Biden': 1              # Democratic
}


# Create a new column in your DataFrame using the mapping
df['president_party'] = df['President'].map(president_party_mapping).fillna('NA')

# Convert 'NA' to pd.NA and change the type to integer
df['president_party'] = df['president_party'].replace('NA', pd.NA).astype('Int32')

'yr_rd2', 'shr_dem_in_sess', 'shr_rep_in_sess', 'dem_lowhse', 'rep_lowhse', 'dem_upphse', 'rep_upphse', 'president_party'

In [12]:
df['shr_dem_in_sess'] = df['Dem_Session'] / df['Total_Session']
df['shr_rep_in_sess'] = df['Rep_Session'] / df['Total_Session']
df['dem_lowhse'] = df['Dem_Lowhouse'] / df['Total_Lowhouse']
df['rep_lowhse'] = df['Rep_Lowhouse'] / df['Total_Lowhouse']
df['dem_upphse'] = df['Dem_Upphouse'] / df['Total_Upphouse']
df['rep_upphse'] = df['Rep_Upphouse'] / df['Total_Upphouse']

In [13]:
df = df[['yr_rd2', 'shr_dem_in_sess', 'shr_rep_in_sess', 'dem_lowhse', 'rep_lowhse', 'dem_upphse', 'rep_upphse', 'president_party']]

In [14]:
df['yr_rd2'] = df['yr_rd2'].astype(int)
df['yr_rd2'] = df['yr_rd2']-2
df = df[df['yr_rd2'] >= 1833]

In [15]:
df.head()

Unnamed: 0,yr_rd2,shr_dem_in_sess,shr_rep_in_sess,dem_lowhse,rep_lowhse,dem_upphse,rep_upphse,president_party
25,1834,0.57483,0.336735,0.590909,0.309917,0.5,0.461538,1
27,1836,0.554422,0.397959,0.528926,0.413223,0.673077,0.326923,1
28,1838,0.527211,0.445578,0.516529,0.450413,0.576923,0.423077,1
29,1840,0.408163,0.581633,0.404959,0.586777,0.423077,0.557692,2
30,1842,0.618182,0.367273,0.659193,0.32287,0.442308,0.557692,2


In [16]:
output_file = path.join(intermediate_political_data_dir, "federal_political_composition.csv")
df.to_csv(output_file, index=False)