**Created by:** Revekka 

**When:** Nov 18

**Purpose:** Clean the federal party-composition data (party divisions of each U.S. Congress) downloaded from Wikipedia: https://en.wikipedia.org/wiki/Party_divisions_of_United_States_Congresses#cite_note-14
This is the federal composition data.

**Next file:** partisanStateFedDataMerge.ipynb

In [None]:
import pandas as pd
import os
import os.path as path
import numpy as np
import re

In [None]:
# Project root; every other path in this notebook is relative to it.
parent_dir = os.path.abspath("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
# Validate BEFORE changing directory: os.chdir raises FileNotFoundError on a
# missing directory, so an assert placed after it could never fire.
assert os.path.exists(parent_dir), "parent_dir does not exist"
os.chdir(parent_dir)

# Output location for cleaned/intermediate files (relative to parent_dir).
intermed_data_dir = "./2_data/2_intermediate/political_data"
assert os.path.exists(intermed_data_dir), "Data directory does not exist"

# Location of the raw partisan-composition downloads (relative to parent_dir).
raw_data_dir = "./2_data/1_raw/political_data/all_partisanComposition"
assert os.path.exists(raw_data_dir), "Data directory does not exist"

In the next cell, the following historical parties are mapped to Democrats and Republicans:
Democratic: Anti_Admin (Congresses 1-3), Democratic_Republicans (4-18), Jacksonian (19-24)
Republican: Pro_Admin (Congresses 1-3), Federalists (4-18), Anti_Jacksonian (19-24), Whigs (25-33), Opposition (Whigs + nascent Republicans) (34)


In [None]:
# Load the raw table and reshape it into one row per (Congress, calendar year).
df = pd.read_csv(os.path.join(raw_data_dir, "Wiki_fed_compostion.csv"))

# The real column labels sit in the first data row (the second row of the
# file): promote that row to the header, then drop it from the data.
df.columns = df.iloc[0]
df = df[1:].reset_index(drop=True)

# Rename columns by position. Build a fresh label list instead of writing into
# `df.columns.values`: an Index is meant to be immutable, and mutating its
# backing array in place bypasses that contract.
# Position 7 is intentionally left untouched (presumably an empty spacer
# column between the two chambers -- verify against the raw file).
column_names_by_position = {
    0: "Congress",
    1: "Years",
    2: "Total_Upphouse",
    3: "Dem_Upphouse",
    4: "Rep_Upphouse",
    5: "Other_Upphouse",
    6: "Vacancies_Upphouse",
    8: "Total_Lowhouse",
    9: "Dem_Lowhouse",
    10: "Rep_Lowhouse",
    11: "Other_Lowhouse",
    12: "Vacancies_Lowhouse",
    13: "President",
    14: "Trifecta",
}
new_columns = list(df.columns)
for position, column_name in column_names_by_position.items():
    new_columns[position] = column_name
df.columns = new_columns

# Drop fully empty columns/rows and any repeated header rows inside the table.
df = df.dropna(axis=1, how='all')
df = df.dropna(how='all')
df = df[df['Congress'] != 'Congress']

# Keep a lightly cleaned copy of the full table before narrowing it down.
df.to_csv(os.path.join(intermed_data_dir, "Cleaned_Wiki_fed_compostion.csv"), index=False)

df = df.drop(columns=["Other_Upphouse", "Vacancies_Upphouse", "Other_Lowhouse", "Vacancies_Lowhouse", "Trifecta"])

# Keep only the Congress number (first run of digits in the cell).
df['Congress'] = df['Congress'].str.extract(r'(\d+)', expand=False).astype(int)

# Map each Congress number n to 1789 + 2 * (n - 1) - 1, i.e. the even year
# just before the odd year 1789 + 2 * (n - 1) (Congress 1 -> 1788).
# Even years are used to keep the dataset in line with the state partisan
# composition data. The mapping is built from the Congress numbers actually
# present, so it is not capped at a hard-coded maximum.
# NOTE(review): the original comment called this the "second (middle) year of
# congress"; the formula yields the even year *preceding* the start year --
# confirm the intended alignment.
congress_years = {
    int(number): 1789 + 2 * (int(number) - 1) - 1
    for number in df['Congress'].unique()
}
df['year'] = df['Congress'].map(congress_years)
df = df.drop(columns=["Years"])

# The President cell is only filled on some rows; carry the last value forward.
df['President'] = df['President'].ffill()

# Strip bracketed footnote markers such as "[a]" or "[12]" from every cell.
# Non-null values are converted to str first, so numeric columns become
# strings here and are cast back later.
footnote_pattern = re.compile(r'\[.*?\]')


def _strip_footnotes(value):
    """Return `value` as a string without "[...]" markers; pass nulls through."""
    return footnote_pattern.sub('', str(value)) if pd.notnull(value) else value


# Column-wise Series.map is the element-wise equivalent of the deprecated
# DataFrame.applymap and works on both old and new pandas versions.
df = df.apply(lambda column: column.map(_strip_footnotes))
df['year'] = df['year'].astype(int)

# Each Congress covers two calendar years: duplicate every row with year + 1.
df_copy = df.copy()
df_copy['year'] += 1

# Stack the original and shifted rows, then order chronologically.
df = pd.concat([df, df_copy], ignore_index=True)
df = df.sort_values(by='year').reset_index(drop=True)

In [None]:
# Senate counts that contain a '/' are reduced to the digits that directly
# follow the slash; rows without a '/' are left untouched.
for senate_col in ('Dem_Upphouse', 'Rep_Upphouse'):
    has_slash = df[senate_col].str.contains('/', na=False)
    df.loc[has_slash, senate_col] = (
        df.loc[has_slash, senate_col].str.extract(r'/(\d+)', expand=False)
    )

In [None]:
# Senate counts that contain an en dash ('–') are reduced to the digits that
# directly follow the dash; rows without one are left untouched.
for senate_col in ('Dem_Upphouse', 'Rep_Upphouse'):
    has_dash = df[senate_col].str.contains('–', na=False)
    df.loc[has_dash, senate_col] = (
        df.loc[has_dash, senate_col].str.extract(r'–(\d+)', expand=False)
    )

In [None]:
# Count columns that must be integers before any arithmetic is done on them.
columns_to_convert = [
    'Congress',
    'Total_Upphouse', 'Dem_Upphouse', 'Rep_Upphouse',
    'Total_Lowhouse', 'Dem_Lowhouse', 'Rep_Lowhouse',
]

# Cast all of them in a single astype call using a column -> dtype mapping.
df = df.astype({column_name: int for column_name in columns_to_convert})

In [None]:
# Display the last 10 rows to eyeball the result of the integer conversion.
df.tail(10)

In [None]:
# Combine both chambers: for the total, Democratic and Republican counts,
# the "_Session" column is the upper-house value plus the lower-house value.
for group in ('Total', 'Dem', 'Rep'):
    df[f'{group}_Session'] = df[f'{group}_Upphouse'] + df[f'{group}_Lowhouse']

In [None]:
# Party code for each president: 1 = Democratic side, 2 = Republican side,
# None = no party. Pre-Civil-War parties are folded into these two codes as
# noted on each line.
president_party_mapping = {
    'George Washington': None,  # No party
    'John Adams': 2,         # Federalist
    'Thomas Jefferson': 1,   # Democratic-Republican
    'James Madison': 1,      # Democratic-Republican
    'James Monroe': 1,       # Democratic-Republican
    'John Quincy Adams': 1,  # Democratic-Republican
    'Andrew Jackson': 1,        # Democratic
    'Martin Van Buren': 1,      # Democratic
    'William Henry Harrison': 2, # Whig (Republican in this case)
    'John Tyler': 2,         # No formal party after leaving Whigs
    'James K. Polk': 1,         # Democratic
    'Zachary Taylor': 2,        # Whig (Republican in this case)
    'Millard Fillmore': 2,   # Whig
    'Franklin Pierce': 1,       # Democratic
    'James Buchanan': 1,        # Democratic
    'Abraham Lincoln': 2,       # Republican
    'Andrew Johnson': 1,        # Democratic (when not affiliated with Lincoln)
    'Ulysses S. Grant': 2,      # Republican
    'Rutherford B. Hayes': 2,   # Republican
    'James A. Garfield': 2,     # Republican
    'Chester A. Arthur': 2,     # Republican
    'Grover Cleveland': 1,      # Democratic
    'Benjamin Harrison': 2,     # Republican
    'William McKinley': 2,      # Republican
    'Theodore Roosevelt': 2,    # Republican
    'William Howard Taft': 2,   # Republican
    'Woodrow Wilson': 1,        # Democratic
    'Warren G. Harding': 2,     # Republican
    'Calvin Coolidge': 2,       # Republican
    'Herbert Hoover': 2,        # Republican
    'Franklin D. Roosevelt': 1, # Democratic
    'Harry S. Truman': 1,       # Democratic
    'Dwight D. Eisenhower': 2,  # Republican
    'John F. Kennedy': 1,       # Democratic
    'Lyndon B. Johnson': 1,     # Democratic
    'Richard Nixon': 2,         # Republican
    'Gerald Ford': 2,           # Republican
    'Jimmy Carter': 1,          # Democratic
    'Ronald Reagan': 2,         # Republican
    'George H. W. Bush': 2,     # Republican
    'Bill Clinton': 1,          # Democratic
    'George W. Bush': 2,        # Republican
    'Barack Obama': 1,          # Democratic
    'Donald Trump': 2,          # Republican
    'Joe Biden': 1              # Democratic
}

# Names that appear in the data but not in the mapping would silently become
# <NA>; report them so spelling/whitespace mismatches are noticed.
unmapped_presidents = sorted(
    set(df['President'].dropna()) - set(president_party_mapping)
)
if unmapped_presidents:
    print(
        "Presidents missing from president_party_mapping "
        f"(president_party set to <NA>): {unmapped_presidents}"
    )

# Map names to party codes. Unmapped names and None entries come back as NaN,
# which the nullable 'Int32' dtype stores as <NA>, so no intermediate 'NA'
# placeholder string is needed.
df['president_party'] = df['President'].map(president_party_mapping).astype('Int32')

'year', 'shr_dem_in_sess', 'shr_rep_in_sess', 'dem_lowhse', 'rep_lowhse', 'dem_upphse', 'rep_upphse', 'president_party'

In [None]:
# Party seat shares: each output column is a party count divided by the
# matching total (both chambers combined, lower house, upper house).
share_definitions = [
    ('shr_dem_in_sess', 'Dem_Session', 'Total_Session'),
    ('shr_rep_in_sess', 'Rep_Session', 'Total_Session'),
    ('dem_lowhse', 'Dem_Lowhouse', 'Total_Lowhouse'),
    ('rep_lowhse', 'Rep_Lowhouse', 'Total_Lowhouse'),
    ('dem_upphse', 'Dem_Upphouse', 'Total_Upphouse'),
    ('rep_upphse', 'Rep_Upphouse', 'Total_Upphouse'),
]
for share_col, seats_col, total_col in share_definitions:
    df[share_col] = df[seats_col].div(df[total_col])

In [None]:
# Keep only the columns written to the output file. Take an explicit copy so
# that later column assignments on `df` operate on an independent frame and do
# not raise SettingWithCopyWarning.
df = df[['year', 'shr_dem_in_sess', 'shr_rep_in_sess', 'dem_lowhse', 'rep_lowhse', 'dem_upphse', 'rep_upphse', 'president_party']].copy()

In [None]:
# Move every observation two years earlier, then keep 1833 onwards.
# NOTE(review): the reason for the two-year shift is not stated in this
# notebook -- confirm it matches the alignment expected by the merge step.
shifted_year = df['year'].astype(int) - 2
df['year'] = shifted_year
df = df[shifted_year >= 1833]

In [None]:
# Display the first rows of the final table as a sanity check before saving.
df.head()

In [None]:
# Save the final federal composition table (without the index) to the
# intermediate data directory.
output_file = os.path.join(
    intermed_data_dir,
    "federal_political_composition.csv",
)
df.to_csv(output_file, index=False)