**Created by:** Revekka Gershovich **When:** Dic 4, 2024 **Why:** To clean and aggregate election returns data for years 1824 to 1968 from ICPSR 1, United States Historical Election Returns

In [204]:
import os
import os.path as path
import pandas as pd
import numpy as np
import re


In [205]:
parent_dir = os.path.abspath("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
os.chdir(parent_dir)
assert os.path.exists(parent_dir), "parent_dir does not exist"
intermed_data_dir = "./2_data/2_intermediate/political_data"
assert os.path.exists(intermed_data_dir), "Data directory does not exist"
raw_data_dir = "./2_data/1_raw/political_data"
assert os.path.exists(raw_data_dir), "Data directory does not exist"

In [206]:
df = pd.read_csv(path.join(raw_data_dir, "./ICPSR_election_returns/DS0001/00001-0001-Data.csv"))

In [207]:
# df2 = pd.read_csv(path.join(raw_data_dir, "./ICPSR_election_returns/DS0002/00001-0002-Data.csv"))

In [208]:
# df5 = pd.read_csv(path.join(raw_data_dir, "./ICPSR_election_returns/DS0005/00001-0005-Data.csv"))

In [209]:
df5['ICPR_STATE_CODE'].head()

0    2
1    2
2    2
3    2
4    2
Name: ICPR_STATE_CODE, dtype: int64

In [210]:
# Step 1: Get all column names from df
column_names = df.columns
print(len(column_names))

# Step 2: Filter columns that start with 'X' followed by three digits (X###)
pattern = re.compile(r"^X\d{3}_")
filtered_columns = [col for col in column_names if pattern.match(col)]
print(len(filtered_columns))
print(len(column_names) - len(filtered_columns))

# Step 3: Remove the 'X###_' prefix
cleaned_columns = {re.sub(r"^X\d{3}_", "", col) for col in filtered_columns}

# Step 4: Count unique column names after removing the prefix
num_unique_columns = len(cleaned_columns)

print(f"Total number of columns after 1st melt should be {num_unique_columns + (len(column_names) - len(filtered_columns))}")

499
486
13
Total number of columns after 1st melt should be 128


In [211]:
cleaned_columns

{'1_G_PRES_0020_VOTE',
 '1_G_PRES_0025_VOTE',
 '1_G_PRES_0026_VOTE',
 '1_G_PRES_0029_VOTE',
 '1_G_PRES_0037_VOTE',
 '1_G_PRES_0100_VOTE',
 '1_G_PRES_0101_VOTE',
 '1_G_PRES_0200_VOTE',
 '1_G_PRES_0300_VOTE',
 '1_G_PRES_0310_VOTE',
 '1_G_PRES_0526_VOTE',
 '1_G_PRES_0604_VOTE',
 '1_G_PRES_0611_VOTE',
 '1_G_PRES_0728_VOTE',
 '1_G_PRES_9001_VOTE',
 '1_G_PRES_9999_VOTE',
 '1_G_PRES_TOTAL_VOTE',
 '2_G_GOV_0001_VOTE',
 '2_G_GOV_0012_VOTE',
 '2_G_GOV_0025_VOTE',
 '2_G_GOV_0026_VOTE',
 '2_G_GOV_0026_VOTE.1',
 '2_G_GOV_0029_VOTE',
 '2_G_GOV_0100_VOTE',
 '2_G_GOV_0200_VOTE',
 '2_G_GOV_0300_VOTE',
 '2_G_GOV_0310_VOTE',
 '2_G_GOV_0526_VOTE',
 '2_G_GOV_0605_VOTE',
 '2_G_GOV_0659_VOTE',
 '2_G_GOV_0659_VOTE.1',
 '2_G_GOV_0659_VOTE.2',
 '2_G_GOV_0728_VOTE',
 '2_G_GOV_1063_VOTE',
 '2_G_GOV_1195_VOTE',
 '2_G_GOV_9001_VOTE',
 '2_G_GOV_9999_VOTE',
 '2_G_GOV_TOTAL_VOTE',
 '3_G_CONG_0029_VOTE',
 '3_G_CONG_0037_VOTE',
 '3_G_CONG_0100_VOTE',
 '3_G_CONG_0200_VOTE',
 '3_G_CONG_0300_VOTE',
 '3_G_CONG_0310_VOTE',
 

In [212]:
df.head()

Unnamed: 0,ICPR_STATE_CODE,COUNTY_NAME,IDENTIFICATION_NUMBER,CONG_DIST_NUMBER_1825,CONG_DIST_NUMBER_1829,CONG_DIST_NUMBER_1833,CONG_DIST_NUMBER_1835,CONG_DIST_NUMBER_1837,CONG_DIST_NUMBER_1841,CONG_DIST_NUMBER_1845,...,X860_1_G_PRES_0604_VOTE,X860_1_G_PRES_9001_VOTE,X860_1_G_PRES_TOTAL_VOTE,X860_2_G_GOV_0100_VOTE,X860_2_G_GOV_0200_VOTE,X860_2_G_GOV_0605_VOTE,X860_2_G_GOV_0728_VOTE,X860_2_G_GOV_1195_VOTE,X860_2_G_GOV_9999_VOTE,X860_2_G_GOV_TOTAL_VOTE
0,1,FAIRFIELD,10,98,98,98,98,98,4,4,...,2033,0,10454,7136,6921,0,0,0,0,14057
1,1,HARTFORD,30,98,98,98,98,98,1,1,...,3088,0,15156,8975,8753,0,0,0,0,17728
2,1,LITCHFIELD,50,98,98,98,98,98,5,4,...,1567,0,8150,4656,5203,0,0,0,0,9859
3,1,MIDDLESEX,70,98,98,98,98,98,2,2,...,1335,0,5510,3490,2942,0,0,0,0,6432
4,1,NEW HAVEN,90,98,98,98,98,98,2,2,...,4368,0,16540,9765,8709,0,0,0,0,18474


# Deciphering variable names

**1.** Since this dataset is provided in ASCII format with a SAS or SPSS setup files, I have extracted all the dataset into a csv format using a very niche R library called asciiSetupReader written specifically for extraction of pre-2000s dataset formatted in this weird way. As variable names in CSV, I used labels defined in the setup file. You can find this file in our StateLaws Dropbox: the path to the file is 1_code/similarity_code/Political_similarity_code/ICSPR_00001_to_csv.R

**2.** "Scope of Project" documentation for the study that can be found here: https://www.icpsr.umich.edu/web/ICPSR/studies/1. According to it "There is no actual codebook for this collection. Variable information is contained in the setup files." Thus, here I am making a codebook for naming conventions in my file so that if I or anyone else ever needs to go to the raw data, they would not have to spend hours figuring out what variable in the raw data mean. 

# Codebook for ICPSR 1, United States Historical Election Returns

## State and County Identifiers
| **Column Name**         | **Description**                                                                                     |
|-------------------------|-----------------------------------------------------------------------------------------------------|
| `ICPR_STATE_CODE`       | ICPSR standardized state code.                                                                      |
| `COUNTY_NAME`           | Standardized county name.                                                                           |
| `IDENTIFICATION_NUMBER` | Unique numeric identifier for each county, enabling consistent referencing.                         |

## Congressional District Numbers
| **Column Name**           | **Description**                                                                                   |
|---------------------------|---------------------------------------------------------------------------------------------------|
| `CONG_DIST_NUMBER_YYYY`   | Congressional district number for a specific year (e.g., `CONG_DIST_NUMBER_1825`). May indicate the number of districts for split counties. |

## Election Results

### General Format

X###_##_TYPE_RACE_PARTYCODE_VOTE

### Components
| **Component**     | **Description**                                                                                           |
|-------------------|---------------------------------------------------------------------------------------------------------|
| `X###`           | Election year (e.g., `X824` = 1824).                                                                      |
| `##`             | Election type/level: <br> **1** = Presidential, **2** = Gubernatorial, **3** = Congressional/House elections. |
| `TYPE`           | Type of election: <br> **G** = General, **M** = Midterm, **S** = Special.                                 |
| `RACE`           | Race type: <br> Examples: `PRES` = President, `GOV` = Governor.                                           |
| `PARTYCODE`      | Code representing the political party. See the attached party codes file for definitions (e.g., `0025` = National Republican). |
| `VOTE`           | Number of votes received by the candidate.                                                                |
| `TOTAL_VOTE`     | Total votes cast for the specific race or election.                                                       |

### Examples
| **Column Name**               | **Description**                                                                             |
|-------------------------------|---------------------------------------------------------------------------------------------|
| `X824_1_G_PRES_0025_VOTE`     | Votes for the National Republican candidate in the 1824 presidential general election.      |
| `X825_2_G_GOV_0659_VOTE`      | Votes for a specific party in the 1825 gubernatorial general election.                      |
| `X827_3_M_H_AL_9001_VOTE`     | Votes in a midterm House election in district `9001` for Alabama in 1827.                   |
| `X836_2_G_GOV_TOTAL_VOTE`     | Total gubernatorial votes in the 1836 general election.                                     |

## Handling Duplicate or Corrected Entries
| **Column Name Example**       | **Description**                                                                             |
|-------------------------------|---------------------------------------------------------------------------------------------|
| `X825_2_G_GOV_0659_VOTE.1`    | A vote for a second candidate from '0659' party in 1825 gubernatorial election.|
| `X831_3_M_H_AL_0100_VOTE.2`   | A duplicate or re-evaluated entry for midterm House votes in district `0100` for Alabama in 1831. |

## Important Notes
- **Corrections:** Some entries, such as Jackson County in Georgia (`ID: 1510`), should be corrected to `1570` when analyzing by county.
- **Missing Values:** For counties not reporting data or not participating in elections, identifiers like `98` (placeholders) are used.
- **Party Codes:** Refer to the party codes section of the documentation contained in /Users/revekkagershovich/Dropbox (MIT)/StateLaws/2_data/1_raw/political_data/ICPSR_election_returns/DS0204/00001-0204-Documentation.txt for the specific meaning of codes like `0025`, `0659`, etc. which represent political parties.

In [213]:
df.head(20)

Unnamed: 0,ICPR_STATE_CODE,COUNTY_NAME,IDENTIFICATION_NUMBER,CONG_DIST_NUMBER_1825,CONG_DIST_NUMBER_1829,CONG_DIST_NUMBER_1833,CONG_DIST_NUMBER_1835,CONG_DIST_NUMBER_1837,CONG_DIST_NUMBER_1841,CONG_DIST_NUMBER_1845,...,X860_1_G_PRES_0604_VOTE,X860_1_G_PRES_9001_VOTE,X860_1_G_PRES_TOTAL_VOTE,X860_2_G_GOV_0100_VOTE,X860_2_G_GOV_0200_VOTE,X860_2_G_GOV_0605_VOTE,X860_2_G_GOV_0728_VOTE,X860_2_G_GOV_1195_VOTE,X860_2_G_GOV_9999_VOTE,X860_2_G_GOV_TOTAL_VOTE
0,1,FAIRFIELD,10,98,98,98,98,98,4,4,...,2033,0,10454,7136,6921,0,0,0,0,14057
1,1,HARTFORD,30,98,98,98,98,98,1,1,...,3088,0,15156,8975,8753,0,0,0,0,17728
2,1,LITCHFIELD,50,98,98,98,98,98,5,4,...,1567,0,8150,4656,5203,0,0,0,0,9859
3,1,MIDDLESEX,70,98,98,98,98,98,2,2,...,1335,0,5510,3490,2942,0,0,0,0,6432
4,1,NEW HAVEN,90,98,98,98,98,98,2,2,...,4368,0,16540,9765,8709,0,0,0,0,18474
5,1,NEW LONDON,110,98,98,98,98,98,3,3,...,1199,0,9481,5102,5672,0,0,0,0,10774
6,1,TOLLAND,130,98,98,98,98,98,6,1,...,479,0,4130,2210,2558,0,0,0,0,4768
7,1,WINDHAM,150,98,98,98,98,98,6,3,...,303,0,5398,2586,3700,0,0,0,0,6286


# Finding out what .1 .2 and .3 mean

# 1st Melt To Make Year a Separate Variable

In [214]:
df.drop(columns=[col for col in df.columns if col.startswith('CONG')], inplace=True)

In [215]:
# Step 1: Identify columns and group them by their suffix
# Add all variables starting with "CONG" to id_vars
id_vars = ['ICPR_STATE_CODE', 'COUNTY_NAME', 'IDENTIFICATION_NUMBER']
grouped_columns = {}

# Group columns by their suffix (everything after the first underscore and without the year part)
for col in df.columns:
    if col.startswith('X'):
        suffix = '_'.join(col.split('_')[1:])  # Extract the suffix after the first underscore
        if suffix not in grouped_columns:
            grouped_columns[suffix] = []
        grouped_columns[suffix].append(col)

In [216]:
print(grouped_columns.keys())

dict_keys(['1_G_PRES_0020_VOTE', '1_G_PRES_0611_VOTE', '1_G_PRES_9999_VOTE', '1_G_PRES_TOTAL_VOTE', '2_G_GOV_0012_VOTE', '2_G_GOV_0200_VOTE', '2_G_GOV_1063_VOTE', '2_G_GOV_TOTAL_VOTE', '2_G_GOV_0001_VOTE', '2_G_GOV_0659_VOTE', '2_G_GOV_0659_VOTE.1', '2_G_GOV_0659_VOTE.2', '3_M_H_AL_9001_VOTE', '3_M_H_AL_9002_VOTE', '3_M_H_AL_9003_VOTE', '3_M_H_AL_9004_VOTE', '3_M_H_AL_9005_VOTE', '3_M_H_AL_9006_VOTE', '3_M_H_AL_9007_VOTE', '3_M_H_AL_9008_VOTE', '3_M_H_AL_9009_VOTE', '3_M_H_AL_9010_VOTE', '3_M_H_AL_9011_VOTE', '3_M_H_AL_9012_VOTE', '3_M_H_AL_9013_VOTE', '3_M_H_AL_9014_VOTE', '3_M_H_AL_9015_VOTE', '3_M_H_AL_9016_VOTE', '3_M_H_AL_9017_VOTE', '3_M_H_AL_9018_VOTE', '3_M_H_AL_9019_VOTE', '3_M_H_AL_2020_VOTE', '3_M_H_AL_TOTAL_VOTE', '1_G_PRES_0025_VOTE', '1_G_PRES_0101_VOTE', '2_G_GOV_0025_VOTE', '3_M_H_AL_9020_VOTE', '3_M_H_AL_9021_VOTE', '3_M_H_AL_9022_VOTE', '2_G_GOV_0026_VOTE', '2_G_GOV_0026_VOTE.1', '2_G_GOV_0100_VOTE', '2_G_GOV_9001_VOTE', '3_M_H_AL_0025_VOTE', '3_M_H_AL_0025_VOTE.1', '

In [217]:
# Step 2: Reshape each group and combine into a single table
reshaped_dataframes = []

for suffix, cols in grouped_columns.items():
    # Reshape the group into long format
    temp_df = pd.melt(df, id_vars=id_vars, value_vars=cols,
                      var_name='year', value_name=suffix)
    # Extract the year and adjust to full year format
    temp_df['year'] = temp_df['year'].str.extract(r'X(\d+)').astype(int) + 1000
    reshaped_dataframes.append(temp_df)

In [218]:
# Step 3: Merge all reshaped groups into a single DataFrame
final_df = reshaped_dataframes[0]
for additional_df in reshaped_dataframes[1:]:
    final_df = final_df.merge(additional_df, on=id_vars + ['year'], how='outer')

final_df = final_df[['year'] + [col for col in final_df.columns if col != 'year']]

In [219]:
final_df.columns

Index(['year', 'ICPR_STATE_CODE', 'COUNTY_NAME', 'IDENTIFICATION_NUMBER',
       '1_G_PRES_0020_VOTE', '1_G_PRES_0611_VOTE', '1_G_PRES_9999_VOTE',
       '1_G_PRES_TOTAL_VOTE', '2_G_GOV_0012_VOTE', '2_G_GOV_0200_VOTE',
       ...
       '3_G_CONG_0200_VOTE', '3_G_CONG_0310_VOTE', '1_G_PRES_0200_VOTE',
       '1_G_PRES_0310_VOTE', '3_G_CONG_0037_VOTE', '3_G_CONG_0604_VOTE',
       '1_G_PRES_0037_VOTE', '1_G_PRES_0604_VOTE', '2_G_GOV_0605_VOTE',
       '2_G_GOV_1195_VOTE'],
      dtype='object', length=119)

In [220]:
temp = final_df[final_df['2_G_GOV_0026_VOTE.1'].notna()]

print(temp['year'].unique())

[1831]


In [221]:
temp = final_df[final_df['2_G_GOV_0659_VOTE.1'].notna()]

print(temp['year'].unique())

[1825 1826 1827]


In [222]:
# Rename TOTAL_VOTE columns to match the party code format ('0000' instead of a party code)
renamed_columns = {col: col.replace("TOTAL_VOTE", "0000_VOTE") for col in final_df.columns if "TOTAL_VOTE" in col}
final_df = final_df.rename(columns=renamed_columns)

# Multiple Candidates From A Single Party
Dealing with situation where multiple candidate exists in single elections for a single party, i.e. there are 3 Opposition Republicans running for 1825 CT elections. 

1. I compared the election of 1825 in CT to Wikipedia data and discovered that one of the parties had 3 candidates, and in the data this party had .1 and .2 prefixes. 



In [223]:
# Exploring why some variables have version with suffixes .1, .2, etc. in the dataset
# Step 1: Filter the dataset for the year 1825
elec_1825 = final_df[final_df['year'] == 1825]

# Step 2: Select columns that include 'COUNTY_NAME' and those starting with '2_G_GOV'
selected_columns = ['COUNTY_NAME'] + [col for col in final_df.columns if col.startswith("2_G_GOV")]

# Step 3: Keep only the selected columns
elec_1825 = elec_1825[selected_columns]

# Step 4: Drop columns that contain only NA values
elec_1825 = elec_1825.dropna(axis=1, how='all')

# Display the resulting dataframe
print(elec_1825)

#0012  OLD REPUBLICAN 
#0001 FEDERALIST
#0659 Opposition Republicans

total_votes = elec_1825["2_G_GOV_0000_VOTE"].sum()
print(f"Total votes for 2_G_GOV_0000_VOTE: {total_votes}")

    COUNTY_NAME  2_G_GOV_0012_VOTE  2_G_GOV_0000_VOTE  2_G_GOV_0001_VOTE  \
1     FAIRFIELD              982.0               1502              257.0   
38     HARTFORD             1113.0               1807              104.0   
75   LITCHFIELD             1172.0               1610              246.0   
112   MIDDLESEX              533.0                761              197.0   
149   NEW HAVEN             1136.0               1552              354.0   
186  NEW LONDON              993.0               1290               52.0   
223     TOLLAND              525.0                716               36.0   
260     WINDHAM              693.0                957               96.0   

     2_G_GOV_0659_VOTE  2_G_GOV_0659_VOTE.1  2_G_GOV_0659_VOTE.2  
1                230.0                 33.0                  0.0  
38               176.0                 69.0                345.0  
75                52.0                 14.0                126.0  
112               12.0                  4.0    

In [224]:
# Step 1: Identify columns with suffixes .1, .2, .3, etc.
suffix_pattern = re.compile(r"(.*)\.(\d+)$")  # Matches columns ending in .1, .2, etc.
grouped_columns = {}

for col in final_df.columns:
    match = suffix_pattern.match(col)
    if match:
        base_name = match.group(1)  # Extract base column name (without suffix)
        if base_name not in grouped_columns:
            grouped_columns[base_name] = []
        grouped_columns[base_name].append(col)

# Step 2: Identify related columns and rename base columns with .0 postfix
for base_name in grouped_columns.keys():
    if base_name in final_df.columns:  # If the original base column exists
        final_df.rename(columns={base_name: base_name + ".0"}, inplace=True)
        grouped_columns[base_name].append(base_name + ".0")  # Include renamed base column

# Step 3: Create new summed columns
for base_name, related_columns in grouped_columns.items():
    final_df[base_name] = final_df[related_columns].sum(axis=1)

# Step 4: Drop all columns with suffixes .0, .1, .2, etc.
columns_to_drop = [col for col in final_df.columns if re.search(r"\.\d+$", col)]
final_df.drop(columns=columns_to_drop, inplace=True)

In [225]:
list(final_df.columns)

['year',
 'ICPR_STATE_CODE',
 'COUNTY_NAME',
 'IDENTIFICATION_NUMBER',
 '1_G_PRES_0020_VOTE',
 '1_G_PRES_0611_VOTE',
 '1_G_PRES_9999_VOTE',
 '1_G_PRES_0000_VOTE',
 '2_G_GOV_0012_VOTE',
 '2_G_GOV_0200_VOTE',
 '2_G_GOV_1063_VOTE',
 '2_G_GOV_0000_VOTE',
 '2_G_GOV_0001_VOTE',
 '3_M_H_AL_9001_VOTE',
 '3_M_H_AL_9002_VOTE',
 '3_M_H_AL_9003_VOTE',
 '3_M_H_AL_9004_VOTE',
 '3_M_H_AL_9005_VOTE',
 '3_M_H_AL_9006_VOTE',
 '3_M_H_AL_9007_VOTE',
 '3_M_H_AL_9008_VOTE',
 '3_M_H_AL_9009_VOTE',
 '3_M_H_AL_9010_VOTE',
 '3_M_H_AL_9011_VOTE',
 '3_M_H_AL_9012_VOTE',
 '3_M_H_AL_9013_VOTE',
 '3_M_H_AL_9014_VOTE',
 '3_M_H_AL_9015_VOTE',
 '3_M_H_AL_9016_VOTE',
 '3_M_H_AL_9017_VOTE',
 '3_M_H_AL_9018_VOTE',
 '3_M_H_AL_9019_VOTE',
 '3_M_H_AL_2020_VOTE',
 '3_M_H_AL_0000_VOTE',
 '1_G_PRES_0025_VOTE',
 '1_G_PRES_0101_VOTE',
 '2_G_GOV_0025_VOTE',
 '3_M_H_AL_9020_VOTE',
 '3_M_H_AL_9021_VOTE',
 '3_M_H_AL_9022_VOTE',
 '2_G_GOV_0100_VOTE',
 '2_G_GOV_9001_VOTE',
 '1_G_PRES_0026_VOTE',
 '1_G_PRES_0100_VOTE',
 '2_G_GOV_9999_VO

# Melt 2
## Removing party from the name into a separate variable

In [226]:
# Step 1: Identify columns and group them by their base name (e.g., `2_G_GOV_VOTE`)
id_vars = ['ICPR_STATE_CODE', 'COUNTY_NAME', 'IDENTIFICATION_NUMBER', 'year'] + [col for col in df.columns if col.startswith('CONG')]
grouped_columns = {}

# Group columns by removing the numeric segment before "_VOTE" (the second-to-last segment)
for col in final_df.columns:
    if '_VOTE' in col:  # Ensure we're only processing relevant columns
        parts = col.split('_')
        base_name = '_'.join(parts[:-2] + ['VOTE']) if parts[-2].isdigit() else col  # Remove numeric part before "VOTE"
        
        if base_name not in grouped_columns:
            grouped_columns[base_name] = []
        grouped_columns[base_name].append(col)

In [227]:
print(grouped_columns.keys())

dict_keys(['1_G_PRES_VOTE', '2_G_GOV_VOTE', '3_M_H_AL_VOTE', '3_W_H_AL_VOTE', '3_S_H_AL_VOTE', '3_G_CONG_VOTE'])


In [228]:
# Step 2: Reshape each group and combine into a single table
reshaped_dataframes = []

for base_name, cols in grouped_columns.items():
    # Reshape the group into long format
    temp_df = pd.melt(final_df, id_vars=id_vars, value_vars=cols,
                      var_name='party', value_name=base_name)


    # Extract the 4-digit party code from column names
    extracted_party = temp_df['party'].str.extract(r'_(\d{4})_')
    temp_df['party'] = extracted_party[0]  # Get first column from extracted DataFrame

    reshaped_dataframes.append(temp_df)

In [229]:
# Step 3: Merge all reshaped groups into a single DataFrame
final_df_long = reshaped_dataframes[0]
for additional_df in reshaped_dataframes[1:]:
    final_df_long = final_df_long.merge(additional_df, on=id_vars + ['party'], how='outer')

final_df_long = final_df_long[['party'] + [col for col in final_df_long.columns if col != 'party']]

Gusi

In [230]:
final_df_long['party'].unique()

array(['0000', '0001', '0012', '0020', '0025', '0026', '0029', '0037',
       '0100', '0101', '0103', '0200', '0300', '0310', '0526', '0604',
       '0605', '0611', '0659', '0728', '1063', '1195', '2020', '9001',
       '9002', '9003', '9004', '9005', '9006', '9007', '9008', '9009',
       '9010', '9011', '9012', '9013', '9014', '9015', '9016', '9017',
       '9018', '9019', '9020', '9021', '9022', '9999'], dtype=object)

In [231]:
final_df_long['year'].unique()

array([1824, 1825, 1826, 1827, 1828, 1829, 1830, 1831, 1832, 1833, 1834,
       1835, 1836, 1837, 1838, 1839, 1840, 1841, 1842, 1843, 1844, 1845,
       1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856,
       1857, 1858, 1859, 1860])

In [232]:
final_df_long.pivot_table(index='COUNTY_NAME', columns='year', aggfunc='size', fill_value=0)

year,1824,1825,1826,1827,1828,1829,1830,1831,1832,1833,...,1851,1852,1853,1854,1855,1856,1857,1858,1859,1860
COUNTY_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FAIRFIELD,46,46,46,46,46,46,46,46,46,46,...,46,46,46,46,46,46,46,46,46,46
HARTFORD,46,46,46,46,46,46,46,46,46,46,...,46,46,46,46,46,46,46,46,46,46
LITCHFIELD,46,46,46,46,46,46,46,46,46,46,...,46,46,46,46,46,46,46,46,46,46
MIDDLESEX,46,46,46,46,46,46,46,46,46,46,...,46,46,46,46,46,46,46,46,46,46
NEW HAVEN,46,46,46,46,46,46,46,46,46,46,...,46,46,46,46,46,46,46,46,46,46
NEW LONDON,46,46,46,46,46,46,46,46,46,46,...,46,46,46,46,46,46,46,46,46,46
TOLLAND,46,46,46,46,46,46,46,46,46,46,...,46,46,46,46,46,46,46,46,46,46
WINDHAM,46,46,46,46,46,46,46,46,46,46,...,46,46,46,46,46,46,46,46,46,46


In [233]:
final_df_long.columns

Index(['party', 'ICPR_STATE_CODE', 'COUNTY_NAME', 'IDENTIFICATION_NUMBER',
       'year', '1_G_PRES_VOTE', '2_G_GOV_VOTE', '3_M_H_AL_VOTE',
       '3_W_H_AL_VOTE', '3_S_H_AL_VOTE', '3_G_CONG_VOTE'],
      dtype='object')

In [None]:
final_df_long.rename(columns={'ICPR_STATE_CODE': 'ICPSR_state_code', 'COUNTY_NAME': 'county_name', 
                        'IDENTIFICATION_NUMBER': 'county_id', '1_G_PRES_VOTE':'president_vote', '2_G_GOV_VOTE':'governor_vote', '3_M_H_AL_VOTE': 'midterm_house_vote', '3_W_H_AL_VOTE':'w_house_vote', '3_S_H_AL_VOTE': 'special_house_vote', '3_G_CONG_VOTE': 'general_congress_vote'}, inplace=True)

final_df_long['county_name'] = cong_df['county_name'].str.title()

In [None]:
# Create ICPSR to FIPS and ICPSR to State Name mappings
icpsr_to_fips = {
    1: 9,  2: 23, 3: 25, 4: 33, 5: 44, 6: 50, 11: 10, 12: 34, 13: 36, 14: 42, 21: 17,
    22: 18, 23: 26, 24: 39, 31: 19, 32: 20, 33: 27, 34: 29, 35: 31, 36: 38, 37: 46,
    40: 51, 41: 1, 42: 5, 43: 12, 44: 13, 45: 22, 46: 28, 47: 37, 48: 45, 49: 48,
    51: 21, 52: 24, 53: 40, 54: 47, 56: 54, 49: 48, 72: 41, 73: 53, 97: 97, 98: 11
}

# Add 'state_fips' column to cong_df based on 'ICPSR_state_code'
cong_df['state_fips'] = cong_df['ICPSR_state_code'].map(icpsr_to_fips)

In [None]:
cong_df.head()

Unnamed: 0,ICPSR_state_code,county_name,county_id,year,general_presidential_vote,general_gubernatorial_vote,midterm_house_vote,w_house_vote,special_house_vote,general_congress_vote,district_number,cong_year,state_fips
0,1,Fairfield,10,1824,1299.0,1143.0,,,,,98,1825,9
1,1,Fairfield,10,1824,,,,,,,98,1825,9
2,1,Fairfield,10,1824,,938.0,,,,,98,1825,9
3,1,Fairfield,10,1824,4.0,,,,,,98,1825,9
4,1,Fairfield,10,1824,,,0.0,,,,98,1825,9


## Next Steps

**1.** I should check that all the columns were melted correctly: it seems like some columns such as columns with different versions and such were dropped as well as AL columns. 

**2.** Start loading data in a loop for all states, and potentially concatinate it into one