In [91]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the datasets
house_data = pd.read_csv('data/1976-2022-house.csv')
finance_data = pd.read_csv('data/weball22.txt', delimiter='|', header=None)

# Rename finance data columns
finance_columns = ['CAND_ID', 'CAND_NAME', 'CAND_ICI', 'PTY_CD', 'CAND_PTY_AFFILIATION', 'TTL_RECEIPTS', 'TRANS_FROM_AUTH', 'TTL_DISB',
                   'TRANS_TO_AUTH', 'COH_BOP', 'COH_COP', 'CAND_CONTRIB', 'CAND_LOANS', 'OTHER_LOANS', 'CAND_LOAN_REPAY',
                   'OTHER_LOAN_REPAY', 'DEBTS_OWED_BY', 'TTL_INDIV_CONTRIB', 'CAND_OFFICE_ST', 'CAND_OFFICE_DISTRICT', 'SPEC_ELECTION',
                   'PRIM_ELECTION', 'RUN_ELECTION', 'GEN_ELECTION', 'GEN_ELECTION_PRECENT', 'OTHER_POL_CMTE_CONTRIB', 'POL_PTY_CONTRIB',
                   'CVG_END_DT', 'INDIV_REFUNDS', 'CMTE_REFUNDS']
useful_columns = ['CAND_ID', 'CAND_NAME', 'CAND_ICI', 'CAND_PTY_AFFILIATION', 'TTL_DISB', 'CAND_OFFICE_ST', 'CAND_OFFICE_DISTRICT']

finance_data.columns = finance_columns

# Filter finance data for House candidates in 2022
finance_data_2022 = finance_data[(finance_data['CAND_OFFICE_ST'].notna()) & (finance_data['CVG_END_DT'].str[-4:] == '2022')]

def handle_numeric_conversion(df, column, file_name):
    # Convert column to numeric, keeping NaNs
    df.loc[:, column] = pd.to_numeric(df[column], errors='coerce')

    # Save rows with NaN values to CSV for error inspection
    error_rows = df[df[column].isna()]
    error_file = f'error/nan_{column}_rows_{file_name}.csv'
    error_rows.to_csv(error_file, index=False)

    # Log the number of rows removed
    num_removed = len(error_rows)
    print(f"Number of rows removed due to NaN values in {column}: {num_removed}")

    # Drop rows with NaN values
    df = df.dropna(subset=[column])

    # Convert to float instead of int to handle potential decimal values
    # Suppress the SettingWithCopyWarning using a pandas option
    with pd.option_context('mode.chained_assignment', None):
        df[column] = df[column].astype(int)

    return df

# Apply the function to CAND_OFFICE_DISTRICT
finance_data_2022_clean = handle_numeric_conversion(finance_data_2022, 'CAND_OFFICE_DISTRICT', 'weball22')[useful_columns]

# Convert CAND_NAME format
def convert_name(name):
    parts = name.split(',')
    if len(parts) == 2:
        return f"{parts[1].strip()} {parts[0].strip()}"
    return name  # Return original name if it doesn't match the expected format

finance_data_2022_clean['CAND_NAME'] = finance_data_2022_clean['CAND_NAME'].apply(convert_name)

Number of rows removed due to NaN values in CAND_OFFICE_DISTRICT: 5


In [92]:
finance_data_2022_clean

Unnamed: 0,CAND_ID,CAND_NAME,CAND_ICI,CAND_PTY_AFFILIATION,TTL_DISB,CAND_OFFICE_ST,CAND_OFFICE_DISTRICT
0,H2AK00200,CHRISTOPHER CONSTANT,C,DEM,164637.90,AK,0
1,H2AK01158,MARY PELTOLA,I,DEM,7060033.09,AK,0
2,H2AK01240,ADAM L WOOL,O,DEM,16217.07,AK,0
3,H2AK00218,JOSHUA CARL REVAK,O,REP,121841.00,AK,0
4,H2AK00226,SARAH PALIN,O,REP,1924781.35,AK,0
...,...,...,...,...,...,...,...
4150,S8WV00127,EVAN H JENKINS,C,REP,0.00,WV,0
4151,S0WY00129,YANA LUDWIG,O,DEM,596.96,WY,0
4153,S0WY00137,CYNTHIA MARIE MRS. LUMMIS,I,REP,417700.81,WY,0
4154,S4WY00147,BRYAN MILLER,O,REP,0.00,WY,0


In [93]:
finance_data_2022_clean['CAND_NAME'].to_csv('output/finance_data_2022_names.csv', index=False)


# house data

In [84]:
house_data_2022 = house_data[(house_data['year'] == 2022) & (house_data['writein'] == False) & (house_data['unofficial'] == False) & (house_data['mode'] == 'TOTAL') & (house_data['stage'] == 'GEN')]
drop_columns = ['state_cen', 'state_ic', 'version', 'fusion_ticket', 'writein', 'unofficial', 'mode', 'stage', 'state_fips']
house_data_2022 = house_data_2022.drop(columns=drop_columns)

In [87]:
house_data_2022['candidate'].to_csv('output/house_data_2022_names.csv', index=False)

In [96]:
house_data_2022

Unnamed: 0,year,state,state_po,office,district,runoff,special,candidate,party,candidatevotes,totalvotes
31103,2022,ALABAMA,AL,US HOUSE,1,False,False,JERRY L CARL,REPUBLICAN,140592,168150
31104,2022,ALABAMA,AL,US HOUSE,1,False,False,ALEXANDER M REMREY,LIBERTARIAN,26369,168150
31106,2022,ALABAMA,AL,US HOUSE,2,False,False,BARRY MOORE,REPUBLICAN,137460,198961
31107,2022,ALABAMA,AL,US HOUSE,2,False,False,PHYLLIS HARVEY-HALL,DEMOCRAT,58014,198961
31108,2022,ALABAMA,AL,US HOUSE,2,False,False,JONATHAN REALZ,LIBERTARIAN,3396,198961
...,...,...,...,...,...,...,...,...,...,...,...
32446,2022,WYOMING,WY,US HOUSE,0,False,False,LYNNETTE GREYBULL,DEMOCRAT,47250,198198
32447,2022,WYOMING,WY,US HOUSE,0,False,False,RICHARD BRUBAKER,LIBERTARIAN,5420,198198
32448,2022,WYOMING,WY,US HOUSE,0,False,False,MARISSA JOY SELVIG,CONSTITUTION,4505,198198
32450,2022,WYOMING,WY,US HOUSE,0,False,False,UNDERVOTES,,3660,198198


In [95]:
finance_data_2022_clean.columns

Index(['CAND_ID', 'CAND_NAME', 'CAND_ICI', 'CAND_PTY_AFFILIATION', 'TTL_DISB',
       'CAND_OFFICE_ST', 'CAND_OFFICE_DISTRICT'],
      dtype='object')

In [99]:
from fuzzywuzzy import fuzz
import numpy as np

# Function to find the best match between two name lists
def find_best_match(name, candidate_list):
    best_match = None
    best_ratio = 0
    for candidate in candidate_list:
        ratio = fuzz.token_sort_ratio(name, candidate)
        if ratio > best_ratio:
            best_ratio = ratio
            best_match = candidate
    return best_match if best_ratio > 70 else None  # Return None if no good match found

# Prepare the datasets
finance_data = finance_data_2022_clean.copy()
house_data = house_data_2022.copy()

# Ensure district is of the same type in both dataframes
finance_data['CAND_OFFICE_DISTRICT'] = finance_data['CAND_OFFICE_DISTRICT'].astype(str)
house_data['district'] = house_data['district'].astype(str)

# Group finance data by state and district
finance_grouped = finance_data.groupby(['CAND_OFFICE_ST', 'CAND_OFFICE_DISTRICT'])

# Initialize an empty list to store merged data
merged_data = []

# Iterate through each district in house_data
for (state, district), house_group in house_data.groupby(['state_po', 'district']):
    finance_group = finance_grouped.get_group((state, district)) if (state, district) in finance_grouped.groups else None
    
    if finance_group is not None:
        house_candidates = house_group['candidate'].tolist()
        finance_candidates = finance_group['CAND_NAME'].tolist()
        
        for _, house_row in house_group.iterrows():
            best_match = find_best_match(house_row['candidate'], finance_candidates)
            if best_match:
                finance_row = finance_group[finance_group['CAND_NAME'] == best_match].iloc[0]
                merged_row = house_row.to_dict()
                merged_row.update(finance_row.to_dict())
                merged_data.append(merged_row)
            # If there's no match, append the row with only house data
            else:
                merged_row = house_row.to_dict()
                merged_data.append(merged_row)

# Create the final merged dataframe
merged_df = pd.DataFrame(merged_data)

# Save the merged dataset to a CSV file
merged_df.to_csv('output/merged_house_finance_data_2022.csv', index=False)


   year    state state_po    office district  runoff  special  \
0  2022   ALASKA       AK  US HOUSE        0   False    False   
1  2022   ALASKA       AK  US HOUSE        0   False    False   
2  2022   ALASKA       AK  US HOUSE        0   False    False   
3  2022   ALASKA       AK  US HOUSE        0   False    False   
4  2022  ALABAMA       AL  US HOUSE        1   False    False   

              candidate        party  candidatevotes  totalvotes    CAND_ID  \
0           SARAH PALIN   REPUBLICAN           67866      263610  H2AK00226   
1           NICK BEGICH   REPUBLICAN           61513      263610        NaN   
2  MARY SATTLER PELTOLA     DEMOCRAT          128553      263610  H2AK01158   
3             CHRIS BYE  LIBERTARIAN            4570      263610  H2AK01216   
4          JERRY L CARL   REPUBLICAN          140592      168150  H0AL01055   

             CAND_NAME CAND_ICI CAND_PTY_AFFILIATION    TTL_DISB  \
0          SARAH PALIN        O                  REP  1924781.35  

# plots and things

In [None]:
# Merge datasets
merged_data = pd.merge(house_data[house_data['year'] == 2022], 
                       finance_data_2022, 
                       left_on=['state_po', 'district'], 
                       right_on=['CAND_OFFICE_ST', 'CAND_OFFICE_DISTRICT'],
                       how='left')

# Calculate total votes and spending per district
district_summary = merged_data.groupby(['state_po', 'district']).agg({
    'candidatevotes': 'sum',
    'TTL_RECEIPTS': 'sum'
}).reset_index()

district_summary['spending_per_vote'] = district_summary['TTL_RECEIPTS'] / district_summary['candidatevotes']

In [16]:
# Plot spending per vote vs total votes
plt.figure(figsize=(12, 8))
plt.scatter(district_summary['candidatevotes'], district_summary['spending_per_vote'], alpha=0.5)
plt.xlabel('Total Votes in District')
plt.ylabel('Spending per Vote ($)')
plt.title('Campaign Spending Efficiency in 2022 House Elections')
plt.xscale('log')
plt.yscale('log')
plt.grid(True)
plt.savefig('output/spending_vs_votes.png')
plt.close()

# Analyze party spending
party_spending = merged_data.groupby('party').agg({
    'TTL_RECEIPTS': 'sum',
    'candidatevotes': 'sum'
}).reset_index()

party_spending['spending_per_vote'] = party_spending['TTL_RECEIPTS'] / party_spending['candidatevotes']

print("Party Spending Analysis:")
print(party_spending)

# Analyze incumbent vs challenger performance
incumbent_performance = merged_data.groupby('CAND_ICI').agg({
    'TTL_RECEIPTS': 'mean',
    'candidatevotes': 'mean'
}).reset_index()

incumbent_performance['spending_per_vote'] = incumbent_performance['TTL_RECEIPTS'] / incumbent_performance['candidatevotes']

print("\nIncumbent vs Challenger Performance:")
print(incumbent_performance)

# Save results to CSV
district_summary.to_csv('output/district_summary_2022.csv', index=False)
party_spending.to_csv('output/party_spending_2022.csv', index=False)
incumbent_performance.to_csv('output/incumbent_performance_2022.csv', index=False)

Party Spending Analysis:
                           party  TTL_RECEIPTS  candidatevotes  \
0                       ALLIANCE  9.170004e+06           26340   
1    AMERICAN CONSTITUTION PARTY  5.234086e+06           67822   
2                AMERICAN VALUES  1.107344e+07            6180   
3                    CANNON FIRE  2.237513e+06            3852   
4          COLORADO CENTER PARTY  9.169601e+06           12494   
5                   CONSERVATIVE  1.544611e+08         2252730   
6                   CONSTITUTION  3.676512e+07          315879   
7                       DEMOCRAT  2.214453e+09       341250110   
8                     DEMOCRATIC  5.159211e+07           99010   
9        DEMOCRATIC-FARMER-LABOR  1.637539e+07         3597195   
10                FOR THE PEOPLE  2.237513e+06           20058   
11              GOD SAVE AMERICA  1.217803e+07            8928   
12  GRASSROOTS-LEGALIZE CANNABIS  6.674943e+06           74145   
13                         GREEN  9.537437e+06     