In [26]:
import pandas as pd

In [27]:
# Convert CAND_NAME from "LAST, FIRST" to "FIRST LAST"
def convert_name(name):
    """Reformat a ``"LAST, FIRST"`` name as ``"FIRST LAST"``.

    Args:
        name: The value to reformat. Only strings containing exactly one
            comma are rewritten.

    Returns:
        ``"FIRST LAST"`` (each part stripped of surrounding whitespace) when
        ``name`` is a string with exactly one comma. Any other value -- a
        string with zero or several commas, or a non-string such as
        ``None``/NaN -- is returned unchanged.
    """
    # Missing values in a pandas column arrive as NaN (a float) or None,
    # neither of which has ``split``; pass them through instead of raising.
    if not isinstance(name, str):
        return name

    parts = name.split(',')
    if len(parts) != 2:
        # Not the expected two-part format (e.g. "SMITH, JOHN, JR"): leave as is.
        return name

    last, first = (part.strip() for part in parts)
    # The outer strip keeps an empty half (e.g. "SMITH,") from leaving a
    # dangling space in the result.
    return f"{first} {last}".strip()

In [28]:
def handle_numeric_conversion(df, column, file_name, error_dir='error'):
    """Convert ``column`` to integers, flagging values that cannot be parsed.

    Values that ``pd.to_numeric`` cannot parse, and values that were already
    missing, are replaced with the sentinel ``-1``; no rows are dropped. The
    affected rows are written, with their original unconverted values, to
    ``<error_dir>/nan_<column>_rows_<file_name>.csv`` for inspection. No file
    is written when every value parses.

    Args:
        df: DataFrame containing ``column``. It is not modified; the
            conversion is applied to a copy.
        column: Name of the column to convert.
        file_name: Label embedded in the error CSV's file name.
        error_dir: Directory that receives the error CSV. It is created if it
            does not exist.

    Returns:
        A copy of ``df`` in which ``column`` has an integer dtype. Fractional
        values are truncated by the integer cast, and ``-1`` marks values
        that were missing or non-numeric.
    """
    import os  # local import so this cell does not depend on edits to the imports cell

    numeric = pd.to_numeric(df[column], errors='coerce')
    unparsed = numeric.isna()

    # Write the offending rows before anything is overwritten, so the CSV
    # shows the raw values that failed to parse rather than NaN placeholders.
    num_removed = int(unparsed.sum())
    if num_removed:
        os.makedirs(error_dir, exist_ok=True)
        error_file = os.path.join(error_dir, f'nan_{column}_rows_{file_name}.csv')
        df[unparsed].to_csv(error_file, index=False)

    # Despite the word "removed", these rows are kept and flagged with -1 below.
    print(f"Number of rows removed due to NaN values in {column}: {num_removed}")

    # Assign into a copy: the frame passed in may be a filtered slice of
    # another frame, and writing into such a slice triggers pandas'
    # SettingWithCopyWarning.
    result = df.copy()
    result[column] = numeric.fillna(-1).astype(int)
    return result
    
datasets = {}

# Column names assigned to the pipe-delimited weball files, in file order.
# Defined once because the list is the same for every year.
# NOTE(review): 'GEN_ELECTION_PRECENT' looks like a misspelling of PERCENT; it
# is left untouched because it ends up as a header in the output CSV.
finance_columns = ['CAND_ID', 'CAND_NAME', 'CAND_ICI', 'PTY_CD', 'CAND_PTY_AFFILIATION', 'TTL_RECEIPTS', 'TRANS_FROM_AUTH', 'TTL_DISB',
                'TRANS_TO_AUTH', 'COH_BOP', 'COH_COP', 'CAND_CONTRIB', 'CAND_LOANS', 'OTHER_LOANS', 'CAND_LOAN_REPAY',
                'OTHER_LOAN_REPAY', 'DEBTS_OWED_BY', 'TTL_INDIV_CONTRIB', 'CAND_OFFICE_ST', 'CAND_OFFICE_DISTRICT', 'SPEC_ELECTION',
                'PRIM_ELECTION', 'RUN_ELECTION', 'GEN_ELECTION', 'GEN_ELECTION_PRECENT', 'OTHER_POL_CMTE_CONTRIB', 'POL_PTY_CONTRIB',
                'CVG_END_DT', 'INDIV_REFUNDS', 'CMTE_REFUNDS']

# Two-digit years 04, 06, ..., 22 -- one input file per even year.
for year in range(4, 24, 2):
    year_str = f"{year:02d}"
    full_year = "20" + year_str

    finance_data = pd.read_csv(f'weball{year_str}.txt', delimiter='|', header=None)
    finance_data.columns = finance_columns
    finance_data['year'] = full_year

    # Keep rows that have an office state and whose coverage end date ends
    # with this file's four-digit year.
    has_state = finance_data['CAND_OFFICE_ST'].notna()
    ends_in_year = finance_data['CVG_END_DT'].str[-4:] == full_year
    # .copy() makes the filtered frame independent of the frame it was cut
    # from, so the column assignments below do not raise SettingWithCopyWarning.
    finance_data = finance_data[has_state & ends_in_year].copy()

    # Coerce CAND_OFFICE_DISTRICT to integers (unparseable values become -1).
    finance_data_clean = handle_numeric_conversion(finance_data, 'CAND_OFFICE_DISTRICT', f'weball{year_str}')

    # Rewrite "LAST, FIRST" names as "FIRST LAST".
    finance_data_clean['CAND_NAME'] = finance_data_clean['CAND_NAME'].apply(convert_name)
    datasets[year] = finance_data_clean

combined_data = pd.concat(datasets.values(), ignore_index=True)

# Save the combined data to a CSV file
combined_data.to_csv('../FEC_data.csv', index=False)


Number of rows removed due to NaN values in CAND_OFFICE_DISTRICT: 0
Number of rows removed due to NaN values in CAND_OFFICE_DISTRICT: 0
Number of rows removed due to NaN values in CAND_OFFICE_DISTRICT: 0
Number of rows removed due to NaN values in CAND_OFFICE_DISTRICT: 1
Number of rows removed due to NaN values in CAND_OFFICE_DISTRICT: 1
Number of rows removed due to NaN values in CAND_OFFICE_DISTRICT: 4
Number of rows removed due to NaN values in CAND_OFFICE_DISTRICT: 3
Number of rows removed due to NaN values in CAND_OFFICE_DISTRICT: 1
Number of rows removed due to NaN values in CAND_OFFICE_DISTRICT: 0
Number of rows removed due to NaN values in CAND_OFFICE_DISTRICT: 5
