In [196]:
import pandas as pd
import json

In [197]:
# Columns of the FEC candidate finance table that the rest of the notebook uses.
FEC_subset_columns = ['year', 'CAND_ID', 'CAND_NAME', 'CAND_ICI', 'CAND_PTY_AFFILIATION', 'TTL_DISB', 'CAND_OFFICE_ST', 'CAND_OFFICE_DISTRICT']

# Read the table, keep only those columns, and rename the candidate id column,
# all in one chained expression that yields a new frame.
fec_data = (
    pd.read_csv('data/FEC_data.csv')[FEC_subset_columns]
    .rename(columns={'CAND_ID': 'FEC_candidate_id'})
)


In [198]:
# Bookkeeping columns that are not needed once the rows have been filtered.
drop_columns = ['state_cen', 'state_ic', 'version', 'fusion_ticket', 'writein', 'unofficial', 'mode', 'stage', 'state_fips', 'office']

# Keep only rows that are not write-ins, not unofficial, have mode 'TOTAL' and
# stage 'GEN'; then rename the vote/candidate columns and drop the bookkeeping
# columns. Every step returns a new frame, so nothing is mutated in place and
# no chained-assignment warning has to be suppressed.
house_data = (
    pd.read_csv('data/1976-2022-house.csv')
    .loc[
        lambda df: df['writein'].eq(False)
        & df['unofficial'].eq(False)
        & df['mode'].eq('TOTAL')
        & df['stage'].eq('GEN')
    ]
    .rename(columns={'candidate': 'candidate_name', 'candidatevotes': 'candidate_votes', 'totalvotes': 'total_votes'})
    .drop(columns=drop_columns)
)

# Merging FEC finance data with House election results

In [199]:
from fuzzywuzzy import fuzz

def find_best_match(name, candidate_list, threshold=60):
    """Return the entry of ``candidate_list`` that best fuzzy-matches ``name``.

    Each candidate is scored with ``fuzz.token_sort_ratio(name, candidate)``.
    The highest-scoring candidate is returned; when several candidates share
    the highest score, the first one in ``candidate_list`` wins.

    Args:
        name: The name to look up.
        candidate_list: Iterable of names to compare against.
        threshold: The best score must be strictly greater than this value for
            a match to be reported. Defaults to 60.

    Returns:
        The best-matching candidate, or None when ``candidate_list`` is empty
        or no candidate scores above ``threshold``.
    """
    best_match = None
    best_ratio = 0
    for candidate in candidate_list:
        ratio = fuzz.token_sort_ratio(name, candidate)
        if ratio > best_ratio:
            best_ratio, best_match = ratio, candidate
            # 100 is the maximum score, so no later candidate can beat it.
            if ratio == 100:
                break
    return best_match if best_ratio > threshold else None

In [200]:
# Fuzzy-join House election rows to FEC finance rows, one election year and one
# (state, district) at a time. Every House row is kept: rows without a finance
# match (including rows in districts that have no FEC records at all) carry
# only the House columns.
# One dict per House candidate row; converted to a DataFrame once at the end.
all_merged_data = []

# NOTE(review): range() excludes its stop value, so this covers 2010-2020 only;
# confirm that leaving 2022 out is intentional.
for year in range(2010, 2022, 2):
    # .assign() returns new frames, so the district keys can be cast to str
    # without writing into a slice of the full tables (and without globally
    # silencing pandas' chained-assignment warning).
    house_data_year = house_data[house_data['year'] == year].assign(
        district=lambda df: df['district'].astype(str)
    )
    fec_data_year = fec_data[fec_data['year'] == year].assign(
        CAND_OFFICE_DISTRICT=lambda df: df['CAND_OFFICE_DISTRICT'].astype(str)
    )

    # Finance rows keyed by (state, district) for direct lookup.
    finance_by_district = {
        key: group
        for key, group in fec_data_year.groupby(['CAND_OFFICE_ST', 'CAND_OFFICE_DISTRICT'])
    }

    # Iterate through each district in house_data
    for (state, district), house_group in house_data_year.groupby(['state_po', 'district']):
        finance_group = finance_by_district.get((state, district))
        # With no finance rows for the district the candidate list is empty,
        # so find_best_match returns None and the House row is kept as-is.
        finance_candidates = [] if finance_group is None else finance_group['CAND_NAME'].tolist()

        for _, house_row in house_group.iterrows():
            merged_row = house_row.to_dict()
            best_match = find_best_match(house_row['candidate_name'], finance_candidates)
            if best_match:
                # First finance row carrying the matched name; its columns are
                # layered on top of the House columns.
                finance_row = finance_group[finance_group['CAND_NAME'] == best_match].iloc[0]
                merged_row.update(finance_row.to_dict())
            all_merged_data.append(merged_row)

# Create the final merged dataframe with all years
merged_df = pd.DataFrame(all_merged_data)
# These FEC columns duplicate information already present in the House columns.
# errors='ignore' keeps this step from failing if no finance row was merged.
merged_df = merged_df.drop(
    columns=['CAND_NAME', 'CAND_PTY_AFFILIATION', 'CAND_OFFICE_ST', 'CAND_OFFICE_DISTRICT'],
    errors='ignore',
)

# Row id is year + state postal code + district, placed as the first column.
merged_df['id'] = merged_df['year'].astype(str) + merged_df['state_po'] + merged_df['district']
merged_df = merged_df[['id'] + [col for col in merged_df.columns if col != 'id']]

# Save the merged dataset with all years to a single CSV file
merged_df.to_csv('data/cleaned/merged_house_finance_data_all_years.csv', index=False)

In [201]:
import json

# Load the scraped Wikipedia data. The files are read explicitly as UTF-8 (the
# standard JSON encoding) so that non-ASCII characters in the scrape are
# decoded the same way on every platform instead of depending on the locale's
# default encoding.
with open('data/wiki/wiki_scrape_nhe.json', 'r', encoding='utf-8') as file:
    nhe_data = json.load(file)

with open('data/wiki/wiki_scrape_pvi.json', 'r', encoding='utf-8') as file:
    pvi_data = json.load(file)

In [202]:
# For every year in the NHE scrape, compute the Republican-minus-Democratic
# percentage and store it in the 'NHE' column of that year's rows.
for year, data in nhe_data.items():
    # Percentage per major party; a party missing from the entry list stays 0.
    shares = {'Republican Party': 0, 'Democratic Party': 0}
    for entry in data:
        party = entry['party']
        if party in shares:
            shares[party] = entry['percentage']

    percent_difference = shares['Republican Party'] - shares['Democratic Party']

    # Write the rounded margin onto every row of the matching year.
    rows_for_year = merged_df['year'] == int(year)
    merged_df.loc[rows_for_year, 'NHE'] = round(percent_difference, 2)

    # Report the margin for this year.
    print(f"Year: {year}, Percent difference between Republican and Democratic votes: {percent_difference:.2f}%")

Year: 2022, Percent difference between Republican and Democratic votes: 2.72%
Year: 2020, Percent difference between Republican and Democratic votes: -3.10%
Year: 2018, Percent difference between Republican and Democratic votes: -8.60%
Year: 2016, Percent difference between Republican and Democratic votes: 1.10%
Year: 2014, Percent difference between Republican and Democratic votes: 5.70%
Year: 2012, Percent difference between Republican and Democratic votes: -1.10%
Year: 2010, Percent difference between Republican and Democratic votes: 6.80%
Year: 2008, Percent difference between Republican and Democratic votes: -10.60%


In [203]:
def convert_pvi(pvi):
    """Convert a PVI label into a signed integer.

    'R+<n>' becomes +n, 'D+<n>' becomes -n, and any value containing 'even'
    (case-insensitive) becomes 0. A trailing bracketed footnote marker such as
    'R+3[a]' and surrounding whitespace are ignored.

    Args:
        pvi: The PVI label, normally a string.

    Returns:
        The signed integer, or None (after printing a message) when the value
        is not a string or does not follow one of the recognised forms.
    """
    if isinstance(pvi, str):
        # Drop anything from the first '[' on (footnote markers), then trim.
        text = pvi.split('[', 1)[0].strip()
        for prefix, sign in (('R+', 1), ('D+', -1)):
            if prefix in text:
                digits = text.replace(prefix, '').strip()
                # isdecimal() guarantees int() will accept the remaining text.
                if digits.isdecimal():
                    return sign * int(digits)
        if 'even' in pvi.lower():
            return 0

    print(f"PVI value not recognized: {pvi}")
    return None


In [206]:
# NOTE(review): leftover inspection of the last district label seen by the PVI
# loop. In the visible notebook `district_name` is only assigned inside that
# loop (the following cell), so running the notebook top-to-bottom on a fresh
# kernel raises NameError here — consider deleting this cell.
district_name

'New Hampshire\xa01'

In [208]:
def _parse_district_number(district_name):
    """Return the district number at the end of a scraped district label.

    Labels containing 'at-large' (any letter case) map to 0. Otherwise the
    label is split on whitespace and the last token is converted to int;
    str.split() with no argument also splits on the non-breaking space
    (U+00A0) that appears in the scraped labels.

    Raises:
        ValueError: if the last token is not an integer.
        IndexError: if the label is empty or only whitespace.
    """
    if 'at-large' in district_name.lower():
        return 0
    return int(district_name.split()[-1])


# Attach each district's converted PVI to the matching rows of merged_df.
for year, state_data in pvi_data.items():
    # The year and state comparisons do not change inside the inner loops, so
    # they are computed once per year / per state.
    year_mask = merged_df['year'] == int(year)
    for state, districts in state_data.items():
        state_mask = year_mask & (merged_df['state'] == state.upper())
        for district_info in districts:
            district_name = district_info['district']
            pvi = district_info['pvi']

            district_number = _parse_district_number(district_name)

            # merged_df stores districts as strings, hence the str() here.
            mask = state_mask & (merged_df['district'] == str(district_number))
            converted_pvi = convert_pvi(pvi)
            merged_df.loc[mask, 'pvi'] = converted_pvi


In [210]:
# Drop the District of Columbia rows, then write the result to the cleaned CSV.
not_dc = merged_df['state'].ne("DISTRICT OF COLUMBIA")
merged_df = merged_df.loc[not_dc]
merged_df.to_csv('data/cleaned/merged_house_finance_data_all_years.csv', index=False)