In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd

In [None]:
# Notebook-wide display option: do not truncate columns when a DataFrame is shown.
pd.set_option('display.max_columns', None)

In [None]:
# Load the raw inputs: the county boundary shapefile (GeoDataFrame) and the
# county-level CSV tables (plain DataFrames).
# counties_senate_districts = gpd.read_file('data/election/cb_2018_us_county_within_senate_districts/cb_2018_us_county_within_cd116_500k.shp')
counties = gpd.read_file('data/election/cb_2018_us_county_5m/cb_2018_us_county_5m.shp')
# NOTE(review): election_senate_2022_county is not referenced again in the visible part of the notebook.
election_senate_2022_county = pd.read_csv('data/election/senate_2022.csv')
education_counties = pd.read_csv('data/election/us_counties_education_1970_2022.csv')
unemployment_counties = pd.read_csv('data/election/us_counties_unemployment_2000_2022.csv')
poverity_counties = pd.read_csv('data/election/us_counties_poverty_2021.csv')
population_counties = pd.read_csv('data/election/us_counties_population_2020_2023.csv')

In [None]:
# for all STATEFP in counties, remove the leading 0
# (done via an int round-trip, e.g. '01' -> 1 -> '1').
# Presumably needed because the CSV FIPS codes carry no leading zero — TODO confirm.
counties['STATEFP'] = counties['STATEFP'].astype(int).astype(str)

In [None]:
# Build the join key used by every later merge: state code followed by county code,
# concatenated as strings.
counties['FIPS Code'] = counties['STATEFP'] + counties['COUNTYFP']

In [None]:
# Give every CSV table a string-typed 'FIPS Code' column so it can be joined to `counties`.
# The education table already uses that column name; the other tables get a new
# 'FIPS Code' column derived from their own FIPS column ('FIPS_Code' / 'FIPStxt').
education_counties['FIPS Code'] = education_counties['FIPS Code'].astype(str)
unemployment_counties['FIPS Code'] = unemployment_counties['FIPS_Code'].astype(str)
poverity_counties['FIPS Code'] = poverity_counties['FIPS_Code'].astype(str)
population_counties['FIPS Code'] = population_counties['FIPStxt'].astype(str)

In [None]:
# Each statistics table is joined to the county shapes on 'FIPS Code'.
# No `how=` is passed, so pandas' default join type applies to all four merges.
# merge the education data with the counties data on the FIPS Code column
education_counties_merged = education_counties.merge(counties, on='FIPS Code')
# merge the unemployment data with the counties data on the FIPS Code column
unemployment_counties_merged = unemployment_counties.merge(counties, on='FIPS Code')
# merge the poverity data with the counties data on the FIPS Code column
poverity_counties_merged = poverity_counties.merge(counties, on='FIPS Code')
# merge the population data with the counties data on the FIPS Code column
population_counties_merged = population_counties.merge(counties, on='FIPS Code')

In [None]:
# Remove the shapefile-derived columns (geometry included) from the poverty,
# unemployment and population merges. The education merge is not touched here,
# so it is the only one that still carries these columns afterwards.
_shapefile_columns = ['STATEFP', 'COUNTYFP', 'COUNTYNS', 'AFFGEOID', 'GEOID',
                      'NAME', 'LSAD', 'ALAND', 'AWATER', 'geometry']
for _frame in (poverity_counties_merged, unemployment_counties_merged, population_counties_merged):
    _frame.drop(columns=_shapefile_columns, inplace=True)

In [None]:
# Combine the four per-topic tables into one wide table, joining on 'FIPS Code'
# each time (education first, then unemployment, poverty and population).
merged = (
    education_counties_merged
    .merge(unemployment_counties_merged, on='FIPS Code')
    .merge(poverity_counties_merged, on='FIPS Code')
    .merge(population_counties_merged, on='FIPS Code')
)

In [None]:
# Wrap the merged table in a GeoDataFrame, using its 'geometry' column as the geometry.
merged_geo = gpd.GeoDataFrame(merged, geometry=merged['geometry'])

In [None]:
# Drop every column whose name mentions one of these markers
# ('Urban', 'Rural', '1970', '1980', '1990').
_unwanted_markers = ('Urban', 'Rural', '1970', '1980', '1990')
cols_to_drop = [
    name for name in merged_geo.columns
    if any(marker in name for marker in _unwanted_markers)
]
merged_geo.drop(columns=cols_to_drop, inplace=True)

# Drop the shapefile identifier columns, then the duplicated state / area-name /
# FIPS columns that the successive merges produced.
_shapefile_id_columns = ['STATEFP', 'COUNTYFP', 'COUNTYNS', 'AFFGEOID', 'GEOID', 'NAME', 'LSAD']
merged_geo.drop(columns=_shapefile_id_columns, inplace=True)
_duplicate_label_columns = ['FIPS_Code_x', 'State_y', 'Area_Name_x', 'FIPS_Code_y',
                            'Stabr', 'Area_name', 'State', 'Area_Name_y']
merged_geo.drop(columns=_duplicate_label_columns, inplace=True)

# Keep a single state column under the plain name 'State'.
merged_geo.rename(columns={'State_x': 'State'}, inplace=True)

# Exclude Alaska, Hawaii and DC.
_is_excluded_state = merged_geo['State'].isin(['AK', 'HI', 'DC'])
merged_geo = merged_geo[~_is_excluded_state]
merged_geo.head()

In [None]:
# in counties, show entries where NAME is 'Orleans'
# (inspection only; nothing is assigned)
counties[counties['NAME'] == 'Orleans']

In [None]:
# save the merged data to a geojson file
# (the same path is read back in the next cell and overwritten again further down)
merged_geo.to_file('data/election/final_data/counties_data.geojson', driver='GeoJSON')

In [None]:
# read the geojson file
# (round-trips the file written in the previous cell into a new GeoDataFrame)
county_level_data = gpd.read_file('data/election/final_data/counties_data.geojson')

In [None]:
# Preview the first rows of the re-loaded county data.
county_level_data.head()

In [None]:
# add a column to county level data that is the full name of the state, call it State_name
# The mapping has no entries for AK, HI or DC. Any 'State' value that is not a key
# of this dict is mapped to NaN by Series.map.
state_names = {
    'AL': 'Alabama',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming'
}

# Translate the two-letter 'State' value of each row into the full state name.
county_level_data['State_name'] = county_level_data['State'].map(state_names)

In [None]:
# Overwrite the counties GeoJSON so the saved file also contains the new State_name column.
county_level_data.to_file('data/election/final_data/counties_data.geojson', driver='GeoJSON')

In [None]:
# create a column called state_county that is the concatenation of the state and county name, underscore separated
# (full state name + '_' + the 'Area name' column)
county_level_data['state_county'] = county_level_data['State_name'] + '_' + county_level_data['Area name']

In [None]:
# change state_county to lower case
# (the presidential-results key is lower-cased the same way before the two are merged)
county_level_data['state_county'] = county_level_data['state_county'].str.lower()

In [None]:
# county_level_data[~county_level_data['state_county'].str.contains('county')]
# Show the unique final word of each state_county value, i.e. the text after the
# last space. Splitting on '_' instead would return the whole county name for
# every row rather than just its ending word.
county_level_data['state_county'].str.rsplit(' ', n=1).str[-1].unique()

In [None]:
# show all rows where state_county starts with maine
# (inspection only; nothing is assigned)
county_level_data[county_level_data['state_county'].str.startswith('maine')]

In [None]:
# show all rows where state_county starts with maine
# NOTE(review): this cell is an exact duplicate of the previous one and can be deleted.
county_level_data[county_level_data['state_county'].str.startswith('maine')]

In [None]:
# Scatterplot of PCTPOVALL_2021 vs Unemployment_rate_2021, with a least-squares
# best-fit line drawn on top.
fig, ax = plt.subplots()
merged_geo.plot.scatter(x='PCTPOVALL_2021', y='Unemployment_rate_2021', ax=ax)

# Fit only on rows where both values are present and numeric: np.polyfit cannot
# handle NaN, and non-numeric entries are coerced to NaN before being dropped.
_fit_data = (
    merged_geo[['PCTPOVALL_2021', 'Unemployment_rate_2021']]
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)
# A line needs at least two points with distinct x values.
if len(_fit_data) >= 2 and _fit_data['PCTPOVALL_2021'].nunique() >= 2:
    _slope, _intercept = np.polyfit(
        _fit_data['PCTPOVALL_2021'], _fit_data['Unemployment_rate_2021'], deg=1
    )
    _x_line = np.array([_fit_data['PCTPOVALL_2021'].min(), _fit_data['PCTPOVALL_2021'].max()])
    ax.plot(
        _x_line,
        _slope * _x_line + _intercept,
        color='red',
        label=f'best fit: y = {_slope:.2f}x + {_intercept:.2f}',
    )
    ax.legend()

ax.set_title('PCTPOVALL_2021 vs Unemployment_rate_2021')
ax.set_xlabel('PCTPOVALL_2021')
ax.set_ylabel('Unemployment_rate_2021')
plt.show()

In [None]:
# read in the 2016 and 2020 election data
# (county-level presidential results, one CSV per election year)
election_2016 = pd.read_csv('data/election/2016_US_County_Level_Presidential_Results.csv')
election_2020 = pd.read_csv('data/election/2020_US_County_Level_Presidential_Results.csv')

In [None]:
# read the 2012 election data
# (taken from the combined 12-16 file; its 2016 columns are dropped in a later cell)
election_2012 = pd.read_csv('data/election/US_County_Level_Presidential_Results_12-16.csv')

In [None]:
# Display the raw 2012/2016 table as loaded (inspection only).
election_2012

In [None]:
# Keep only the non-2016 part of the combined 12-16 table: every column whose
# name contains '2016' is removed.
cols_to_drop = election_2012.filter(like='2016').columns.tolist()
election_2012.drop(columns=cols_to_drop, inplace=True)

In [None]:
# Display the raw 2020 table as loaded (inspection only).
election_2020

In [None]:
# Exclude Alaska, Hawaii and DC from every election table. The 2016 and 2012
# tables identify states by abbreviation, the 2020 table by full name.
# .copy() makes each result an independent frame: later cells assign columns to
# these frames and rename them in place, which on a boolean-mask slice triggers
# pandas' SettingWithCopyWarning.
_excluded_abbrs = ['AK', 'HI', 'DC']
_excluded_names = ['Alaska', 'Hawaii', 'District of Columbia']
election_2016 = election_2016[~election_2016['state_abbr'].isin(_excluded_abbrs)].copy()
election_2020 = election_2020[~election_2020['state_name'].isin(_excluded_names)].copy()
election_2012 = election_2012[~election_2012['state_abbr'].isin(_excluded_abbrs)].copy()

In [None]:
# Display the 2012 table after the state filter (inspection only).
election_2012

In [None]:
# Display the 2016 table after the state filter (inspection only).
election_2016

In [None]:
# Display the 2020 table after the state filter (inspection only).
election_2020

In [None]:
# In both election_2016 and election_2020, append _2016 / _2020 to every column
# name that contains one of the vote-column stems below, so the two tables can be
# merged without name clashes.
_VOTE_COLUMN_STEMS = ['votes_gop', 'votes_dem', 'total_votes', 'diff', 'per_gop', 'per_dem']


def _add_year_suffix(columns, year):
    """Return `columns` with `_<year>` appended after each vote-column stem.

    Matching is by substring, as with a plain `str.replace` per stem, so a name
    such as 'per_point_diff' becomes 'per_point_diff_<year>' as well.
    The negative lookahead skips stems that already carry this year's suffix,
    which makes the cell safe to re-run (no 'votes_gop_2016_2016').

    Args:
        columns (pandas.Index): Column labels to rewrite.
        year (int | str): Election year used as the suffix.

    Returns:
        pandas.Index: The rewritten column labels.
    """
    pattern = rf"({'|'.join(_VOTE_COLUMN_STEMS)})(?!_{year})"
    return columns.str.replace(pattern, rf"\g<1>_{year}", regex=True)


election_2016.columns = _add_year_suffix(election_2016.columns, 2016)
election_2020.columns = _add_year_suffix(election_2020.columns, 2020)

In [None]:
# The 2020 table calls its FIPS column 'county_fips'; give it the name
# 'combined_fips' instead.
election_2020 = election_2020.rename(columns={'county_fips': 'combined_fips'})

In [None]:
# Change the combined_fips column in election_2016, election_2020 and election_2012 to string,
# so it can be compared with the string-typed 'FIPS Code' column during the merges below.
election_2016['combined_fips'] = election_2016['combined_fips'].astype(str)
election_2020['combined_fips'] = election_2020['combined_fips'].astype(str)
election_2012['combined_fips'] = election_2012['combined_fips'].astype(str)

In [None]:
# Merge election_2012 with merged_geo, matching combined_fips (left) against 'FIPS Code' (right).
election_2012_merged = election_2012.merge(merged_geo, left_on='combined_fips', right_on='FIPS Code')

In [None]:
# Display the 2012 + county data merge result (inspection only).
election_2012_merged

In [None]:
# Merge the 2016 results into the 2012 + county data on the combined_fips column.
election_2012_2016_merged = election_2012_merged.merge(election_2016, on='combined_fips')

In [None]:
# Display the 2012 + 2016 merge result (inspection only).
election_2012_2016_merged

In [None]:
# Merge the 2020 results into the 2012 + 2016 data on the combined_fips column.
election_2012_2016_2020_merged = election_2012_2016_merged.merge(election_2020, on='combined_fips')

In [None]:
# Display the 2012 + 2016 + 2020 merge result (inspection only).
election_2012_2016_2020_merged

In [None]:
def _without_commas(column):
    """Return an object-dtype column with every ',' removed; other columns unchanged."""
    if column.dtype == 'object':
        return column.str.replace(',', '')
    return column


# Apply the comma removal column by column. The values stay strings; no numeric
# conversion happens here.
election_2012_2016_2020_merged = election_2012_2016_2020_merged.apply(_without_commas)

In [None]:
# Remove the six *04_2021 poverty columns from the combined election table.
_pov04_columns = ['POV04_2021', 'CI90LB04_2021', 'CI90UB04_2021',
                  'PCTPOV04_2021', 'CI90LB04P_2021', 'CI90UB04P_2021']
election_2012_2016_2020_merged = election_2012_2016_2020_merged.drop(columns=_pov04_columns)

In [None]:
# Remove the duplicated county-name columns and the FIPS key columns that were
# only needed for the merges.
_merge_key_columns = ['county_name_x', 'combined_fips', 'FIPStxt', 'county_name_y']
election_2012_2016_2020_merged = election_2012_2016_2020_merged.drop(columns=_merge_key_columns)

In [None]:
# Remove leftover index columns ('Unnamed: 0_*'), redundant FIPS / state
# identifier columns and the 'Area name' column.
_leftover_id_columns = ['Unnamed: 0_x', 'FIPS', 'county_fips', 'state_fips',
                        'state_abbr_y', 'Unnamed: 0_y', 'Area name']
election_2012_2016_2020_merged = election_2012_2016_2020_merged.drop(columns=_leftover_id_columns)

In [None]:
# Remove the 'State' column from the combined election table.
election_2012_2016_2020_merged = election_2012_2016_2020_merged.drop(columns=['State'])

In [None]:
# election_2012_2016_2020_merged, show columns that contain 2012
# (inspection only; confirms which 2012 column names exist before they are used below)
election_2012_2016_2020_merged.columns[election_2012_2016_2020_merged.columns.str.contains('2012')]

In [None]:
# Add winner_2012, winner_2016 and winner_2020: 'GOP' where the GOP share is
# strictly greater than the DEM share for that year, otherwise 'DEM'
# (so ties and missing shares are labelled 'DEM').
for _year in (2012, 2016, 2020):
    _gop_share = election_2012_2016_2020_merged[f'per_gop_{_year}']
    _dem_share = election_2012_2016_2020_merged[f'per_dem_{_year}']
    election_2012_2016_2020_merged[f'winner_{_year}'] = np.where(_gop_share > _dem_share, 'GOP', 'DEM')

In [None]:
# Restore the plain name 'state_abbr' for the state_abbr_x column.
election_2012_2016_2020_merged = election_2012_2016_2020_merged.rename(columns={'state_abbr_x': 'state_abbr'})

In [None]:
# president counties 2020
# Per-candidate county results; later cells use its state, county, candidate,
# party, total_votes and won columns.
president_counties_2020 = pd.read_csv('data/election/president_county_candidate.csv')

In [None]:
# Exclude Alaska, Hawaii and DC, and keep only the DEM and REP rows.
# .copy() makes the result an independent frame: a later cell adds a
# 'state_county' column to it, which on a boolean-mask slice triggers pandas'
# SettingWithCopyWarning.
_in_excluded_state = president_counties_2020['state'].isin(['Alaska', 'Hawaii', 'District of Columbia'])
_is_major_party = president_counties_2020['party'].isin(['DEM', 'REP'])
president_counties_2020 = president_counties_2020[~_in_excluded_state & _is_major_party].copy()

In [None]:
# Number of distinct states left after filtering (sanity check).
len(president_counties_2020.state.unique())

In [None]:
# Build the same '<state>_<county>' key that county_level_data uses (case is normalised later).
president_counties_2020['state_county'] = president_counties_2020['state'] + '_' + president_counties_2020['county']

In [None]:
# Reshape to one row per county: the candidate, total_votes and won values of
# each party become their own columns (candidate_DEM, candidate_REP,
# total_votes_DEM, total_votes_REP, won_DEM, won_REP).
_wide = president_counties_2020.pivot(
    index='state_county',
    columns='party',
    values=['candidate', 'total_votes', 'won'],
)
# Flatten the two-level (value, party) column labels into '<value>_<party>'.
_wide.columns = [f'{_field}_{_party}'.strip() for _field, _party in _wide.columns]
president_counties_2020_fixed = _wide.reset_index()
president_counties_2020_fixed

In [None]:
# Add winner_2020 to president_counties_2020_fixed: 'DEM' where won_DEM is True, otherwise 'REP'.
# won_REP is not consulted, so a row whose won_DEM is False or missing is labelled 'REP'.
# NOTE(review): the label here is 'REP', while the winner_* columns of
# election_2012_2016_2020_merged use 'GOP'.
president_counties_2020_fixed['winner_2020'] = np.where(president_counties_2020_fixed['won_DEM'] == True, 'DEM', 'REP')
president_counties_2020_fixed

In [None]:
# Show the rows whose state_county contains 'city' (inspection only).
president_counties_2020_fixed[president_counties_2020_fixed.state_county.str.contains('city')]

In [None]:
# Append ' County' to every state_county value that does not already contain
# 'Parish', 'city' or 'County'; values that do are left as they are.
_names = president_counties_2020_fixed['state_county']
_has_suffix = (
    _names.str.contains('Parish')
    | _names.str.contains('city')
    | _names.str.contains('County')
)
president_counties_2020_fixed['state_county'] = np.where(_has_suffix, _names, _names + ' County')

In [None]:
# Display the table after the ' County' suffix fix (inspection only).
president_counties_2020_fixed

In [None]:
def _sorted_sample(values, limit):
    """Return up to `limit` items of `values` in a deterministic order."""
    try:
        ordered = sorted(values)
    except TypeError:
        # Mixed key types (e.g. str and int) cannot be compared directly.
        ordered = sorted(values, key=str)
    return ordered[:limit]


def diagnose_merge_issues(df1, df2, key_column, df1_name='Left DataFrame', df2_name='Right DataFrame'):
    """
    Diagnose why certain rows are being dropped in a merge operation.

    Prints a report on the merge key of both frames: duplicate keys, missing
    keys, unique-value counts, keys present in only one frame, keys with
    leading/trailing whitespace, keys that match only when case is ignored,
    and a few unmatched keys with similar-looking candidates from the other
    frame. Neither frame is modified.

    Args:
        df1 (pandas.DataFrame): First DataFrame
        df2 (pandas.DataFrame): Second DataFrame
        key_column (str): Column name used for merging
        df1_name (str): Name for first DataFrame for reporting
        df2_name (str): Name for second DataFrame for reporting

    Returns:
        None
    """
    frames = ((df1_name, df1), (df2_name, df2))

    # Check for duplicates in merge column
    print(f"\nChecking for duplicates in merge column '{key_column}':")
    duplicate_masks = {name: frame[key_column].duplicated(keep=False) for name, frame in frames}
    for name, frame in frames:
        print(f"{name} duplicates: {frame[key_column].duplicated().sum()}")
    for name, frame in frames:
        if duplicate_masks[name].any():
            print(f"\nDuplicate values in {name}:")
            print(frame[duplicate_masks[name]][key_column].sort_values())

    # Check for missing values
    print(f"\nChecking for missing values in '{key_column}':")
    for name, frame in frames:
        print(f"{name} missing values: {frame[key_column].isna().sum()}")

    # Compare value distributions (missing keys are excluded from the sets)
    print(f"\nComparing unique values in '{key_column}':")
    df1_values = set(df1[key_column].dropna())
    df2_values = set(df2[key_column].dropna())

    print(f"\nTotal unique values:")
    print(f"{df1_name}: {len(df1_values)}")
    print(f"{df2_name}: {len(df2_values)}")

    # Find values in df1 that aren't in df2 and vice versa
    only_in_df1 = df1_values - df2_values
    only_in_df2 = df2_values - df1_values

    if only_in_df1:
        print(f"\nValues only in {df1_name} (showing first 10):")
        print(_sorted_sample(only_in_df1, 10))

    if only_in_df2:
        print(f"\nValues only in {df2_name} (showing first 10):")
        print(_sorted_sample(only_in_df2, 10))

    # Check for leading/trailing whitespace
    print(f"\nChecking for leading/trailing whitespace in '{key_column}':")
    for name, frame in frames:
        as_text = frame[key_column].astype(str)
        padded = frame[as_text.str.strip() != as_text]
        if len(padded) > 0:
            print(f"\nValues with whitespace in {name} (showing first 5):")
            print(padded[key_column].head())

    # Check for case differences: keys that match across the frames only when
    # case is ignored. Built from the non-null value sets so that missing keys
    # (which stringify to 'nan' in both frames) cannot raise a false warning.
    print(f"\nChecking for case differences:")
    df1_lower = {str(value).lower() for value in df1_values}
    df2_lower = {str(value).lower() for value in df2_values}
    exact_lower = {str(value).lower() for value in df1_values & df2_values}
    case_diff = bool((df1_lower & df2_lower) - exact_lower)

    if case_diff:
        print("Warning: Found case differences in matching values!")

    # Sample a few unmatched values for detailed comparison. A value counts as
    # "similar" when one lower-cased string contains the other.
    if only_in_df1 or only_in_df2:
        print("\nDetailed comparison of a few unmatched values:")
        for val in _sorted_sample(only_in_df1, 3):
            needle = str(val).lower()
            close_matches = [
                candidate for candidate in df2_values
                if needle in str(candidate).lower() or str(candidate).lower() in needle
            ]
            if close_matches:
                print(f"\nValue in {df1_name}: '{val}'")
                print(f"Similar values in {df2_name}: {close_matches}")

def fix_merge_issues(df1, df2, key_column):
    """
    Try to fix common merge issues in both DataFrames.

    For each frame, works on a copy: strips leading/trailing whitespace from the
    key column when it holds text, then keeps only the first row for each key
    value. The inputs are not modified.

    Args:
        df1 (pandas.DataFrame): First DataFrame
        df2 (pandas.DataFrame): Second DataFrame
        key_column (str): Column name used for merging

    Returns:
        tuple: (cleaned_df1, cleaned_df2)
    """
    def _clean(frame):
        # Copy first so the caller's frame is left untouched.
        cleaned = frame.copy()
        key_dtype = cleaned[key_column].dtype
        # Text keys may be stored as plain object dtype or as a pandas string
        # dtype; checking only `== object` would skip the latter.
        if key_dtype == object or pd.api.types.is_string_dtype(key_dtype):
            cleaned[key_column] = cleaned[key_column].str.strip()
        # Remove any duplicate rows based on the key column (first occurrence wins).
        return cleaned.drop_duplicates(subset=[key_column])

    return _clean(df1), _clean(df2)

In [None]:
# Compare the 'state_county' keys of the two frames before merging them.
# NOTE(review): county_level_data['state_county'] is already lower-cased here, while
# president_counties_2020_fixed['state_county'] is lower-cased only in the next cell,
# so this report is produced on keys that still differ in case.
diagnose_merge_issues(county_level_data, president_counties_2020_fixed, 'state_county', 'county_level_data', 'president_counties_2020_fixed')

In [None]:
# change state_county to lower case for president_counties_2020_fixed
# (county_level_data's state_county key was lower-cased earlier in the notebook)
president_counties_2020_fixed['state_county'] = president_counties_2020_fixed['state_county'].str.lower()

In [None]:
# merge the president_counties_2020_fixed data with the county_level_data data on the state_county column
# No `how=` is passed, so pandas' default join type applies.
president_counties_2020_merged = county_level_data.merge(president_counties_2020_fixed, on='state_county')

In [None]:
# Number of distinct states present after the merge (sanity check).
len(president_counties_2020_merged.State.unique())

In [None]:
# Save the county data joined with the 2020 presidential results as GeoJSON.
president_counties_2020_merged.to_file('data/election/final_data/county_data_with_2020_election.geojson', driver='GeoJSON')

In [None]:
# Re-load the file written in the previous cell, replacing the in-memory frame.
president_counties_2020_merged = gpd.read_file('data/election/final_data/county_data_with_2020_election.geojson')

In [None]:
def _comma_free(column):
    """Return an object-dtype column with every ',' removed; other columns unchanged."""
    if column.dtype == 'object':
        return column.str.replace(',', '')
    return column


# Apply the comma removal column by column. The values stay strings; no numeric
# conversion happens here.
president_counties_2020_merged = president_counties_2020_merged.apply(_comma_free)

In [None]:
# Remove the six *04_2021 poverty columns from the county + 2020 election table.
_pov04_cols_2020 = ['POV04_2021', 'CI90LB04_2021', 'CI90UB04_2021',
                    'PCTPOV04_2021', 'CI90LB04P_2021', 'CI90UB04P_2021']
president_counties_2020_merged = president_counties_2020_merged.drop(columns=_pov04_cols_2020)

In [None]:
# Number of distinct states still present after the reload and clean-up (sanity check).
len(president_counties_2020_merged.State.unique())

In [None]:
# make election_2012_2016_2020_merged as gdf
# (wraps the combined election table in a GeoDataFrame, using its 'geometry' column)
president_counties_merged = gpd.GeoDataFrame(election_2012_2016_2020_merged, geometry=election_2012_2016_2020_merged['geometry'])

In [None]:
# save president_counties_2020_merged as geojson
# (the cleaned county + 2020 election table, written under a new file name)
president_counties_2020_merged.to_file('data/election/final_data/county_data_final.geojson', driver='GeoJSON')

In [None]:
# Save the county data with the 2012, 2016 and 2020 election results as GeoJSON.
president_counties_merged.to_file('data/election/final_data/county_data_with_elections_2012_2016_2020.geojson', driver='GeoJSON')

In [None]:
# Re-load the county + 2020 election dataset. The visible notebook writes it only
# as GeoJSON (county_data_with_2020_election.geojson); no .csv with this name is
# produced, so on a fresh run the .csv path would not exist. Reading the GeoJSON
# also gives a frame with real geometries.
counties_2020_data = gpd.read_file('data/election/final_data/county_data_with_2020_election.geojson')

In [None]:
# Display the re-loaded table (inspection only).
counties_2020_data

In [None]:
# set_geometry returns a new GeoDataFrame by default rather than changing the
# frame in place, so the result has to be assigned back for it to take effect.
counties_2020_data = counties_2020_data.set_geometry('geometry')
counties_2020_data

In [None]:
# Remove the six *04_2021 poverty columns from counties_2020_data ...
_pov04_cols_results = ['POV04_2021', 'CI90LB04_2021', 'CI90UB04_2021',
                       'PCTPOV04_2021', 'CI90LB04P_2021', 'CI90UB04P_2021']
counties_2020_data = counties_2020_data.drop(columns=_pov04_cols_results)
# ... and write the result out as GeoJSON.
counties_2020_data.to_file('data/election/final_data/county_data_with_2020_election_results.geojson', driver='GeoJSON')