In [None]:
# import all important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px


Data Sources:

Census API: https://api.census.gov/data/{year}/acs/acs5

U.S. President 1976–2020 (year/state-wise presidential race winners): https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/42MVDX

U.S. President 2000–2020 (year/county-wise presidential race winners): https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/VOQCHQ

Electoral College Weightage: https://uselectionatlas.org/INFORMATION/INFORMATION/evotes.php

IPUMS data: https://usa.ipums.org/usa-action/variables/group

In [None]:
def load_data():
  """Download the two input CSV files from Google Drive.

  Returns:
    A tuple ``(data, election_college_wt)`` of DataFrames, in that order:
    the presidential election data and the electoral college weightage
    table. Both are read with ``pd.read_csv`` straight from a
    ``drive.google.com/uc?id=...`` URL, so network access is required.
  """

  def _read_drive_csv(file_id):
    # Build the direct-download URL for a Drive file id and parse it as CSV.
    return pd.read_csv(f'https://drive.google.com/uc?id={file_id}')

  # presidential election data
  presidential_results = _read_drive_csv('142QETIzMmGo-DLlj4fcjZVXcd_QMLvf7')

  # electoral college weightage
  electoral_college_weights = _read_drive_csv('1AcHgjAHhMqRyAB0pHkA9cD9lJm4TaixV')

  return (presidential_results, electoral_college_weights)


In [None]:
def dataprep_presedentional_election_results(input_df):

  """ Clean and transform the presidential election result data.

      The output has one row per year | state | state_po with the winner/loser
      metadata and the vote share difference between them.

      Args:
        input_df: DataFrame with at least the columns year, state, state_po,
          state_fips, party_simplified, candidatevotes and totalvotes.
          It is not modified.

      Returns:
        DataFrame sorted by year, state, state_po with the columns
        year (int), state, state_po, winner_party, losing_party,
        winning_party_candidatevotes, winning_party_percvoteshare and
        voteshare_perc_diff. Only DEMOCRAT / REPUBLICAN rows are considered;
        the "winner" is the row with the most candidatevotes in the group and
        the "loser" the second one. If a group has a single such row,
        losing_party and voteshare_perc_diff are NaN instead of raising.
  """

  group_keys = ['year', 'state', 'state_po']

  # keeping important columns (explicit copy so the new column below is added
  # to an independent frame rather than to a slice of the input)
  votes = input_df[['year', 'state', 'state_po', 'state_fips', 'party_simplified',
                    'candidatevotes', 'totalvotes']].copy()

  # adding vote share column
  votes['perc_voteshare'] = votes['candidatevotes'] / votes['totalvotes']

  # removing parties other than democrat and republican
  major = votes[votes['party_simplified'].isin(['DEMOCRAT', 'REPUBLICAN'])]

  # rows without a complete group key cannot be attributed to a state/year
  # (groupby would silently drop them as well)
  major = major.dropna(subset=group_keys)

  # sort by year, state and candidatevotes so the top vote getter comes first
  ranked = major.sort_values(by=['year', 'state', 'candidatevotes'], ascending=False)

  # position of every row inside its year/state group: 0 = winner, 1 = runner-up
  position = ranked.groupby(group_keys).cumcount().to_numpy()
  winners = ranked[position == 0].set_index(group_keys)
  runners_up = ranked[position == 1].set_index(group_keys)

  # align the runner-up rows on the winners; groups without one become NaN
  losing_party = runners_up['party_simplified'].reindex(winners.index)
  losing_share = runners_up['perc_voteshare'].reindex(winners.index)

  data_transform_v1 = pd.DataFrame({
      'winner_party': winners['party_simplified'],
      'losing_party': losing_party,
      'winning_party_candidatevotes': winners['candidatevotes'],
      'winning_party_percvoteshare': winners['perc_voteshare'],
      'voteshare_perc_diff': winners['perc_voteshare'] - losing_share,
  }).sort_index().reset_index()

  # converting year to type int
  data_transform_v1['year'] = data_transform_v1['year'].astype('int')

  return (data_transform_v1)

In [None]:
def dataprep_electoral_college(df_input):

  """ Reshape the state-wise electoral college weightage table to long format.

      Args:
        df_input: wide DataFrame with a 'State' column and one column per
          election year named '1976', '1980', ..., '2024'. Not modified.

      Returns:
        DataFrame with one row per state | year and the columns
        state (upper-cased), year (int), no_of_seats and perc_ec_seats
        (no_of_seats divided by 538).
  """

  # every fourth year from 1976 through 2024, as the string column labels
  election_years = [str(year) for year in range(1976, 2025, 4)]

  # wide -> long: one row per (State, election year)
  seats_long = pd.melt(
      df_input.copy(),
      id_vars=['State'],
      value_vars=election_years,
      var_name='year',
      value_name='no_of_seats',
  )

  # normalise the join keys (upper-case state, integer year) and use the
  # lower-case 'state' column name
  seats_long = seats_long.assign(
      State=seats_long['State'].str.upper(),
      year=seats_long['year'].astype(int),
  ).rename(columns={'State': 'state'})

  # share of the 538 electoral college seats held by the state that year
  seats_long['perc_ec_seats'] = seats_long['no_of_seats'] / 538.0

  return seats_long

In [None]:
def add_flipflag(df_input):

  """ Add, for every state and year, the previous winner and a flip flag.

      A state "flipped" in a given row when the winner of that state's previous
      row (ordered by year) differs from the current winner.

      Args:
        df_input: DataFrame with at least the columns state, year and
          winner_party. Not modified.

      Returns:
        Copy of the input sorted by state, year with two extra columns:
        prev_election_winner (NaN for a state's first row) and flip_flag
        (1 if the winner changed versus the state's previous row, else 0;
        0 when there is no previous row to compare against).
  """

  data_transform_v3 = df_input.copy().sort_values(by = ['state','year'])

  # shift within each state so the first election of a state is never compared
  # with the last election of the state sorted just before it
  previous_winner = data_transform_v3.groupby('state')['winner_party'].shift(1)
  data_transform_v3['prev_election_winner'] = previous_winner

  # add a flip flag: only a known, different previous winner counts as a flip
  flipped = previous_winner.notna() & (previous_winner != data_transform_v3['winner_party'])
  data_transform_v3['flip_flag'] = flipped.astype(int)

  return(data_transform_v3)

In [None]:
def feature_engineering(df_input,from_year,min_no_of_seats):

  """ Build the per-state features used for the swing score.

      Features: total flips | avg & std of vote share diff | electoral college
      share. Each one is min-max scaled and shifted by 0.1, so the norm_*
      columns range from 0.1 to 1.1 (not 0 to 1).

      Args:
        df_input: DataFrame with at least the columns state, year,
          no_of_seats, voteshare_perc_diff, perc_ec_seats and flip_flag.
          Not modified.
        from_year: only rows with year >= from_year are used.
        min_no_of_seats: only rows with no_of_seats >= min_no_of_seats are used.

      Returns:
        DataFrame with one row per state that flipped at least once in the
        selected rows, with the columns state, avg_voteshare_pct_diff,
        std_voteshare_pct_diff, ec_pct_share (minimum perc_ec_seats),
        total_flips and the four matching norm_* columns. If no state
        qualifies, an empty frame with the same columns is returned.
  """

  data_transform_v3 = df_input.copy()

  # eligibility filter driven by the arguments (not hard-coded thresholds)
  condition_set_1 = (data_transform_v3['year']>=from_year) & (data_transform_v3['no_of_seats']>=min_no_of_seats)

  # named aggregation gives flat, explicitly named columns in one step
  data_transform_v4 = data_transform_v3[condition_set_1].groupby('state').agg(
    avg_voteshare_pct_diff = ('voteshare_perc_diff', 'mean'),
    std_voteshare_pct_diff = ('voteshare_perc_diff', 'std'),
    ec_pct_share = ('perc_ec_seats', 'min'),
    total_flips = ('flip_flag', 'sum'))

  # at least 1 flip within the selected years
  condition_set_2 = (data_transform_v4['total_flips']>=1)

  data_transform_v4 = data_transform_v4[condition_set_2].reset_index()

  feature_cols = ['avg_voteshare_pct_diff', 'std_voteshare_pct_diff', 'ec_pct_share', 'total_flips']
  norm_cols = ['norm_' + col for col in feature_cols]

  # nothing to scale: the scaler cannot be fitted on zero rows
  if data_transform_v4.empty:
    for col in norm_cols:
      data_transform_v4[col] = pd.Series(dtype = float)
    return(data_transform_v4)

  # normalizing features (0.1 offset keeps the smallest value above zero)
  scaler = MinMaxScaler()

  data_transform_v4[norm_cols] = 0.1 + scaler.fit_transform(data_transform_v4[feature_cols])

  return(data_transform_v4)


In [None]:
def calc_swing_score(data_transform_v4,norm_avg_voteshare_pct_diff_wt,norm_std_voteshare_pct_diff_wt,norm_ec_pct_share_wt,norm_total_flips_wt):
  """ Compute the weighted swing score for every row of the feature table.

      The score is the weighted sum of four terms: one minus the normalised
      average vote share difference, one minus the normalised std of the vote
      share difference, the normalised electoral college share and the
      normalised number of flips. Each *_wt argument is the weight of the
      matching norm_* column.

      The 'swing_score' column is written onto the DataFrame that was passed
      in, and that same object is returned.
  """
  features = data_transform_v4

  # smaller margins and smaller spread make a state more "swingy", hence 1 - x
  margin_term = (1 - features['norm_avg_voteshare_pct_diff']) * norm_avg_voteshare_pct_diff_wt
  spread_term = (1 - features['norm_std_voteshare_pct_diff']) * norm_std_voteshare_pct_diff_wt

  # larger electoral weight and more flips raise the score directly
  seats_term = features['norm_ec_pct_share'] * norm_ec_pct_share_wt
  flips_term = features['norm_total_flips'] * norm_total_flips_wt

  features['swing_score'] = margin_term + spread_term + seats_term + flips_term

  return features


In [None]:
def swing_for_2024(df_input, data_transform_v5):

  """ Add a recency bias to the swing score using the 2020 results.

      Args:
        df_input: DataFrame with at least the columns year, state and
          flip_flag. Not modified.
        data_transform_v5: DataFrame with at least the columns state and
          swing_score.

      Returns:
        The year == 2020 rows of df_input, left-joined with swing_score on
        state, plus final_swing_score = flip_flag + swing_score. States that
        have no swing score get swing_score 0, so their final score equals
        their 2020 flip flag rather than NaN.
  """

  # 2020 results only
  results_2020 = df_input[df_input['year'].isin([2020])].copy()

  # attach the overall swing score to every 2020 state row
  data_transform_v6 = pd.merge(results_2020, data_transform_v5[['state','swing_score']], on = 'state', how = 'left')

  # fill by column name: a positional index silently targets the wrong column
  # when the frame layout changes, and chained inplace fillna is not reliable
  data_transform_v6['swing_score'] = data_transform_v6['swing_score'].fillna(0)

  # adding in the recency element to swing score
  data_transform_v6['final_swing_score'] = data_transform_v6['flip_flag'] + data_transform_v6['swing_score']

  return(data_transform_v6)

In [None]:
def viz_swing_states(data_transform_v8):

  """ Show an animated US choropleth of the winning party per state and year.

      Args:
        data_transform_v8: DataFrame with the columns state_po (used as the
          map location), winner_party (colour), state (hover label) and year
          (animation frame).

      The figure is displayed with ``fig.show()``; nothing is returned.
  """
  # fixed party colours
  party_colours = {'DEMOCRAT': 'blue', 'REPUBLICAN': 'red'}

  # Plotly Choropleth map with timeline
  choropleth_options = dict(
      locations="state_po",
      color="winner_party",
      hover_name="state",
      locationmode="USA-states",
      animation_frame="year",
      scope="usa",
      color_discrete_map=party_colours,
      title="Swing States in US Elections (1976-2020) Based on Swing Score",
  )
  figure = px.choropleth(data_transform_v8, **choropleth_options)

  # Update layout (sets the geo scope and the title once more)
  figure.update_layout(geo_scope='usa', title="Swing States in US Elections (1976-2020)")

  figure.show()

In [None]:
def main():
  """ Run the whole swing-state pipeline and show the map.

      Steps: load both datasets, prepare them, merge on state | year, add the
      flip flag, build the per-state features, compute the swing score, add
      the 2020 recency element and plot the ten highest-scoring states.

      Returns:
        A tuple ``(data_transform_v5, data_transform_v7)``: the per-state
        table sorted by swing_score (descending) and the 2020 rows sorted by
        final_swing_score (descending). Previously the second table was
        computed and then discarded.
  """

  # loading data
  data,election_college_wt = load_data()

  # initial transformation of the datasets
  data_transform_v1 = dataprep_presedentional_election_results(data)
  election_college_tranform_1 = dataprep_electoral_college(election_college_wt)

  # merging the dataset (adding election college seat share for every state to the main dataframe)
  data_transform_v2 = pd.merge(data_transform_v1,election_college_tranform_1,on = ['state','year'] , how = 'inner')

  # add a flip or not flip flag for every election cycle
  data_transform_v3 = add_flipflag(data_transform_v2)

  # Feature engineering
  ## conditions to be eligible for swing score calc
  from_year = 2004
  min_no_of_seats = 5
  data_transform_v4 = feature_engineering(data_transform_v3,from_year,min_no_of_seats)

  # Calc swing score
  ## assign weights
  norm_avg_voteshare_pct_diff_wt = 0.2
  norm_std_voteshare_pct_diff_wt = 0.2
  norm_ec_pct_share_wt = 0.4
  norm_total_flips_wt = 0.2
  data_transform_v4 = calc_swing_score(data_transform_v4,norm_avg_voteshare_pct_diff_wt,norm_std_voteshare_pct_diff_wt,norm_ec_pct_share_wt,norm_total_flips_wt)

  # the usual suspects across all elections
  data_transform_v5 = data_transform_v4.sort_values(by = 'swing_score' , ascending = False).reset_index(drop = True)

  # Finding swing for 2024 (adding a recency bias)
  data_transform_v6 = swing_for_2024(data_transform_v3, data_transform_v5)

  # swing for 2024, highest final score first
  data_transform_v7 = data_transform_v6.sort_values(by = ['final_swing_score'] , ascending = [False])

  # dataset for trends of swing states over the years (ten highest swing scores)
  swing_states = data_transform_v5.head(10)['state'].tolist()
  data_transform_v8 = data_transform_v3[data_transform_v3['state'].isin(swing_states)][['year','state','winner_party','state_po']].reset_index(drop = True)

  # visualise the swing states (overall)
  viz_swing_states(data_transform_v8)

  # hand both rankings back so the caller can inspect them
  return (data_transform_v5, data_transform_v7)


In [None]:
# Entry point: run the full pipeline only when this code is executed as the
# main module (not when it is imported).
if __name__ == "__main__":
    main()


  data_transform_v1 = data_s1.groupby(['year','state','state_po'],group_keys=False).apply(lambda x : pd.Series
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_transform_v6.iloc[:,11].fillna(0,inplace = True)
