In [1]:
import pandas as pd
historical_senate = pd.read_csv('beastModeData/senate_polls_historical.csv')
historical_senate

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,election_date,stage,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,party,answer,candidate_id,candidate_name,pct
0,81762,1250,Trafalgar Group,,,Trafalgar Group,338,Trafalgar Group,0.7,0.6,...,12/6/22,runoff,False,False,,DEM,Warnock,19086,Raphael Warnock,51.1
1,81762,1250,Trafalgar Group,,,Trafalgar Group,338,Trafalgar Group,0.7,0.6,...,12/6/22,runoff,False,False,,REP,Walker,19088,Herschel Junior Walker,47.4
2,81760,1515,Data for Progress,,,Data for Progress,522,Data for Progress,2.6,-1.2,...,12/6/22,runoff,False,False,,DEM,Warnock,19086,Raphael Warnock,51.0
3,81760,1515,Data for Progress,,,Data for Progress,522,Data for Progress,2.6,-1.2,...,12/6/22,runoff,False,False,,REP,Walker,19088,Herschel Junior Walker,49.0
4,81759,235,InsiderAdvantage,195,Fox 5 Atlanta,InsiderAdvantage,243,InsiderAdvantage,2.0,-0.3,...,12/6/22,runoff,False,False,,DEM,Warnock,19086,Raphael Warnock,50.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6653,53946,290,MassINC Polling Group,83,WBUR,MassINC Polling Group,198,MassINC Polling Group,2.8,-0.8,...,11/6/18,general,False,False,,REP,Diehl,12481,Geoff Diehl,29.0
6654,52630,610,Texas Lyceum,,,Texas Lyceum,431,Texas Lyceum,,,...,11/6/18,general,False,False,,DEM,O'Rourke,11125,Beto O'Rourke,30.0
6655,52630,610,Texas Lyceum,,,Texas Lyceum,431,Texas Lyceum,,,...,11/6/18,general,False,False,,REP,Cruz,11126,Ted Cruz,30.0
6656,52643,988,Harper Polling,,,Harper Polling,132,Harper Polling,,-0.3,...,11/6/18,general,False,False,,DEM,Manchin,11132,"Joe Manchin, III",57.0


In [2]:
import re

wanted_variables = ['candidate_name','party','cycle','office_type','race_id', 'state','pollscore', 'pct','created_at']
poll = historical_senate[wanted_variables]

poll.loc[:, 'created_at'] = pd.to_datetime(poll['created_at'], format="%m/%d/%y %H:%M")
poll = poll.sort_values(by='created_at', ascending=False)

# Drop duplicates based on 'CAND_ID', keeping the latest row
poll = poll.drop_duplicates(subset=['candidate_name'], keep='first')
poll = poll.reset_index(drop=True).drop('created_at', axis=1)

# Normalization Function
def normalize_name(name):
    # Convert to lowercase
    name = name.lower()
    # Strip whitespace
    name = name.strip()
    # Remove special characters (optional)
    name = re.sub(r'[^a-zA-Z\s]', '', name)  # Removes punctuation and special characters
    return name

poll.loc[:, 'candidate_name'] = poll['candidate_name'].apply(normalize_name)

poll

Unnamed: 0,candidate_name,party,cycle,office_type,race_id,state,pollscore,pct
0,herschel junior walker,REP,2022,U.S. Senate,8925,Georgia,-0.8,46.5
1,catherine cortez masto,DEM,2022,U.S. Senate,8937,Nevada,0.6,45.4
2,raphael warnock,DEM,2022,U.S. Senate,8925,Georgia,0.7,59.0
3,mehmet oz,REP,2022,U.S. Senate,8945,Pennsylvania,0.7,47.0
4,john fetterman,DEM,2022,U.S. Senate,8945,Pennsylvania,0.7,53.0
...,...,...,...,...,...,...,...,...
432,jason shelton,DEM,2018,U.S. Senate,130,Mississippi,0.0,6.2
433,joe arpaio,REP,2018,U.S. Senate,96,Arizona,0.2,35.0
434,don blankenship,IND,2018,U.S. Senate,126,West Virginia,0.0,11.0
435,aubrey dunn,LIB,2018,U.S. Senate,114,New Mexico,0.4,5.3


In [3]:
# Group by the 'cycle' column and create a dictionary of DataFrames
cycle_dataframes = {cycle: group for cycle, group in poll.groupby('cycle')}

In [4]:
# Group candidates by race_id for each cycle DataFrame
grouped_candidates = {}
for cycle, group in cycle_dataframes.items():
    grouped_candidates[cycle] = group.groupby('race_id').agg({
        'candidate_name': list,  # Aggregate names into lists
        'party': 'first',        # Take the first party
        'cycle': 'first',        # Take the first cycle (same for all in the group)
        'office_type': 'first',  # Take the first office_type (same for all in the group)
        'state': 'first',
        'pollscore': list,
        'pct': list
    })
# Build separate DataFrames for each cycle
dataframes_by_cycle = {}
for cycle, grouped in grouped_candidates.items():
    dataframes_by_cycle[cycle] = grouped

In [5]:
dataframes_by_cycle[2022]

Unnamed: 0_level_0,candidate_name,party,cycle,office_type,state,pollscore,pct
race_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8917,"[will boyd, katie britt, john sophocleus]",DEM,2022,U.S. Senate,Alabama,"[-1.1, -1.1, -1.1]","[27.5, 57.1, 6.3]"
8918,"[kelly c tshibaka, lisa murkowski, buzz a kell...",REP,2022,U.S. Senate,Alaska,"[-0.1, -0.1, -0.1, -0.1, 0.7, 0.7, -0.1, -0.1,...","[44.5, 55.5, 3.9, 16.4, 16.0, 4.0, 4.6, 20.0, ..."
8919,"[marc victor, blake masters, mark kelly, mark ...",LIB,2022,U.S. Senate,Arizona,"[0.1, 0.1, 0.1, 0.4, 0.4, -1.2, -0.4, -0.4, -0...","[2.0, 47.0, 50.0, 40.0, 34.0, 47.0, 37.0, 36.0..."
8920,"[kenneth cates, natalie james, john boozman]",LIB,2022,U.S. Senate,Arkansas,"[-0.4, -0.4, -0.4]","[4.6, 25.8, 54.6]"
8921,[alex padilla],DEM,2022,U.S. Senate,California,[-0.5],[53.0]
8922,"[joe odea, michael bennet, brian peotter, fran...",REP,2022,U.S. Senate,Colorado,"[0.3, 0.3, -1.2, -1.1, -1.1, -0.5, -0.3, -0.5,...","[43.0, 51.0, 2.0, 0.5, 1.2, 37.0, 40.1, 35.0, ..."
8923,"[leora r levy, richard blumenthal, themis klar...",REP,2022,U.S. Senate,Connecticut,"[1.8, 1.8, 1.6, -1.1]","[26.0, 51.0, 34.0, 35.1]"
8924,"[val demings, marco rubio, dennis misigoy, ste...",DEM,2022,U.S. Senate,Florida,"[-1.2, -1.2, -1.2, -1.0, -1.0, 0.7, 0.4]","[43.0, 55.0, 1.0, 1.4, 0.6, 5.0, 37.0]"
8925,"[herschel junior walker, raphael warnock, chas...",REP,2022,U.S. Senate,Georgia,"[-0.8, 0.7, 0.1, -0.2, 0.0, nan]","[46.5, 59.0, 3.0, 40.7, 44.0, 45.2]"
8928,"[tammy duckworth, kathy salvi, bill redpath]",DEM,2022,U.S. Senate,Illinois,"[-0.5, -0.5, -1.1]","[58.0, 40.0, 2.1]"


In [6]:
# Read the file into a DataFrame, specifying the delimiter as '|'
df18 = pd.read_csv('beastModeData/fecData/weball18.txt', delimiter='|', header=None)
df20 = pd.read_csv('beastModeData/fecData/weball20.txt', delimiter='|', header=None)
df22 = pd.read_csv('beastModeData/fecData/weball22.txt', delimiter='|', header=None)

In [7]:
df18.columns = [
    'CAND_ID', 'CAND_NAME', 'CAND_ICI', 'PTY_CD', 'CAND_PTY_AFFILIATION',
    'TTL_RECEIPTS', 'TRANS_FROM_AUTH', 'TTL_DISB', 'TRANS_TO_AUTH',
    'COH_BOP', 'cash_on_hand', 'CAND_CONTRIB', 'CAND_LOANS', 'OTHER_LOANS',
    'CAND_LOAN_REPAY', 'OTHER_LOAN_REPAY', 'DEBTS_OWED_BY', 'TTL_INDIV_CONTRIB',
    'CAND_OFFICE_ST', 'CAND_OFFICE_DISTRICT', 'SPEC_ELECTION', 'PRIM_ELECTION',
    'RUN_ELECTION', 'GEN_ELECTION', 'GEN_ELECTION_PRECENT', 'OTHER_POL_CMTE_CONTRIB',
    'POL_PTY_CONTRIB', 'CVG_END_DT', 'INDIV_REFUNDS', 'CMTE_REFUNDS'
]
# Normalization Function to rearrange names
def normalize_full_name(name):
    # Remove special characters and extra spaces
    name = re.sub(r'[^a-zA-Z\s,]', '', name)  # Remove any unwanted characters
    name = name.strip()
    
    # Split by comma and then by spaces
    parts = name.split(',')
    
    if len(parts) == 2:
        last_name = parts[0].strip()  # Get the last name
        first_middle = parts[1].strip()  # Get the first and middle names
        
        # Combine them in the desired order
        normalized_name = f"{first_middle} {last_name}"  # 'first middle last'
    else:
        normalized_name = name  # Fallback if the format is unexpected
    
    return normalized_name.lower()



# Apply the conversion function to df1
df18['CAND_NAME'] = df18['CAND_NAME'].apply(normalize_full_name)
df18

Unnamed: 0,CAND_ID,CAND_NAME,CAND_ICI,PTY_CD,CAND_PTY_AFFILIATION,TTL_RECEIPTS,TRANS_FROM_AUTH,TTL_DISB,TRANS_TO_AUTH,COH_BOP,...,SPEC_ELECTION,PRIM_ELECTION,RUN_ELECTION,GEN_ELECTION,GEN_ELECTION_PRECENT,OTHER_POL_CMTE_CONTRIB,POL_PTY_CONTRIB,CVG_END_DT,INDIV_REFUNDS,CMTE_REFUNDS
0,H8AK00132,dimitri shein,C,1,DEM,209916.04,0.00,209574.16,0.0,0.00,...,,,,,,0.00,0.0,12/31/2018,0.00,0.0
1,H6AK00045,donald e young,I,2,REP,1234680.31,0.00,1387687.05,0.0,269726.86,...,,,,,,559861.90,0.0,12/31/2018,2700.00,500.0
2,H8AK00116,gabrielle r ledoux,C,2,REP,0.00,0.00,540033.00,0.0,479.00,...,,,,,,0.00,0.0,09/30/2018,0.00,0.0
3,H8AK01031,thomas john nelson,C,2,REP,9288.48,0.00,8821.97,0.0,0.00,...,,,,,,0.00,0.0,12/31/2018,600.00,0.0
4,H8AK00140,alyse galvin,C,3,IND,1949643.68,154.70,1943398.59,0.0,0.00,...,,,,,,114833.97,0.0,12/31/2018,8166.36,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3682,S4WY00097,charles e hardy,C,2,REP,9058.00,0.00,8891.00,0.0,0.00,...,,,,,,0.00,0.0,12/31/2018,0.00,0.0
3683,S4WY00147,bryan edward miller,C,2,REP,0.00,0.00,0.00,0.0,0.00,...,,,,,,0.00,0.0,12/31/2018,0.00,0.0
3684,S6WY00068,john a barrasso,I,2,REP,5667279.48,256656.69,4458971.93,0.0,2749487.00,...,,,,,,2555865.00,0.0,12/31/2018,55404.00,47500.0
3685,S6WY00126,michael b enzi,I,2,REP,232162.96,0.00,332047.38,0.0,580995.00,...,,,,,,190500.00,0.0,12/31/2018,0.00,0.0


In [11]:
duplicate_counts = df18['CAND_NAME'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
total_duplicates = duplicates.sum()
duplicates

CAND_NAME
roque rocky de la fuente    4
james courage singer        2
steve williams              2
loretta sanchez             2
shak hill                   2
                           ..
robert marshall             2
james veltmeyer             2
steven d welch              2
danny tarkanian             2
annette teijeiro            2
Name: count, Length: 68, dtype: int64

In [None]:
matching_candidates = df18['CAND_NAME'].isin(poll['candidate_name'])
sum(matching_candidates)

In [None]:
df20.columns = [
    'CAND_ID', 'CAND_NAME', 'CAND_ICI', 'PTY_CD', 'CAND_PTY_AFFILIATION',
    'TTL_RECEIPTS', 'TRANS_FROM_AUTH', 'TTL_DISB', 'TRANS_TO_AUTH',
    'COH_BOP', 'cash_on_hand', 'CAND_CONTRIB', 'CAND_LOANS', 'OTHER_LOANS',
    'CAND_LOAN_REPAY', 'OTHER_LOAN_REPAY', 'DEBTS_OWED_BY', 'TTL_INDIV_CONTRIB',
    'CAND_OFFICE_ST', 'CAND_OFFICE_DISTRICT', 'SPEC_ELECTION', 'PRIM_ELECTION',
    'RUN_ELECTION', 'GEN_ELECTION', 'GEN_ELECTION_PRECENT', 'OTHER_POL_CMTE_CONTRIB',
    'POL_PTY_CONTRIB', 'CVG_END_DT', 'INDIV_REFUNDS', 'CMTE_REFUNDS'
]

# Apply the conversion function to df1
df20['CAND_NAME'] = df20['CAND_NAME'].apply(normalize_full_name)
df20

In [None]:
matching_candidates = df20['CAND_NAME'].isin(poll['candidate_name'])
sum(matching_candidates)

In [None]:
df22.columns = [
    'CAND_ID', 'CAND_NAME', 'CAND_ICI', 'PTY_CD', 'CAND_PTY_AFFILIATION',
    'TTL_RECEIPTS', 'TRANS_FROM_AUTH', 'TTL_DISB', 'TRANS_TO_AUTH',
    'COH_BOP', 'cash_on_hand', 'CAND_CONTRIB', 'CAND_LOANS', 'OTHER_LOANS',
    'CAND_LOAN_REPAY', 'OTHER_LOAN_REPAY', 'DEBTS_OWED_BY', 'TTL_INDIV_CONTRIB',
    'CAND_OFFICE_ST', 'CAND_OFFICE_DISTRICT', 'SPEC_ELECTION', 'PRIM_ELECTION',
    'RUN_ELECTION', 'GEN_ELECTION', 'GEN_ELECTION_PRECENT', 'OTHER_POL_CMTE_CONTRIB',
    'POL_PTY_CONTRIB', 'CVG_END_DT', 'INDIV_REFUNDS', 'CMTE_REFUNDS'
]

# Apply the conversion function to df1
df22['CAND_NAME'] = df22['CAND_NAME'].apply(normalize_full_name)
df22

In [None]:
matching_candidates = df22['CAND_NAME'].isin(poll['candidate_name'])
sum(matching_candidates)