## Functions to identify winners and filter only for races we have winners

---
- Only requires the candidate files
- Uses the 'Incumbent' tag from subsequent race to identfy winner in current race (assumes they run again, data leakage)
- Filter out races in years we don't have an identified winner from method above

In [1]:
pip install fuzzywuzzy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from pyspark.sql import SparkSession
import pyspark.sql.types as typ
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from functools import reduce
from pyspark.sql.functions import col, asc
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType, DoubleType, DateType
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [3]:
spark = SparkSession \
    .builder \
    .getOrCreate()

sc = spark.sparkContext

### Step 1: Load in the multi-year candidate files

In [4]:
def parse_file(file_name, file_header):
    '''Tool to load in an FEC file with associated header and produce a Spark Dataframe'''   

    #load data frame and split by delimiter  
    df_temp = parse_file(file_name).map(lambda row: [elem for elem in row.split('|')])
    
    #load separate header csv file and prepare schema 
    #assumes all values are STRING TYPE for simplicity
    head_temp = pd.read_csv(file_header)
    fields = [*[typ.StructField(h[:], typ.StringType(), True) for h in head_temp.columns]]
    schema = typ.StructType(fields)
    
    #finalize spark dataframe
    df = spark.createDataFrame(df_temp, schema)
    
    return df

In [5]:
#Taking all of the candidate files, reading them through parse_file and joining into a single table

#df_candidate_20 = parse_file('candidate_2020.txt', 'candidate_header_file.csv')
#df_candidate_18 = parse_file('candidate_2018.txt', 'candidate_header_file.csv')
#df_candidate_16 = parse_file('candidate_2016.txt', 'candidate_header_file.csv')


#updated based on the file structure in the repo
df_candidate_20 = parse_file('./fec_data/cn/cn20.txt', './fec_data/cn/cn_header_file.csv')
df_candidate_18 = parse_file('./fec_data/cn/cn18.txt', './fec_data/cn/cn_header_file.csv')
df_candidate_16 = parse_file('./fec_data/cn/cn16.txt', './fec_data/cn/cn_header_file.csv')


dfs = [df_candidate_20, df_candidate_18, df_candidate_16]

df_candidate = reduce(DataFrame.unionAll, dfs)

#converting to a pandas DF for ease of working with
df_candidate = df_candidate.select('*').toPandas() 

In [6]:
len(df_candidate)

22275

### Step 2: Identify WINNERS in a race

In [7]:
def identify_candidates(df, year_race, year_incumbent):
    '''
    df = dataframe, created in the step above
    year_race = string, year of the actual race where we will identify the winner (e.g., '2018')
    year_incumbent = string, election year AFTER the actual race where winner will be incumbent (e.g., '2020')    
    
    output is dataframe
    '''
    ##WINNERS
    #identify the incumbents based on year and identifier 'I' = incumbent in the documentation
    df_winners = df[(df.CAND_ELECTION_YR == year_incumbent) & (df.CAND_ICI == 'I') & (df.CAND_OFFICE != 'P')]
    #there are duplicates listed based on certain changes in status, but we only care about the unique candidate IDs who are incumbents
    df_winners = df_winners.drop_duplicates(subset = ['CAND_ID']).reset_index(drop=True)
    
    
    ##CANDIDATE
    #identify the pool of candidates in the year of the actual race
    df_candidates = df[(df.CAND_ELECTION_YR == year_race)&(df.CAND_OFFICE != 'P')]
    #same issue with duplicates
    df_candidates = df_candidates.drop_duplicates(subset = ['CAND_ID']).reset_index(drop=True)
    
    
    ##MATCHING WINNERS TO CANDIDATES
    #naming issues between years, use FUZZYWUZZY to match name strings from winner, candidate DFs
    df_candidates['WINNER'] = 0

    for name in df_candidates.CAND_NAME:
        ratio = process.extract(name, df_winners.CAND_NAME, limit = 1, scorer=fuzz.token_set_ratio)
          
        #if the match is deemed 100% (typically due to just periods, abbreviations), consider them a winner
        if ratio[0][1] == 100:
            df_candidates.loc[(df_candidates['CAND_NAME'] == name),'WINNER'] = 1
    
    ##FILTERING CANDIDATES
    #steps to filter out races we don't have a winner; use the unique office position to filter
    df_candidates['OFFICE_CONCAT'] = df_candidates['CAND_OFFICE_ST']+df_candidates['CAND_OFFICE']+df_candidates['CAND_OFFICE_DISTRICT']
    office_list = [office for office in df_candidates.loc[df_candidates['WINNER'] == 1].OFFICE_CONCAT.unique()]
    
    df_final = df_candidates.loc[df_candidates['OFFICE_CONCAT'].isin(office_list)]
    
    return df_final
    

In [8]:
df_2018 = identify_candidates(df_candidate, '2018', '2020')

In [9]:
df_2018.groupby('OFFICE_CONCAT').count()

Unnamed: 0_level_0,CAND_ID,CAND_NAME,CAND_PTY_AFFILIATION,CAND_ELECTION_YR,CAND_OFFICE_ST,CAND_OFFICE,CAND_OFFICE_DISTRICT,CAND_ICI,CAND_STATUS,CAND_PCC,CAND_ST1,CAND_ST2,CAND_CITY,CAND_ST,CAND_ZIP,WINNER
OFFICE_CONCAT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AKH00,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
ALH01,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
ALH02,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
ALH03,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
ALH04,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WIH08,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
WVH01,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
WVH02,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
WVH03,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13


### Step 3: CONCATENATE Multiple Years of Data

In [10]:
temp_df2018 = identify_candidates(df_candidate,'2018','2020')
temp_df2016 = identify_candidates(df_candidate,'2016','2018')

df_combined = pd.concat([temp_df2016,temp_df2018])

In [30]:
df_combined.sort_values(by=['WINNER'], ascending=True)

Unnamed: 0,CAND_ID,CAND_NAME,CAND_PTY_AFFILIATION,CAND_ELECTION_YR,CAND_OFFICE_ST,CAND_OFFICE,CAND_OFFICE_DISTRICT,CAND_ICI,CAND_STATUS,CAND_PCC,CAND_ST1,CAND_ST2,CAND_CITY,CAND_ST,CAND_ZIP,WINNER,OFFICE_CONCAT
0,H0CA15148,"HONDA, MIKE",DEM,2016,CA,H,17,I,P,C00351379,"C/O CONTRIBUTION SOLUTIONS, LLC","1346 THE ALAMEDA, STE. 7-380",SAN JOSE,CA,95126,0,CAH17
1090,H8TX17123,"STURM, SCOTT 1974",DEM,2018,TX,H,17,C,N,C00647867,328 CARDINAL DRIVE,,NEW BRAUNFELS,TX,78130,0,TXH17
1089,H8TX16141,"CHAVEZ, NORMA PRISCILLA MS.",DEM,2018,TX,H,16,O,P,C00663443,824 BOLIVIA ST,,EL PASO,TX,79903,0,TXH16
1088,H8TX16133,"SEEBERGER, RICK",REC,2018,TX,H,16,O,P,C00662379,6767 GATO ROAD,,EL PASO,TX,79932,0,TXH16
1087,H8TX16117,"CARRILLO, JOHN RENE",DEM,2018,TX,H,16,O,N,C00655100,9333 SHAVER DR.,,EL PASO,TX,79925,0,TXH16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1210,H2TX23082,"CUELLAR, HENRY",DEM,2016,TX,H,28,I,C,C00371302,1519 WASHINGTON STREET,SUITE 200,LAREDO,TX,78040,1,TXH28
1212,H2TX26093,"BURGESS, MICHAEL C. DR.",REP,2016,TX,H,26,I,C,C00372532,PO BOX 2334,,DENTON,TX,762022334,1,TXH26
1214,H2TX27190,"VELA, FILEMON MR.",DEM,2016,TX,H,34,I,C,C00513531,275 CALLE JACARANDA ST,,BROWNSVILLE,TX,78520,1,TXH34
1201,H2TX03118,"JOHNSON, SAM MR.",REP,2016,TX,H,03,I,C,C00250720,7105 HAVENCREST,,PLANO,TX,75074,1,TXH03


In [12]:
df_combined.WINNER.sum()

867

In [13]:
len(df_combined)

5517

### Step 4: CONCATENATE Candidate-committee linkages


In [55]:
#updated based on the file structure in the repo
df_candidate_ccl20 = parse_file('./fec_data/ccl/ccl20.txt', './fec_data/ccl/ccl_header_file.csv')
df_candidate_ccl18 = parse_file('./fec_data/ccl/ccl18.txt', './fec_data/ccl/ccl_header_file.csv')
df_candidate_ccl16 = parse_file('./fec_data/ccl/ccl16.txt', './fec_data/ccl/ccl_header_file.csv')


dfs_ccl = [df_candidate_ccl20, df_candidate_ccl18, df_candidate_ccl16]

df_candidate_ccl = reduce(DataFrame.unionAll, dfs_ccl)
df_candidate_ccl = df_candidate_ccl.select('*').toPandas() 
#converting to a pandas DF for ease of working with
print(type(df_candidate_ccl))

<class 'pandas.core.frame.DataFrame'>


In [15]:
len(df_candidate_ccl)

20439

In [45]:
#df_candidate_ccl.collect()

In [17]:
df_combined.merge(df_candidate_ccl,how='outer',left_on=['CAND_ID'],right_on=['CAND_ID'])


Unnamed: 0,CAND_ID,CAND_NAME,CAND_PTY_AFFILIATION,CAND_ELECTION_YR_x,CAND_OFFICE_ST,CAND_OFFICE,CAND_OFFICE_DISTRICT,CAND_ICI,CAND_STATUS,CAND_PCC,...,CAND_ST,CAND_ZIP,WINNER,OFFICE_CONCAT,CAND_ELECTION_YR_y,FEC_ELECTION_YR,CMTE_ID,CMTE_TP,CMTE_DSGN,LINKAGE_ID
0,H0CA15148,"HONDA, MIKE",DEM,2016,CA,H,17,I,P,C00351379,...,CA,95126,0.0,CAH17,2016,2020,C00351379,H,P,222331
1,H0CA15148,"HONDA, MIKE",DEM,2016,CA,H,17,I,P,C00351379,...,CA,95126,0.0,CAH17,2016,2018,C00351379,H,P,212473
2,H0CA15148,"HONDA, MIKE",DEM,2016,CA,H,17,I,P,C00351379,...,CA,95126,0.0,CAH17,2016,2016,C00351379,H,P,201923
3,H0CT02140,"NOVAK, DARIA IRENE",REP,2016,CT,H,02,C,P,C00589713,...,CT,06443,0.0,CTH02,2016,2020,C00589713,H,P,223315
4,H0CT02140,"NOVAK, DARIA IRENE",REP,2016,CT,H,02,C,P,C00589713,...,CT,06443,0.0,CTH02,2016,2018,C00589713,H,P,212507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23998,S8SD00024,,,,,,,,,,...,,,,,2014,2016,C00554519,S,P,205724
23999,S8VA00222,,,,,,,,,,...,,,,,2008,2016,C00435131,S,P,205728
24000,S8VA00230,,,,,,,,,,...,,,,,2012,2016,C00494187,S,P,205729
24001,S8VA00255,,,,,,,,,,...,,,,,2008,2016,C00441261,S,P,205730


### Step 5: CONCATENATE Contributions from committees to candidates & individual expenditures


In [51]:
#updated based on the file structure in the repo
df_candidate_ccl20 = parse_file('./fec_data/pas2/pas220.txt', './fec_data/pas2/pas2_header_file.csv')
df_candidate_ccl18 = parse_file('./fec_data/pas2/pas218.txt', './fec_data/pas2/pas2_header_file.csv')
df_candidate_ccl16 = parse_file('./fec_data/pas2/pas216.txt', './fec_data/pas2/pas2_header_file.csv')

dfs_com = [df_candidate_ccl20, df_candidate_ccl18, df_candidate_ccl16]

dfs_com = reduce(DataFrame.unionAll, dfs_com)
#ccl_sub = df_candidate_ccl.select(['CMTE_ID', 'CITY', 'STATE','ZIP_CODE','TRANSACTION_AMT', 'TRANSACTION_DT'])
#ccl_sub.head(30)
t = dfs_com.select('*').toPandas() 

print(t.head())
#converting to a pandas DF for ease of working with
#df_candidate_ccl = ccl_sub.select('*').toPandas() 

     CMTE_ID AMNDT_IND RPT_TP TRANSACTION_PGI           IMAGE_NUM  \
0  C00567180         T    TER           P2020  201901099143774199   
1  C00104885         A    TER           G2020  201901289144031511   
2  C00104885         A    TER           P2022  201901289144031512   
3  C00104885         A    TER           P2020  201901289144031511   
4  C00688408         T    TER           G2018  201901319144305867   

  TRANSACTION_TP ENTITY_TP                    NAME         CITY STATE  ...  \
0            24K       PAC   TED YOHO FOR CONGRESS  GAINESVILLE    FL  ...   
1            24K       CCM         TEAM GRAHAM INC     COLUMBIA    SC  ...   
2            24K       CCM    TIM SCOTT FOR SENATE   CHARLESTON    SC  ...   
3            24K       CCM  FRIENDS OF JIM CLYBURN     COLUMBIA    SC  ...   
4            24E       ORG        JACKSON ADVOCATE      JACKSON    MS  ...   

  OCCUPATION TRANSACTION_DT TRANSACTION_AMT   OTHER_ID    CAND_ID     TRAN_ID  \
0                  01082019        

In [56]:
t = t.merge(df_candidate_ccl,how='outer',left_on=['CMTE_ID'],right_on=['CMTE_ID '])
print(t.head(5))

KeyError: 'CMTE_ID '

In [19]:
print(df_candidate_ccl)

UnionRDD[53] at union at NativeMethodAccessorImpl.java:0


In [None]:
df_candidate_ccl.sort_values(by=['TRANSACTION_AMT'], ascending=False)