## Election & Gender
I am looking at some general stats regarding elections and gender.

In [1]:
import os
import re

import pandas as pd
import numpy as np

from IPython.display import display

## Load Election Data

In [5]:
os.listdir("var/data")

['0717-182.zip',
 '1976-2018-house2.csv',
 '1976-2018-senate.csv',
 'codebook-us-house-1976–2018.md',
 'codebook-us-senate-1976–2018.md',
 'howarder-gender-by-name.zip',
 'name_gender.csv']

In [6]:
house_results = pd.read_csv("var/data/1976-2018-house2.csv", encoding="latin_1")
senate_results = pd.read_csv("var/data/1976-2018-senate.csv", encoding="latin_1")
name_gender = pd.read_csv("var/data/name_gender.csv")

## Show a Few Data Samples

In [27]:
display(name_gender.sample(3))
display(house_results.sample(3))
display(senate_results.sample(3))

Unnamed: 0,name,gender,probability
47449,Kently,F,1.0
53042,Laylonni,F,1.0
27341,Emilio,M,0.99842


Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,runoff,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
11441,1994,Illinois,IL,17,33,21,US House,11,gen,False,False,,,True,total,4,160395,False,20200424
5929,1984,Pennsylvania,PA,42,23,14,US House,11,gen,False,False,Paul E. Kanjorski,democrat,False,total,108430,185122,False,20200424
17333,2002,Florida,FL,12,59,43,US House,11,gen,False,False,Jim Davis,democrat,False,total,1,1,False,20200424


Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
2653,2010,Vermont,VT,50,13,6,US Senate,statewide,gen,False,Daniel Freilich,independent,False,total,3544,235178,False,20171011.0
1039,1992,Connecticut,CT,9,16,1,US Senate,statewide,gen,False,Christopher J. Dodd,democrat,False,total,577662,1500661,False,20171011.0
577,1984,Maine,ME,23,11,2,US Senate,statewide,gen,False,Elizabeth H. Mitchell,democrat,False,total,142626,551406,False,20171011.0


In [28]:
def filter_standard_election(df):
    return df[
        (df.stage == 'gen') &
        (df.runoff == False) &
        (df.special == False) &
        (df.unofficial == False)
    ]

In [35]:
election_results_std = filter_standard_election(pd.concat([house_results, senate_results], axis=0, ignore_index=True))

In [36]:
election_results_std.sample(3)

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,runoff,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
16854,2000,Texas,TX,48,74,49,US House,15,gen,False,False,Rub_n Hinojosa,democrat,False,total,106570,120448,False,20200424.0
10190,1992,Michigan,MI,26,34,23,US House,3,gen,False,False,Richard Whitelock,libertarian,False,total,3232,264948,False,20200424.0
20840,2006,Pennsylvania,PA,42,23,14,US House,12,gen,False,False,John P. Murtha,democrat,False,total,123472,203163,False,20200424.0


In [None]:
house_results_cl = house_results[
    (house_results.stage == 'gen') &
    (house_results.runoff == False) &
    (house_results.special == False) &
    (house_results.unofficial == False)
].copy()

## Infer Gender of Candidate Based on Name

In [12]:
def get_first_name(name):
    nps = []
    if not pd.isnull(name):
        for np in str(name).split():
            np = re.sub(r'[^a-zA-Z\-]', '', np)
            if len(np) > 1:
                nps.append(np)
    if len(nps) > 0:
        return nps[0]

def normalize_name(name):
    if name is None:
        return ''
    else:
        return re.sub(r'[^a-zA-Z\-]', '', name.lower())

class GenderLookup:
    
    def __init__(self, df):
        names = {}
        for _, row in df.iterrows():
            name = normalize_name(row['name'])
            if name not in names:
                names[name] = { 'gender': row.gender, 'probability': row.probability }
            else:
                print(f"WARN: duplicate name {name}")
        self.names = names
    
    def __call__(self, name):
        if name is not None:
            return self.names.get(normalize_name(name), None)
        
get_gender = GenderLookup(name_gender)

In [37]:
election_results_std['FirstName'] = election_results_std.candidate.apply(get_first_name)
election_results_std['GenderStat'] = election_results_std.FirstName.apply(get_gender)
election_results_std['Gender'] = election_results_std.GenderStat.apply(lambda gs: gs.get('gender', None) if gs is not None else None)

In [38]:
election_results_std['Gender'].value_counts(dropna=False).reset_index()

Unnamed: 0,index,Gender
0,M,16268
1,F,2468
2,,2176


## Vote Percent and Party Stats

I was looking to see if there is correlation between change in how a district typically votes and a change in gender.
Need to explore this more.

In [39]:
election_results_std['VotePercent'] = 100 * election_results_std['candidatevotes'] / house_results_cl['totalvotes']

In [40]:
def get_party_stats(df):
    return df.groupby('party').agg({'VotePercent': ['mean', 'std', 'size']})

vote_percents = (election_results_std
    .groupby(['state', 'district'])
    .apply(get_party_stats)
    .reset_index()
    .rename(columns={'VotePercent': 'VotePercentStats'})
)

vote_percents.columns = ["-".join(cp for cp in c if len(cp) > 0) for c in vote_percents.columns]

In [41]:
election_results_std = pd.merge(
    election_results_std,
    vote_percents,
    on=['state', 'district', 'party']
)

In [44]:
election_results_std['VotePercentDiffDistrictPartyAvg'] = election_results_std['VotePercent'] - election_results_std['VotePercentStats-mean']

In [47]:
def get_gender_no(g):
    if g == 'M':
        return 1
    elif g == 'F':
        return 0
    else:
        return None
    
election_results_std['GenderNo'] = election_results_std.Gender.apply(get_gender_no)

In [48]:
election_results_std[election_results_std.GenderNo.notnull()][['GenderNo', 'VotePercentDiffDistrictPartyAvg']].corr()

Unnamed: 0,GenderNo,VotePercentDiffDistrictPartyAvg
GenderNo,1.0,0.018525
VotePercentDiffDistrictPartyAvg,0.018525,1.0
