In [1]:
# imports and loading clean data
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt

from IPython.display import display, Markdown

from modules.lv_utils import load_households, load_voters
from modules.ms1_utils import clean_df, show_vote_rate_and_summary, two_sample_perm_test_diff_frac_votes
from modules.ms1_utils import get_two_sample_ns, plot_hist_vote_rate_vs_field

# load the data
households = load_households('data_clean/20180725_fullset_households_district3.csv')
voters = load_voters('data_clean/20180725_fullset_voters_district3.csv')
elections = pd.read_csv('data_clean/20180621_election_data.csv')

In [2]:
voters.shape

(13307, 148)

In [3]:
print(voters.columns[0:75])
print(voters.columns[75:])

Index(['Vid', 'Abbr', 'Precinct', 'PrecinctSub', 'Party', 'PartyMain',
       'RegDate', 'PAV', 'RegDateOriginal', 'E6_110816', 'E5_060716',
       'E4_110414', 'E3_060314', 'E2_110612', 'E1_060512', 'District',
       'VScore', 'VScorePos', 'VScorePct', 'BirthYear',
       'OldestInHouseBirthYear', 'IsOldestInHouse', 'havePhone',
       'BirthPlaceState', 'BirthPlaceStateRegion', 'BirthPlaceCountry',
       'BirthPlaceCountryRegion', 'Gender', 'sameMailAddress', 'MailCountry',
       'isApt', 'Zip', 'StreetType', 'EmailProvider', 'E5_060716BT',
       'E1_060512BT', 'Hid', 'cHid', 'E34_nVotesPos', 'E34_nVotes',
       'E34_nVotesPct', 'E56_nVotesPos', 'E56_nVotes', 'E56_nVotesPct',
       'E78_nVotesPos', 'E78_nVotes', 'E78_nVotesPct', 'E12_nVotesPos',
       'E12_nVotes', 'E12_nVotesPct', 'E14_nVotesPos', 'E14_nVotes',
       'E14_nVotesPct', 'E16_nVotesPos', 'E16_nVotes', 'E16_nVotesPct',
       'Eap_nVotesPos', 'Eap_nVotes', 'Eap_nVotesPct', 'Eag_nVotesPos',
       'Eag_nVotes', 'E

In [33]:
voters.loc[:5,['E1_GndTth', 'E2_GndTth', 'E3_GndTth', 'E4_GndTth','E5_GndTth', 'E6_GndTth']]

Unnamed: 0,E1_GndTth,E2_GndTth,E3_GndTth,E4_GndTth,E5_GndTth,E6_GndTth
0,1.0,1.0,1.0,1.0,1.0,1.0
1,-1.0,1.0,0.0,1.0,1.0,1.0
2,1.0,1.0,0.0,1.0,1.0,1.0
3,0.0,0.0,0.0,0.0,-1.0,0.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [44]:
e_cols = ['E6_110816', 'E5_060716','E4_110414', 'E3_060314', 'E2_110612', 'E1_060512']
gdt_cols = ['E1_GndTth', 'E2_GndTth', 'E3_GndTth', 'E4_GndTth','E5_GndTth', 'E6_GndTth']
cols = e_cols[::-1]+gdt_cols
dfw = voters.loc[:5,cols+['Vid', 'Party', 'PartyMain', 'RegDate', 'PAV',]]
dfw

Unnamed: 0,E1_060512,E2_110612,E3_060314,E4_110414,E5_060716,E6_110816,E1_GndTth,E2_GndTth,E3_GndTth,E4_GndTth,E5_GndTth,E6_GndTth,Vid,Party,PartyMain,RegDate,PAV
0,V,V,V,V,A,A,1.0,1.0,1.0,1.0,1.0,1.0,0,DEM,DEM,1992-10-05,Y
1,,V,N,V,A,A,-1.0,1.0,0.0,1.0,1.0,1.0,1,DEM,DEM,2012-11-06,Y
2,A,V,N,A,A,A,1.0,1.0,0.0,1.0,1.0,1.0,2,DEM,DEM,2012-11-06,Y
3,N,N,N,N,,N,0.0,0.0,0.0,0.0,-1.0,0.0,3,NPP,NPP,2016-09-15,N
4,,,,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4,DEM,DEM,2018-02-13,Y
5,,,,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,5,REP,REP,2018-01-11,Y


In [45]:
# relevant question on unstack rather than stack
#https://stackoverflow.com/questions/33057946/unstack-or-pivot-only-some-columns

In [59]:
ele = ['E1','E2','E3','E4','E5','E6']
dfw_data = dfw.drop(columns=cols)
dfw_data.index.rename('vid', inplace=True)

dfw_elecs = dfw[e_cols[::-1]]
dfw_elecs.index.rename('vid', inplace=True)
dfw_elecs.columns = ele
#dfw_elecs.columns.name = 'vote'

dfw_gdt = dfw[gdt_cols]
dfw_gdt.index.rename('vid', inplace=True)
dfw_gdt.columns = ele
#dfw_gdt.columns.name = 'gndtth'

display(dfw_data)
display(dfw_elecs)
display(dfw_gdt)

Unnamed: 0_level_0,Vid,Party,PartyMain,RegDate,PAV
vid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,DEM,DEM,1992-10-05,Y
1,1,DEM,DEM,2012-11-06,Y
2,2,DEM,DEM,2012-11-06,Y
3,3,NPP,NPP,2016-09-15,N
4,4,DEM,DEM,2018-02-13,Y
5,5,REP,REP,2018-01-11,Y


Unnamed: 0_level_0,E1,E2,E3,E4,E5,E6
vid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,V,V,V,V,A,A
1,,V,N,V,A,A
2,A,V,N,A,A,A
3,N,N,N,N,,N
4,,,,,,
5,,,,,,


Unnamed: 0_level_0,E1,E2,E3,E4,E5,E6
vid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0
1,-1.0,1.0,0.0,1.0,1.0,1.0
2,1.0,1.0,0.0,1.0,1.0,1.0
3,0.0,0.0,0.0,0.0,-1.0,0.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [67]:
#pd.DataFrame(dfw_elecs.unstack(level=1).reorder_levels([1,0]))
ustk_e = pd.DataFrame(dfw_elecs.unstack(level=1))
ustk_e.columns = ['vote']
ustk_g = pd.DataFrame(dfw_gdt.unstack(level=1))
ustk_g.columns = ['gndtth']
#display(ustk_e)
#display(ustk_g)
ustk_e.join(ustk_g).join(dfw_data)

Unnamed: 0_level_0,Unnamed: 1_level_0,vote,gndtth,Vid,Party,PartyMain,RegDate,PAV
Unnamed: 0_level_1,vid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
E1,0,V,1.0,0,DEM,DEM,1992-10-05,Y
E1,1,,-1.0,1,DEM,DEM,2012-11-06,Y
E1,2,A,1.0,2,DEM,DEM,2012-11-06,Y
E1,3,N,0.0,3,NPP,NPP,2016-09-15,N
E1,4,,-1.0,4,DEM,DEM,2018-02-13,Y
E1,5,,-1.0,5,REP,REP,2018-01-11,Y
E2,0,V,1.0,0,DEM,DEM,1992-10-05,Y
E2,1,V,1.0,1,DEM,DEM,2012-11-06,Y
E2,2,V,1.0,2,DEM,DEM,2012-11-06,Y
E2,3,N,0.0,3,NPP,NPP,2016-09-15,N


# Adding in ground Truth

In [9]:
rawvs = ['E6_110816', 'E5_060716','E4_110414', 'E3_060314', 'E2_110612', 'E1_060512']
gtrs = ['E1_GndTth', 'E2_GndTth', 'E3_GndTth', 'E4_GndTth','E5_GndTth', 'E6_GndTth']
dfw = voters.loc[:5,rawvs[::-1]+gtrs+['Vid', 'Party', 'PartyMain', 'RegDate', 'PAV',]]
dfw

Unnamed: 0,E1_060512,E2_110612,E3_060314,E4_110414,E5_060716,E6_110816,E1_GndTth,E2_GndTth,E3_GndTth,E4_GndTth,E5_GndTth,E6_GndTth,Vid,Party,PartyMain,RegDate,PAV
0,V,V,V,V,A,A,1.0,1.0,1.0,1.0,1.0,1.0,0,DEM,DEM,1992-10-05,Y
1,,V,N,V,A,A,-1.0,1.0,0.0,1.0,1.0,1.0,1,DEM,DEM,2012-11-06,Y
2,A,V,N,A,A,A,1.0,1.0,0.0,1.0,1.0,1.0,2,DEM,DEM,2012-11-06,Y
3,N,N,N,N,,N,0.0,0.0,0.0,0.0,-1.0,0.0,3,NPP,NPP,2016-09-15,N
4,,,,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4,DEM,DEM,2018-02-13,Y
5,,,,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,5,REP,REP,2018-01-11,Y


In [10]:
dfw_data = dfw.drop(columns=rawvs+gtrs)
dfw_data.index.rename('vid', inplace=True)
dfw_elecs = dfw[rawvs[::-1]+gtrs]
dfw_elecs.index.rename('vid', inplace=True)
display(dfw_data)
display(dfw_elecs)

Unnamed: 0_level_0,Vid,Party,PartyMain,RegDate,PAV
vid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,DEM,DEM,1992-10-05,Y
1,1,DEM,DEM,2012-11-06,Y
2,2,DEM,DEM,2012-11-06,Y
3,3,NPP,NPP,2016-09-15,N
4,4,DEM,DEM,2018-02-13,Y
5,5,REP,REP,2018-01-11,Y


Unnamed: 0_level_0,E1_060512,E2_110612,E3_060314,E4_110414,E5_060716,E6_110816,E1_GndTth,E2_GndTth,E3_GndTth,E4_GndTth,E5_GndTth,E6_GndTth
vid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,V,V,V,V,A,A,1.0,1.0,1.0,1.0,1.0,1.0
1,,V,N,V,A,A,-1.0,1.0,0.0,1.0,1.0,1.0
2,A,V,N,A,A,A,1.0,1.0,0.0,1.0,1.0,1.0
3,N,N,N,N,,N,0.0,0.0,0.0,0.0,-1.0,0.0
4,,,,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,,,,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


Better blog/article: https://jakevdp.github.io/PythonDataScienceHandbook/03.05-hierarchical-indexing.html

#### reference code to create a multi index:

* multi_tuples =[(x,y) for x in ['local_total', 'delivery_duration'] for y in ['count','sum','% total']]
* multi_cols = pd.MultiIndex.from_tuples(multi_tuples)
* for_disp = pd.DataFrame(for_disp, columns=multi_cols)

#### code to flatten an index joining the index layer names together

* for_disp.columns = ['_'.join(col).strip() for col in for_disp.columns.values]


In [11]:
dfw_elecs.columns

Index(['E1_060512', 'E2_110612', 'E3_060314', 'E4_110414', 'E5_060716',
       'E6_110816', 'E1_GndTth', 'E2_GndTth', 'E3_GndTth', 'E4_GndTth',
       'E5_GndTth', 'E6_GndTth'],
      dtype='object')

In [12]:
elecs = ['E1', 'E2', 'E3', 'E4', 'E5', 'E6']
e1_cols = [col for col in dfw_elecs.columns if 'E1' in col]
# can remove the E1_ with
#e1_cols = [col[3:] for col in dfw_elecs.columns if 'E1' in col]
print(e1_cols)
multi_tuples =[(x,y) for x in ['E1'] for y in e1_cols]
multi_tuples

['E1_060512', 'E1_GndTth']


[('E1', 'E1_060512'), ('E1', 'E1_GndTth')]

In [13]:
index_tuples =[]
for e in elecs:
    e_cols = [col for col in dfw_elecs.columns if e in col]
    # can remove the E1_ with
    #e1_cols = [col[3:] for col in dfw_elecs.columns if 'E1' in col]
    multi_tuple =[(x,y) for x in [e] for y in e_cols]
    index_tuples += multi_tuple
index_tuples

[('E1', 'E1_060512'),
 ('E1', 'E1_GndTth'),
 ('E2', 'E2_110612'),
 ('E2', 'E2_GndTth'),
 ('E3', 'E3_060314'),
 ('E3', 'E3_GndTth'),
 ('E4', 'E4_110414'),
 ('E4', 'E4_GndTth'),
 ('E5', 'E5_060716'),
 ('E5', 'E5_GndTth'),
 ('E6', 'E6_110816'),
 ('E6', 'E6_GndTth')]

In [14]:
multi_cols = pd.MultiIndex.from_tuples(index_tuples)
for_disp = pd.DataFrame(dfw_elecs, columns=multi_cols)
for_disp

Unnamed: 0_level_0,E1,E1,E2,E2,E3,E3,E4,E4,E5,E5,E6,E6
Unnamed: 0_level_1,E1_060512,E1_GndTth,E2_110612,E2_GndTth,E3_060314,E3_GndTth,E4_110414,E4_GndTth,E5_060716,E5_GndTth,E6_110816,E6_GndTth
vid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,,,,,,,,,,,,
1,,,,,,,,,,,,
2,,,,,,,,,,,,
3,,,,,,,,,,,,
4,,,,,,,,,,,,
5,,,,,,,,,,,,


In [15]:
dfw_elecs

Unnamed: 0_level_0,E1_060512,E2_110612,E3_060314,E4_110414,E5_060716,E6_110816,E1_GndTth,E2_GndTth,E3_GndTth,E4_GndTth,E5_GndTth,E6_GndTth
vid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,V,V,V,V,A,A,1.0,1.0,1.0,1.0,1.0,1.0
1,,V,N,V,A,A,-1.0,1.0,0.0,1.0,1.0,1.0
2,A,V,N,A,A,A,1.0,1.0,0.0,1.0,1.0,1.0
3,N,N,N,N,,N,0.0,0.0,0.0,0.0,-1.0,0.0
4,,,,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,,,,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [16]:
pd.DataFrame(dfw_elecs[['E1_060512','E1_GndTth']], columns=pd.MultiIndex.from_tuples([('E1', 'E1_060512'),
 ('E1', 'E1_GndTth')]))

Unnamed: 0_level_0,E1,E1
Unnamed: 0_level_1,E1_060512,E1_GndTth
vid,Unnamed: 1_level_2,Unnamed: 2_level_2
0,,
1,,
2,,
3,,
4,,
5,,


In [17]:
pd.MultiIndex.from_tuples(index_tuples)

MultiIndex(levels=[['E1', 'E2', 'E3', 'E4', 'E5', 'E6'], ['E1_060512', 'E1_GndTth', 'E2_110612', 'E2_GndTth', 'E3_060314', 'E3_GndTth', 'E4_110414', 'E4_GndTth', 'E5_060716', 'E5_GndTth', 'E6_110816', 'E6_GndTth']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]])

In [18]:
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
data

array([[31. , 36.7, 39. , 36.4, 24. , 37.5],
       [16. , 38.6, 30. , 37.6, 30. , 36.2],
       [32. , 37.2, 33. , 37.8, 54. , 36.2],
       [19. , 36.1, 34. , 35.4, 41. , 36.7]])

In [19]:
mul_df_ex = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
mul_df_ex

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.337645,0.55609
a,2,0.37577,0.94665
b,1,0.386929,0.197696
b,2,0.303291,0.664765


In [20]:
np.random.rand(4, 2)

array([[0.78648313, 0.03692772],
       [0.1985399 , 0.24926799],
       [0.66068285, 0.35841471],
       [0.94945341, 0.09824871]])

In [21]:
mul_dfw = pd.DataFrame(dfw_elecs.values,
                        columns=[elecs+elecs, ['vote']*6 +['GndTth']*6])
mul_dfw

Unnamed: 0_level_0,E1,E2,E3,E4,E5,E6,E1,E2,E3,E4,E5,E6
Unnamed: 0_level_1,vote,vote,vote,vote,vote,vote,GndTth,GndTth,GndTth,GndTth,GndTth,GndTth
0,V,V,V,V,A,A,1,1,1,1,1,1
1,,V,N,V,A,A,-1,1,0,1,1,1
2,A,V,N,A,A,A,1,1,0,1,1,1
3,N,N,N,N,,N,0,0,0,0,-1,0
4,,,,,,,-1,-1,-1,-1,-1,-1
5,,,,,,,-1,-1,-1,-1,-1,-1
