# data_to_analysis.py

- From Data to Matched Analysis

This code chunk contains two processes. First, we go from the datasets to a match with a certain election, and from there to a match with the nearest winner. That dataset is then used to find the wealth of those politicians, after which we can compute an estimate of the average winner and average loser wealth.

- From Data to Unconditional Analysis

Secondly, we start again from the datasets, and we go to margins. This dataset is merged with a dataset of politicians with the margins (conditional on the margin being <$x$%, but not necessarily from the same elections).

In [218]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## From Data to Matched Analysis

First step: 

- Import the data

In [246]:
# Load the two hand-entered data files; the first column of the primary file
# is dropped by position.
data1 = pd.read_csv("../Administration/new_data_entry_file.csv")
data1 = data1.iloc[:, 1:]
data2 = pd.read_csv("../Administration/new_data_entry_file_lessclose.csv")

# Stack both files and keep only the rows whose 'Gevonden' flag equals 'ja'.
nonpoliticians = pd.concat([data1, data2])
nonpoliticians = nonpoliticians.loc[nonpoliticians['Gevonden'] == 'ja']

# 'jaar' is the third '/'-separated field of the election date. It is kept as
# a string here (no integer conversion is applied).
nonpoliticians['jaar'] = [
    date.split('/')[2] for date in nonpoliticians['Verkiezingdatum'].astype(str)
]


Second step:

- Import the `allelected.csv` dataset
- Then, left merge the nonpoliticians data with the `allelected.csv` data
    - This choice: we leave 'double' matches in the dataset: one nonpolitician can have two or more politician matches
    - Detailed approach: first, match on exact election date
    - If that doesn't work, match on election year

In [321]:
def decode_accents(name):
    """Repair accented characters in a name that was decoded incorrectly.

    The value is round-tripped through
    ``latin-1 -> raw_unicode_escape -> latin-1 -> utf-8``. Text whose UTF-8
    bytes were read as Latin-1 comes back with its accents restored. This is
    a best-effort repair: whenever any step of the round trip fails, the
    input is returned untouched.

    Args:
        name: The value to repair, normally a ``str``.

    Returns:
        The repaired string, or ``name`` itself (unchanged) when the round
        trip is not possible, e.g. the text already contains correct
        accents, contains characters outside Latin-1, or is not a string
        at all (such as a missing value).
    """
    try:
        return (
            name.encode('latin-1')
            .decode('raw_unicode_escape')
            .encode('latin-1')
            .decode('utf-8')
        )
    except (UnicodeError, AttributeError):
        # UnicodeError covers every encode/decode failure in the chain;
        # AttributeError covers non-string inputs that have no ``.encode``.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
        return name

In [322]:
# Per-candidate election results (one row per candidate per election).
all_candidates_elections = pd.read_csv(
    "../Data/elections/election_results_details.csv",
    encoding='utf-8-sig',
)
# Repair mis-decoded accented characters in the candidate names.
all_candidates_elections["Naam"] = all_candidates_elections["Naam"].apply(decode_accents)
print(all_candidates_elections.shape)
all_candidates_elections.head(2)

Unnamed: 0.1,Unnamed: 0,Naam,Aanbevolen door,Aantal stemmen,Procentueel,District,Verkiezingdatum,Type,Omvang electoraat,Opkomst,Aantal stembriefjes,Aantal stemmen geldig,Aantal stemmen blanco,Aantal zetels,Kiesdrempel
0,0,mr. B. Wichers,,700,83.73%,Groningen,30/11/1848,algemeen,1191,838,838,836,0,1,418
1,1,W.L. de Sturler,,34,4.07%,Groningen,30/11/1848,algemeen,1191,838,838,836,0,1,418


In [323]:
# Load the `allelected.csv` dataset, decoded as Latin-1. The cells below use
# it as the source of the winners of each election.
allelected = pd.read_csv("../Data/elections/allelected.csv", encoding='latin1')
print(allelected.shape)  # (number of rows, number of columns)
allelected.head(2)

Unnamed: 0,achternaam,voornaam,tussenvoegsel,jaar,maand,dag,type verkiezing,districtsnaam,aantal stemmen,omvang_electoraat,zetels,drempel
0,Aalberse,P.J.M.,,1903,2,18,tussentijds,Almelo,3821,7865,1.0,2953.0
1,Aalberse,P.J.M.,,1905,6,16,algemeen,Almelo,5217,9324,1.0,3922.0


In [324]:
# Full name as "<voornaam> <achternaam>", with every double space replaced by
# a single one so that it can be compared against the candidate names.
allelected["naam"] = [
    (first + " " + last).replace("  ", " ")
    for first, last in zip(allelected["voornaam"], allelected["achternaam"])
]
allelected.head(2)

Unnamed: 0,achternaam,voornaam,tussenvoegsel,jaar,maand,dag,type verkiezing,districtsnaam,aantal stemmen,omvang_electoraat,zetels,drempel,naam
0,Aalberse,P.J.M.,,1903,2,18,tussentijds,Almelo,3821,7865,1.0,2953.0,P.J.M. Aalberse
1,Aalberse,P.J.M.,,1905,6,16,algemeen,Almelo,5217,9324,1.0,3922.0,P.J.M. Aalberse


In [325]:
# 'Verkiezingdatum' is formatted as day/month/year. Split each date once and
# store the three parts as integer columns (created in the order
# jaar, maand, dag).
split_dates = [date.split("/") for date in all_candidates_elections["Verkiezingdatum"]]
all_candidates_elections["jaar"] = [int(parts[2]) for parts in split_dates]
all_candidates_elections["maand"] = [int(parts[1]) for parts in split_dates]
all_candidates_elections["dag"] = [int(parts[0]) for parts in split_dates]
all_candidates_elections.head(2)

Unnamed: 0.1,Unnamed: 0,Naam,Aanbevolen door,Aantal stemmen,Procentueel,District,Verkiezingdatum,Type,Omvang electoraat,Opkomst,Aantal stembriefjes,Aantal stemmen geldig,Aantal stemmen blanco,Aantal zetels,Kiesdrempel,jaar,maand,dag
0,0,mr. B. Wichers,,700,83.73%,Groningen,30/11/1848,algemeen,1191,838,838,836,0,1,418,1848,11,30
1,1,W.L. de Sturler,,34,4.07%,Groningen,30/11/1848,algemeen,1191,838,838,836,0,1,418,1848,11,30


In [279]:
# Map every election, identified by (district, year, month, day), to the rows
# of `allelected` recorded for that exact district and date. Elections without
# any such row map to an empty frame.
election_to_winners = {}
election_keys = zip(
    all_candidates_elections["District"],
    all_candidates_elections["jaar"],
    all_candidates_elections["maand"],
    all_candidates_elections["dag"],
)
for election in tqdm(election_keys, total=len(all_candidates_elections)):
    # Several candidates share one election; compute each election only once.
    if election not in election_to_winners:
        district, year, month, day = election
        same_election = (
            (allelected["districtsnaam"] == district)
            & (allelected["jaar"] == year)
            & (allelected["maand"] == month)
            & (allelected["dag"] == day)
        )
        election_to_winners[election] = allelected[same_election]

100%|██████████| 8238/8238 [00:10<00:00, 808.87it/s] 


In [272]:
# Spot check: winners recorded for the Groningen election of 30 November 1848.
election_to_winners["Groningen", 1848, 11, 30]

Unnamed: 0,achternaam,voornaam,tussenvoegsel,jaar,maand,dag,type verkiezing,districtsnaam,aantal stemmen,omvang_electoraat,zetels,drempel,naam
2410,Wichers,B.,,1848,11,30,algemeen,Groningen,700,1191,1.0,418.0,B. Wichers


In [327]:
# Flag every candidate row whose name overlaps with a winner of the same
# election. The comparison is a plain substring test in both directions, so a
# candidate name with a leading title ("mr. B. Wichers") still matches the
# winner name "B. Wichers".
name_in_winners = []
candidate_rows = zip(
    all_candidates_elections["Naam"],
    all_candidates_elections["District"],
    all_candidates_elections["jaar"],
    all_candidates_elections["maand"],
    all_candidates_elections["dag"],
)
for name, district, year, month, day in tqdm(candidate_rows, total=len(all_candidates_elections)):
    winner_names = election_to_winners[(district, year, month, day)]["naam"].values
    candidate_inside_winner = any(name in winner for winner in winner_names)
    is_winner = candidate_inside_winner or any(winner in name for winner in winner_names)
    name_in_winners.append(is_winner)
all_candidates_elections = all_candidates_elections.assign(name_in_winners=name_in_winners)
all_candidates_elections.head(2)

100%|██████████| 8238/8238 [00:01<00:00, 4279.70it/s]


Unnamed: 0.1,Unnamed: 0,Naam,Aanbevolen door,Aantal stemmen,Procentueel,District,Verkiezingdatum,Type,Omvang electoraat,Opkomst,Aantal stembriefjes,Aantal stemmen geldig,Aantal stemmen blanco,Aantal zetels,Kiesdrempel,jaar,maand,dag,name_in_winners
0,0,mr. B. Wichers,,700,83.73%,Groningen,30/11/1848,algemeen,1191,838,838,836,0,1,418,1848,11,30,True
1,1,W.L. de Sturler,,34,4.07%,Groningen,30/11/1848,algemeen,1191,838,838,836,0,1,418,1848,11,30,False


In [330]:
# Diagnostic: list the rows of `allelected` whose name overlaps with none of
# the candidate names flagged as winners.
# NOTE: the comparison is made against the matched names of ALL elections
# together, not per election, so a person matched in one election is treated
# as found for every election they appear in.
matched_candidate_names = set(
    all_candidates_elections.loc[all_candidates_elections["name_in_winners"], "Naam"]
)
missed_winners = []
for label, winner_name in tqdm(allelected["naam"].items()):
    found = (
        any(winner_name in candidate for candidate in matched_candidate_names)
        or any(candidate in winner_name for candidate in matched_candidate_names)
    )
    if not found:
        missed_winners.append(label)
missed_winners_df = allelected.loc[allelected.index.isin(missed_winners)]
missed_winners_df

2508it [00:00, 6139.40it/s]


Unnamed: 0,achternaam,voornaam,tussenvoegsel,jaar,maand,dag,type verkiezing,districtsnaam,aantal stemmen,omvang_electoraat,zetels,drempel,naam
407,Bylandt van Mariënweerd,O.W.A. graaf van,,1849,7,10,herstemming,Zaltbommel,431,935,1.0,307.0,O.W.A. graaf van Bylandt van Mariënweerd
439,Costerus,P.J.,,1848,12,28,herstemming,Sneek,354,1168,1.0,343.0,P.J. Costerus
1062,Hoytema,W.J. van,,1849,8,6,tussentijds,Zaltbommel,279,935,1.0,267.0,W.J. van Hoytema
1439,Lycklama à Nijeholt,W.H.,,1858,6,8,periodiek,Sneek,700,2842,1.0,662.0,W.H. Lycklama à Nijeholt
1440,Lycklama à Nijeholt,W.H.,,1862,6,24,herstemming,Sneek,1025,2859,1.0,953.0,W.H. Lycklama à Nijeholt
2187,Thomassen à Thuessink van der Hoop,A.J.,,1879,6,24,herstemming,Steenwijk,711,1539,1.0,678.0,A.J. Thomassen à Thuessink van der Hoop
2188,Thomassen à Thuessink van der Hoop,G.H.,,1882,4,11,tussentijds,Steenwijk,718,1591,1.0,672.0,G.H. Thomassen à Thuessink van der Hoop
2189,Thomassen à Thuessink van der Hoop,G.H.,,1883,6,12,periodiek,Steenwijk,772,1646,1.0,717.0,G.H. Thomassen à Thuessink van der Hoop
2190,Thomassen à Thuessink van der Hoop,G.H.,,1884,10,28,algemeen,Steenwijk,719,1631,1.0,650.0,G.H. Thomassen à Thuessink van der Hoop
2294,Villers de Pité,L.L.G.M. de,,1849,1,29,tussentijds,Heerlen,200,759,1.0,194.0,L.L.G.M. de Villers de Pité


In [328]:
# Show the candidate rows that were flagged as election winners.
all_candidates_elections.loc[all_candidates_elections["name_in_winners"]]

Unnamed: 0.1,Unnamed: 0,Naam,Aanbevolen door,Aantal stemmen,Procentueel,District,Verkiezingdatum,Type,Omvang electoraat,Opkomst,Aantal stembriefjes,Aantal stemmen geldig,Aantal stemmen blanco,Aantal zetels,Kiesdrempel,jaar,maand,dag,name_in_winners
0,0,mr. B. Wichers,,700,83.73%,Groningen,30/11/1848,algemeen,1191,838,838,836,0,1,418,1848,11,30,True
11,11,G. Reinders,,559,77.42%,Onderdendam,11/12/1848,herstemming,1568,727,727,722,4,1,361,1848,12,11,True
18,18,J.F. Zijlker,,530,64.56%,Appingedam,11/12/1848,tussentijds,1586,831,831,821,7,1,411,1848,12,11,True
26,26,mr. B. Albarda,,341,52.87%,Dokkum,30/11/1848,algemeen,984,650,650,645,5,1,323,1848,11,30,True
29,29,I.T. ter Brugghen Hugenholtz,,336,61.31%,Dokkum,19/12/1848,tussentijds,984,548,550,548,2,1,274,1848,12,19,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8226,8226,A. Colijn jr.,,5206,73.42%,Enkhuizen,21/08/1917,tussentijds,11474,-,7193,7091,102,1,3546,1917,8,21,True
8229,8229,H.G.M. Hermans,,4368,83.69%,Gulpen,06/06/1918,tussentijds,12477,-,5430,5219,211,1,2610,1918,6,6,True
8232,8232,Æ. baron van Mackay,,4305,54.80%,Ommen,28/10/1912,herstemming,8667,-,7909,7856,53,1,3928,1912,10,28,True
8234,8234,jhr.mr. D.J. de Geer,,3667,53.31%,Schiedam,22/10/1907,herstemming,8083,-,6916,6878,38,1,3439,1907,10,22,True


In [329]:
# Number of rows in `allelected`, for comparison with the matches above.
allelected.shape[0]

2508

In [245]:
# Clean the data a little bit (currently disabled: every statement below is commented out)

#allelected['maand'] = np.where(allelected['maand'] < 10, '0' + allelected['maand'].astype(str), allelected['maand'])
#allelected['dag'] = np.where(allelected['dag'] < 10, '0' + allelected['dag'].astype(str), allelected['dag'])
#allelected["Verkiezingdatum"] = allelected.dag.astype(str).str.cat(others=[allelected.maand.astype(str), allelected.jaar.astype(str)], sep='/')
#allelected.rename(columns={'districtsnaam':'District'}, inplace = True)

Then, the task is to find each politician's `b1_nummer` in the `politician_names` dataset, and subsequently their wealth in the `wealth.csv` dataset