## elections_to_full_data

This code file departs from the election list. We implement several steps in this file. The first objective is to match the names of the elections with the names of the politicians, and also with the names of the non-politicians whose data we have collected. 

The first thing we do is load the packages, and then clean up the dataset and compute the margin of victory of all the politicians. 

In [1]:
# Load the libraries
import pandas as pd
import numpy as np
import re

from pandas_ods_reader import read_ods
import statistics

import numpy as np
import matplotlib.pyplot as plt

from matplotlib import pyplot as plt

from tqdm import tqdm

from functions import *

from thefuzz import fuzz
from thefuzz import process

In [2]:
# Import the election data
elected_people = pd.read_csv("../Data/elections/allelected.csv", encoding='latin-1')

In [3]:
elected_people['naam'] = elected_people['voornaam'] + ' ' + elected_people['achternaam']
elected_people['verkiezingdatum'] = (elected_people['dag'].astype(str) + 
                                     '-' + 
                                     elected_people['maand'].astype(str) +
                                     '-' +
                                     elected_people['jaar'].astype(str)
                                    )
elected_people['verkiezingdatum'] = elected_people.apply(lambda x: pd.Timestamp(year=x['jaar'], month=x['maand'], day=x['dag']), axis=1)

In [4]:
election_results_details = pd.read_csv("../Data/elections/election_results_details.csv").iloc[:,1:]
election_results_details['Verkiezingdatum'] = (election_results_details['Verkiezingdatum'].
                                               apply(lambda x: pd.Timestamp(x))
                                              )

def get_zetels(df):
    a = pd.to_numeric(df['Aantal zetels'], errors='coerce')
    b = np.mean(a)
    return (b)

aantal_zetels = (election_results_details.groupby(['District', 'Verkiezingdatum']).
                 apply(get_zetels).reset_index().rename(columns={0:'Aantal zetels'})
                )

In [5]:
#pd.merge(elected_people, aantal_zetels, 
#         left_on=['districtsnaam', 'verkiezingdatum'],
#         right_on=['District', 'Verkiezingdatum']).drop(columns=['District', 'Verkiezingdatum'])


In [6]:
all_candidates = pd.read_csv("../Data/elections/election_results_details.csv").iloc[:,1:]


all_candidates['Verkiezingdatum'] = all_candidates['Verkiezingdatum'].str.split("/")
all_candidates['Verkiezingdatum'] = all_candidates['Verkiezingdatum'].apply(lambda x: [int(y) for y in x])
all_candidates['Verkiezingdatum'] = all_candidates['Verkiezingdatum'].apply(
    lambda x: pd.Timestamp(day=x[0], month=x[1], year=x[2]) if all(type(y) == int for y in x)
    else None
)
all_candidates['Aantal zetels'] = all_candidates['Aantal zetels'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
all_candidates['Aantal stemmen'] = (all_candidates['Aantal stemmen'].
                                    apply(lambda x: pd.to_numeric(x, errors='coerce'))
                                   )

aantal_stemmen = (all_candidates.groupby(['District','Verkiezingdatum']).
 apply(lambda x: sum(x['Aantal stemmen'])).
 reset_index().
 rename(columns={0:'totaal aantal stemmen'})
)

all_candidates = pd.merge(all_candidates, aantal_stemmen,
        left_on=['District', 'Verkiezingdatum'],
        right_on=['District', 'Verkiezingdatum'])

all_candidates = (all_candidates.groupby(['District', 'Verkiezingdatum']).
 apply(lambda x: x.sort_values(['Aantal stemmen'], 
                               ascending = False))
).reset_index(drop=True)

all_candidates['hoeveelste_in_verkiezing'] = (all_candidates.groupby(['District', 'Verkiezingdatum']).
                                              cumcount() + 1)

all_candidates['gewonnen'] = np.where(all_candidates['hoeveelste_in_verkiezing'] <= all_candidates['Aantal zetels'], 1, 0)
all_candidates['marginal_winner'] = np.where(all_candidates['Aantal zetels'] - all_candidates['hoeveelste_in_verkiezing'] == 0, 1, 0)
all_candidates['marginal_loser'] = np.where(all_candidates['Aantal zetels'] - all_candidates['hoeveelste_in_verkiezing'] == -1, 1, 0)

In [7]:
all_candidates = get_margin(all_candidates)

100%|██████████| 8238/8238 [01:14<00:00, 110.32it/s]


In [8]:
#get_match(elected_people).to_csv("../Data/politician_data/key_allelected_to_all_candidates.csv", index = False)

In [9]:
key = pd.read_csv("../Data/politician_data/key_allelected_to_all_candidates.csv")

all_candidates = pd.merge(all_candidates, key, 
         how='left',
         left_on='Naam',
         right_on='name_in_all_elections')

cols_to_order = ['Naam', 'name_in_all_elections', 'name_in_elected_people']
new_columns = cols_to_order + (all_candidates.columns.drop(cols_to_order).tolist())

all_candidates = all_candidates[new_columns]

In [10]:
consequential_elections = []

for i in range(len(elected_people)):
    consequential_elections.append((elected_people['verkiezingdatum'].iloc[i], elected_people['districtsnaam'].iloc[i]))


In [13]:

def get_elec_stats(dataframe):
    
    out = pd.DataFrame()
    
    for i in tqdm(range(len(dataframe))):
        
        naam = dataframe.iloc[i]['Naam']
        datum = dataframe.iloc[i]['Verkiezingdatum']
        district = dataframe.iloc[i]['District']
                
        if (datum, district) in consequential_elections:
            consequential_election = 1
        else: 
            consequential_election = 0 
        
        # proberen
        hoeveelste_keer_prob = dataframe[
            (dataframe['Naam'] == naam) & 
            (dataframe['Verkiezingdatum'] <= datum)].shape[0]
        hoeveelste_keer_prob_alg = dataframe[
            (dataframe['Naam'] == naam) & 
            (dataframe['Verkiezingdatum'] <= datum) & 
            (dataframe['Type'] != 'herstemming')].shape[0]
          
        hoeveel_keer_prob_tot = dataframe[dataframe['Naam'] == naam].shape[0]
    
        last = dataframe[dataframe['Naam'] == naam].shape[0] - 1
        
        alle_verkiezingen_voor_deze_persoon = dataframe[(dataframe['Naam'] == naam)].sort_values('Verkiezingdatum')
        
        # probeer je het ook in andere districten in deze verkiezing?
        hoeveel_distr_deze_verk = dataframe[(dataframe['Naam'] == naam) & (dataframe['Verkiezingdatum'] == datum)].shape[0]
        
        # write here the new functions hoelang tussen 1 en laatst and hoelang tussen 1 and 2
        hoelang_tussen_1_en_laatst = alle_verkiezingen_voor_deze_persoon.iloc[last]['Verkiezingdatum'] - alle_verkiezingen_voor_deze_persoon.iloc[0]['Verkiezingdatum']
        
        try:
            hoelang_tussen_1_en_2 = (dataframe[(dataframe['Naam'] == naam) & 
                                               (dataframe['Type'] != 'herstemming')].
                                     iloc[1]['Verkiezingdatum'] - 
                                     dataframe[(dataframe['Naam'] == naam) 
                                               & (dataframe['Type'] != 'herstemming')].
                                     iloc[0]['Verkiezingdatum'])
            
        except:
            hoelang_tussen_1_en_2 = None
        
        
        # gewonnen
        
        if dataframe.iloc[i]['gewonnen'] == 1:
        
            hoeveelste_keer_gewonnen = dataframe[(dataframe['Naam'] == naam) & 
                                             (dataframe['gewonnen'] == 1) &
                                             (dataframe['Verkiezingdatum'] <= datum)].shape[0]
            hoeveelste_keer_gewonnen_alg = dataframe[(dataframe['Naam'] == naam) & 
                                                 (dataframe['gewonnen'] == 1) & 
                                                 (dataframe['Verkiezingdatum'] <= datum) & 
                                                 (dataframe['Type'] != 'herstemming')].shape[0]
        else:
            
            hoeveelste_keer_gewonnen = None
            hoeveelste_keer_gewonnen_alg = None
            
        # hoevaak gewonnen door elected_people
        hoeveelste_keer_gewonnen_tweedeproxy = elected_people[
            (elected_people['naam'] == dataframe.iloc[i]['name_in_elected_people']) &
            (elected_people['verkiezingdatum'] == datum)].shape[0]
        
        hoevaak_gewonnen_alltime = elected_people[elected_people['naam'] == dataframe.iloc[i]['name_in_elected_people']].shape[0]
            
        hoevaak_gewonnen_toekomst = elected_people[
            (elected_people['naam'] == dataframe.iloc[i]['name_in_elected_people']) &
            (elected_people['verkiezingdatum'] > datum)].shape[0]
        hoevaak_gewonnen_verleden = elected_people[
            (elected_people['naam'] == dataframe.iloc[i]['name_in_elected_people']) &
            (elected_people['verkiezingdatum'] < datum)].shape[0]
            
        # volgende verkiezingen (ook zoeken in elected_people)
        elections_participated_in_future = dataframe[(dataframe['Naam'] == naam) & 
                                                           (dataframe['Verkiezingdatum'] > datum) &
                                                           ((dataframe['Type'] == 'algemeen') | 
                                                            (dataframe['Type'] == 'periodiek'))]['Verkiezingdatum'].tolist()
        
        verk_2_10_gewonnen = [None, None, None, None, None, None, None, None, None]
        
        for j, val in enumerate(elections_participated_in_future):

                if j > 8:
                    continue

                cur_gewonnen = elected_people[
                    (elected_people['naam'] == dataframe.iloc[i]['name_in_elected_people']) 
                    & (elected_people['verkiezingdatum'] == val)].shape[0]
                verk_2_10_gewonnen[j] = cur_gewonnen               
        
        # put everything together in dataframe
        interim = dataframe.iloc[i:i+1]
        
        interim = interim.assign(consequential_election = consequential_election)
        interim = interim.assign(hoelang_tussen_1_en_2 = hoelang_tussen_1_en_2)
        interim = interim.assign(hoelang_tussen_1_en_laatst = hoelang_tussen_1_en_laatst)
        
        interim = interim.assign(hoeveel_distr_deze_verk = hoeveel_distr_deze_verk)
        
        interim = interim.assign(hoeveelste_keer_prob = hoeveelste_keer_prob)
        interim = interim.assign(hoeveelste_keer_prob_alg = hoeveelste_keer_prob_alg)
        
        interim = interim.assign(hoeveelste_keer_gewonnen = hoeveelste_keer_gewonnen)
        interim = interim.assign(hoeveelste_keer_gewonnen_alg = hoeveelste_keer_gewonnen_alg)
        interim = interim.assign(hoeveelste_keer_gewonnen_tweedeproxy = hoeveelste_keer_gewonnen_tweedeproxy)
        interim = interim.assign(hoevaak_gewonnen_alltime = hoevaak_gewonnen_alltime)
        interim = interim.assign(hoevaak_gewonnen_verleden = hoevaak_gewonnen_verleden)
        interim = interim.assign(hoevaak_gewonnen_toekomst = hoevaak_gewonnen_toekomst)
        interim = interim.assign(verk_2_10_gewonnen = [verk_2_10_gewonnen])
        
                                                         
        out = out.append(interim)
        
    return out
        

In [14]:
complete_elections_dataset = get_elec_stats(all_candidates)

100%|██████████| 8238/8238 [06:06<00:00, 22.45it/s]


In [15]:
complete_elections_dataset.head(5)

Unnamed: 0,Naam,name_in_all_elections,name_in_elected_people,Aanbevolen door,Aantal stemmen,Procentueel,District,Verkiezingdatum,Type,Omvang electoraat,...,hoeveel_distr_deze_verk,hoeveelste_keer_prob,hoeveelste_keer_prob_alg,hoeveelste_keer_gewonnen,hoeveelste_keer_gewonnen_alg,hoeveelste_keer_gewonnen_tweedeproxy,hoevaak_gewonnen_alltime,hoevaak_gewonnen_verleden,hoevaak_gewonnen_toekomst,verk_2_10_gewonnen
0,S.A. de Moraaz,S.A. de Moraaz,S.A. de Moraaz,,503.0,52.84%,Alkmaar,1848-11-30,algemeen,1107,...,1,1,1,1.0,1.0,1,2,0,1,"[1, 0, 0, None, None, None, None, None, None]"
1,G. van Leeuwen,,,,438.0,46.01%,Alkmaar,1848-11-30,algemeen,1107,...,1,1,1,,,0,0,0,0,"[None, None, None, None, None, None, None, Non..."
2,mr. H.J. Smit,mr. H.J. Smit,H.J. Smit,,1566.0,79.86%,Alkmaar,1850-08-27,algemeen,2833,...,1,2,2,2.0,2.0,1,3,1,1,"[1, None, None, None, None, None, None, None, ..."
3,S.A. de Moraaz,S.A. de Moraaz,S.A. de Moraaz,,1275.0,65.02%,Alkmaar,1850-08-27,algemeen,2833,...,1,2,2,2.0,2.0,1,2,1,0,"[0, 0, None, None, None, None, None, None, None]"
4,jhr.mr. C. van Foreest,jhr.mr. C. van Foreest,C. van Foreest,,685.0,34.93%,Alkmaar,1850-08-27,algemeen,2833,...,1,1,1,,,0,9,0,9,"[1, 0, 0, 1, 1, 0, 0, 1, None]"


## Matching:

I now have to match the names from the (ever) elected-people to the names in the PDC dataset. This way, I can merge the PDC data of two categories of people:

   - Politicians
   - Unsuccesful future or past politicians
    
Then, the only other candidates that have to be merged are the never succesful candidates, which are supposed to be an exact match. 

   - Check whether this is in fact true, whether all of these observations are to be found in the list of complete_elections_dataset.
   
In the file `data_to_analysis_unmatched.ipynb` is an alternative matching script. That notebook also contains the deflate function so as to go from raw wealth to deflated wealth. 

In [16]:
# Import politician data
politician_data = pd.read_excel("../Data/politician_data/tk_1815tot1950uu.xlsx", dtype={'b1-nummer':str})
politician_data['prepositie'] = politician_data['prepositie'].fillna('')
politician_data['naam'] = politician_data['voorletters'] + ' ' + politician_data['prepositie'] + ' ' + politician_data['achternaam']
politician_data['begin periode'] = politician_data['begin periode'].str.split("-")
politician_data['begin periode'] = politician_data['begin periode'].apply(lambda x: [int(y) for y in x])
politician_data['begin periode'] = politician_data['begin periode'].apply(
    lambda x: pd.Timestamp(day=x[2], month=x[1], year=x[0]) if all(type(y) == int for y in x)
    else None
)

politician_data['einde periode'] = politician_data['einde periode'].str.split("-")
politician_data['einde periode'] = politician_data['einde periode'].apply(lambda x: [int(y) for y in x])
politician_data['einde periode'] = politician_data['einde periode'].apply(
    lambda x: pd.Timestamp(day=x[2], month=x[1], year=x[0]) if all(type(y) == int for y in x)
    else None
)


In [17]:
aep = elected_people['naam'].unique().tolist()

In [59]:
#out = {}

#for i, name in enumerate(aep):
#    
#    first_participation = elected_people[elected_people['naam'] == name].sort_values('verkiezingdatum').iloc[0]['verkiezingdatum']
#    
#    candidate_matches = politician_data[
#        (politician_data['begin periode'] > (first_participation - pd.Timedelta(days=100))) &
#        (politician_data['einde periode'] > first_participation + pd.Timedelta(days=100))]
#    
#    match = process.extractOne(name, candidate_matches['naam'].tolist())[0]
#    
#    out[name] = match
    

In [64]:
#politician_data.to_csv("../Data/politician_data/politician_data_use_for_correction_then_delete.csv", index=False)
#interim_key = pd.DataFrame.from_dict(out, orient='index', columns=['name_politician_data']).reset_index().rename(columns={'index':'name_all_elected'})
#interim_key.to_csv("../Data/politician_data/key_elected_people_to_pdc.csv", index = False)

## After manually matching the deficiencies

This contains a pretty good match, however, there are some inaccuracies. Hence, I conduct manual matching in the file `key_elected_people_to_pdc.csv`. Afterwards, I reimport it, so that the key is perfect. This then serves as the key on the basis of which to map the politicians in the all_elections data to the b1-nummers. On the basis of that, I merge it with the wealth data. 

Finally, I match the non-politicians in that same dataset on the name in my wealth dataset, because they contain exact matches (check this)


Super-finally, do another count of matching, where you only accord matches if they have an extremely high matching rate. This way, I make up for the missing matches. (We can check them by putting them in a dictionary). 




In [18]:
key_elected_people_pdc = pd.read_csv("../Data/politician_data/key_elected_people_to_pdc.csv")

ced = pd.merge(complete_elections_dataset,
         key_elected_people_pdc,
         how='left',
         left_on='name_in_elected_people',
         right_on='name_all_elected').drop(columns=['name_all_elected'])

# now merge with  b1-nummer
ced = pd.merge(ced, politician_data[['b1-nummer', 'naam']],
    how='left',
    left_on='name_politician_data',
    right_on='naam')

ced.drop(columns='naam')
#key_elected_people_pdc

Unnamed: 0,Naam,name_in_all_elections,name_in_elected_people,Aanbevolen door,Aantal stemmen,Procentueel,District,Verkiezingdatum,Type,Omvang electoraat,...,hoeveelste_keer_prob_alg,hoeveelste_keer_gewonnen,hoeveelste_keer_gewonnen_alg,hoeveelste_keer_gewonnen_tweedeproxy,hoevaak_gewonnen_alltime,hoevaak_gewonnen_verleden,hoevaak_gewonnen_toekomst,verk_2_10_gewonnen,name_politician_data,b1-nummer
0,S.A. de Moraaz,S.A. de Moraaz,S.A. de Moraaz,,503.0,52.84%,Alkmaar,1848-11-30,algemeen,1107,...,1,1,1,1,2,0,1,"[1, 0, 0, None, None, None, None, None, None]",S.A. de Moraaz,00943
1,G. van Leeuwen,,,,438.0,46.01%,Alkmaar,1848-11-30,algemeen,1107,...,1,,,0,0,0,0,"[None, None, None, None, None, None, None, Non...",,
2,mr. H.J. Smit,mr. H.J. Smit,H.J. Smit,,1566.0,79.86%,Alkmaar,1850-08-27,algemeen,2833,...,2,2,2,1,3,1,1,"[1, None, None, None, None, None, None, None, ...",H.J. Smidt,01272
3,S.A. de Moraaz,S.A. de Moraaz,S.A. de Moraaz,,1275.0,65.02%,Alkmaar,1850-08-27,algemeen,2833,...,2,2,2,1,2,1,0,"[0, 0, None, None, None, None, None, None, None]",S.A. de Moraaz,00943
4,jhr.mr. C. van Foreest,jhr.mr. C. van Foreest,C. van Foreest,,685.0,34.93%,Alkmaar,1850-08-27,algemeen,2833,...,1,,,0,9,0,9,"[1, 0, 0, 1, 1, 0, 0, 1, None]",C. van Foreest,00416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8452,B. Luteraan,,,SDP,32.0,0.41%,Zwolle,1913-06-17,algemeen,8889,...,3,,,0,0,0,0,"[None, None, None, None, None, None, None, Non...",,
8453,F.M. Knobel,F.M. Knobel,F.M. Knobel,VL(Lib/VD/SDAP),4249.0,50.15%,Zwolle,1913-06-25,herstemming,8889,...,2,1,0,1,2,0,1,"[1, None, None, None, None, None, None, None, ...",F.M. Knobel,00709
8454,A. baron van Dedem,A. baron van Dedem,A. baron van Dedem,CHU(Ka/AR),4223.0,49.85%,Zwolle,1913-06-25,herstemming,8889,...,15,,,0,13,13,0,"[None, None, None, None, None, None, None, Non...",A. baron van Dedem,00293
8455,F.M. Knobel,F.M. Knobel,F.M. Knobel,VL,3236.0,86.22%,Zwolle,1917-06-15,algemeen,9645,...,3,2,1,1,2,1,0,"[None, None, None, None, None, None, None, Non...",F.M. Knobel,00709


In [19]:
# now retrieve wealth and merge
wealth = pd.read_csv("../Data/politician_data/wealth_politicians.csv")[['indexnummer', 'w_deflated']]

ced = pd.merge(ced, wealth,
         how='left',
         left_on='b1-nummer',
         right_on='indexnummer'
        )


In [20]:
# now match non-politicians (exact match)
nonpols = pd.read_csv("../Data/analysis/unmatched_sample_analysis.csv").iloc[:,1:]
nonpols = nonpols[nonpols['b1-nummer'].isna()]
nonpols = nonpols[~nonpols['Vermogen_deflated'].isna()][['Naam', 'Vermogen_deflated']].drop_duplicates()

In [21]:
all_together = pd.merge(ced, nonpols, 
         how='left', 
         left_on='Naam', 
         right_on='Naam').drop(columns='indexnummer')

all_together['deflated_wealth'] = np.where(all_together['w_deflated'].isna(), all_together['Vermogen_deflated'], all_together['w_deflated'])

all_together = all_together.drop(columns=['w_deflated', 'Vermogen_deflated'])

In [22]:
# Split up the future election list-column
verk_sep = pd.DataFrame(all_together['verk_2_10_gewonnen'].tolist(), columns = ['verk_2_gewonnen',
                                                                   'verk_3_gewonnen',
                                                                   "verk_4_gewonnen",
                                                                   "verk_5_gewonnen",
                                                                   "verk_6_gewonnen",
                                                                    "verk_7_gewonnen",
                                                                    "verk_8_gewonnen",
                                                                    "verk_9_gewonnen",
                                                                    "verk_10_gewonnen"])

all_together = pd.concat([all_together, verk_sep], axis = 1).drop(columns='verk_2_10_gewonnen')

In [23]:
all_together.to_csv("../Data/analysis/full_sample_analysis_novars.csv")