## elections_to_full_data

This code file departs from the election list. We implement several steps in this file. The first objective is to match the names of the elections with the names of the politicians, and also with the names of the non-politicians whose data we have collected. 

The first thing we do is load the packages, and then clean up the dataset and compute the margin of victory of all the politicians. 

In [2]:
# Load the libraries

import pandas as pd
import numpy as np
import re
from pandas_ods_reader import read_ods
import statistics

import numpy as np
import matplotlib.pyplot as plt

from matplotlib import pyplot as plt

from tqdm import tqdm

import fuzzywuzzy as fw
from fuzzy_match import match
from fuzzy_match import algorithims
from fuzzywuzzy import process



In [3]:
# Import the election data
elected_people = pd.read_csv("../Data/elections/allelected.csv")

In [4]:
elected_people['naam'] = elected_people['voornaam'] + ' ' + elected_people['achternaam']
elected_people['verkiezingdatum'] = (elected_people['dag'].astype(str) + 
                                     '-' + 
                                     elected_people['maand'].astype(str) +
                                     '-' +
                                     elected_people['jaar'].astype(str)
                                    )
elected_people['verkiezingdatum'] = elected_people['verkiezingdatum'].apply(lambda x: pd.Timestamp(x))

In [5]:
election_results_details = pd.read_csv("../Data/elections/election_results_details.csv").iloc[:,1:]
election_results_details['Verkiezingdatum'] = (election_results_details['Verkiezingdatum'].
                                               apply(lambda x: pd.Timestamp(x))
                                              )

def get_zetels(df):
    a = pd.to_numeric(df['Aantal zetels'], errors='coerce')
    b = np.mean(a)
    return (b)

aantal_zetels = (election_results_details.groupby(['District', 'Verkiezingdatum']).
                 apply(get_zetels).reset_index().rename(columns={0:'Aantal zetels'})
                )

In [6]:
pd.merge(elected_people, aantal_zetels, 
         left_on=['districtsnaam', 'verkiezingdatum'],
         right_on=['District', 'Verkiezingdatum']).drop(columns=['District', 'Verkiezingdatum'])

Unnamed: 0,achternaam,voornaam,tussenvoegsel,jaar,maand,dag,type verkiezing,districtsnaam,aantal stemmen,omvang_electoraat,zetels,drempel,naam,verkiezingdatum,Aantal zetels
0,Aalberse,P.J.M.,,1903,2,18,tussentijds,Almelo,3821,7865,1.0,2953.0,P.J.M. Aalberse,1903-02-18,1.0
1,Aalberse,P.J.M.,,1905,6,16,algemeen,Almelo,5217,9324,1.0,3922.0,P.J.M. Aalberse,1905-06-16,1.0
2,Aalberse,P.J.M.,,1909,6,11,algemeen,Almelo,4744,9087,1.0,3498.0,P.J.M. Aalberse,1909-11-06,1.0
3,Aalberse,P.J.M.,,1913,6,17,algemeen,Almelo,5338,10107,1.0,3734.0,P.J.M. Aalberse,1913-06-17,1.0
4,Akerlaken,D. van,,1850,8,27,algemeen,Hoorn,1319,3430,2.0,1122.0,D. van Akerlaken,1850-08-27,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2423,Zuylen van Nijevelt,J.P.J.A. graaf van,,1871,6,13,periodiek,Arnhem,1009,2863,1.0,989.0,J.P.J.A. graaf van Zuylen van Nijevelt,1871-06-13,1.0
2424,Zwaag,G.L. van der,,1897,6,25,herstemming,Schoterland,1941,5584,1.0,1796.0,G.L. van der Zwaag,1897-06-25,1.0
2425,Zwaag,G.L. van der,,1901,6,27,herstemming,Weststellingwerf,1984,5228,1.0,1765.0,G.L. van der Zwaag,1901-06-27,1.0
2426,Zwaag,G.L. van der,,1901,6,27,herstemming,Schoterland,2168,5184,1.0,1809.0,G.L. van der Zwaag,1901-06-27,1.0


In [77]:
all_candidates = pd.read_csv("../Data/elections/election_results_details.csv").iloc[:,1:]
all_candidates['Aantal zetels'] = all_candidates['Aantal zetels'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
all_candidates['Verkiezingdatum'] = (all_candidates['Verkiezingdatum'].
                                     apply(lambda x: pd.Timestamp(x))
                                    )
all_candidates['Aantal stemmen'] = (all_candidates['Aantal stemmen'].
                                    apply(lambda x: pd.to_numeric(x, errors='coerce'))
                                   )

aantal_stemmen = (all_candidates.groupby(['District','Verkiezingdatum']).
 apply(lambda x: sum(x['Aantal stemmen'])).
 reset_index().
 rename(columns={0:'totaal aantal stemmen'})
)

all_candidates = pd.merge(all_candidates, aantal_stemmen,
        left_on=['District', 'Verkiezingdatum'],
        right_on=['District', 'Verkiezingdatum'])

all_candidates = (all_candidates.groupby(['District', 'Verkiezingdatum']).
 apply(lambda x: x.sort_values(['Aantal stemmen'], 
                               ascending = False))
).reset_index(drop=True)

all_candidates['hoeveelste_in_verkiezing'] = (all_candidates.groupby(['District', 'Verkiezingdatum']).
                                              cumcount() + 1)

all_candidates['gewonnen'] = np.where(all_candidates['hoeveelste_in_verkiezing'] <= all_candidates['Aantal zetels'], 1, 0)
all_candidates['marginal_winner'] = np.where(all_candidates['Aantal zetels'] - all_candidates['hoeveelste_in_verkiezing'] == 0, 1, 0)
all_candidates['marginal_loser'] = np.where(all_candidates['Aantal zetels'] - all_candidates['hoeveelste_in_verkiezing'] == -1, 1, 0)

In [98]:
def get_margin(dataframe):
    
    out = pd.DataFrame()
    
    for i in tqdm(range(len(dataframe))):
        
        try:
        
            distr = dataframe.iloc[i]['District']
            date = dataframe.iloc[i]['Verkiezingdatum']
        
            if dataframe.iloc[i]['gewonnen'] == 0:
            
                votes_marginal_winner = dataframe[(dataframe['District'] == distr) & 
                  (dataframe['Verkiezingdatum'] == date) &
                  (dataframe['marginal_winner'] == 1)]['Aantal stemmen'].values[0]
            
                margin = (dataframe.iloc[i]['Aantal stemmen'] - votes_marginal_winner)/dataframe.iloc[i]['totaal aantal stemmen']
        
            if dataframe.iloc[i]['gewonnen'] == 1:
        
                votes_marginal_loser = dataframe[(dataframe['District'] == distr) & 
                  (dataframe['Verkiezingdatum'] == date) &
                  (dataframe['marginal_loser'] == 1)]['Aantal stemmen'].values[0]
            
                margin = (dataframe.iloc[i]['Aantal stemmen'] - votes_marginal_loser)/dataframe.iloc[i]['totaal aantal stemmen']
    
        except:
            
            margin = None
            
        interim = dataframe.iloc[i:i+1]
        interim = interim.assign(margin = margin)
            
        out = out.append(interim)
        
    return out

In [None]:
all_candidates = get_margin(all_candidates)

 70%|███████   | 5788/8238 [00:57<00:25, 95.39it/s] 