In [3]:
# Load the libraries

import pandas as pd
import numpy as np
import re
from pandas_ods_reader import read_ods

from scipy import stats
from statsmodels.formula.api import ols
import statsmodels.api as sm
from statsmodels.compat import lzip
import numpy as np
import matplotlib.pyplot as plt

from matplotlib import pyplot as plt

from tqdm import tqdm

import fuzzywuzzy as fw
from fuzzy_match import match
from fuzzy_match import algorithims
from fuzzywuzzy import process



In [4]:
def decode_accents(name):
    try:
        return name.encode('latin-1').decode('raw_unicode_escape').encode('latin-1').decode('utf-8')
    except:
        return name

In [5]:
all_candidates_elections = pd.read_csv("../Data/elections/election_results_details.csv", encoding='utf-8-sig')
all_candidates_elections["Naam"] = all_candidates_elections["Naam"].apply(lambda x : decode_accents(x))
all_candidates_elections['Aantal stemmen'] = pd.to_numeric(all_candidates_elections['Aantal stemmen'], errors="coerce")

all_candidates_elections.head(2)

Unnamed: 0.1,Unnamed: 0,Naam,Aanbevolen door,Aantal stemmen,Procentueel,District,Verkiezingdatum,Type,Omvang electoraat,Opkomst,Aantal stembriefjes,Aantal stemmen geldig,Aantal stemmen blanco,Aantal zetels,Kiesdrempel
0,0,mr. B. Wichers,,700.0,83.73%,Groningen,30/11/1848,algemeen,1191,838,838,836,0,1,418
1,1,W.L. de Sturler,,34.0,4.07%,Groningen,30/11/1848,algemeen,1191,838,838,836,0,1,418


## Recompute the opkomst

Because it is not correct in a couple of districts. 

In [6]:
aantalstemmen = all_candidates_elections.groupby(['District', 'Verkiezingdatum']).agg({'Aantal stemmen':'sum'}).reset_index()

#aantalstemmen
all_candidates_elections = all_candidates_elections.merge(aantalstemmen, how='left', 
                               left_on = ['District', 'Verkiezingdatum'],
                              right_on = ['District', 'Verkiezingdatum'])

del(all_candidates_elections['Opkomst'])

all_candidates_elections = all_candidates_elections.rename(columns={'Aantal stemmen_y':'Opkomst',
                                        'Aantal stemmen_x':'Aantal stemmen'})

In [7]:
def to_int(x):
    try:
        return int(x)
    except:
        return None
    
#all_candidates_elections['Aantal stemmen'] = all_candidates_elections['Aantal stemmen'].apply(to_int)
all_candidates_elections['Aantal zetels'] = all_candidates_elections['Aantal zetels'].apply(to_int)
all_candidates_elections['Aantal stemmen geldig'] = all_candidates_elections['Aantal stemmen geldig'].apply(to_int)

In [8]:
all_candidates_elections

Unnamed: 0.1,Unnamed: 0,Naam,Aanbevolen door,Aantal stemmen,Procentueel,District,Verkiezingdatum,Type,Omvang electoraat,Aantal stembriefjes,Aantal stemmen geldig,Aantal stemmen blanco,Aantal zetels,Kiesdrempel,Opkomst
0,0,mr. B. Wichers,,700.0,83.73%,Groningen,30/11/1848,algemeen,1191,838,836.0,0,1.0,418,791.0
1,1,W.L. de Sturler,,34.0,4.07%,Groningen,30/11/1848,algemeen,1191,838,836.0,0,1.0,418,791.0
2,2,mr. A. Oudeman,,22.0,2.63%,Groningen,30/11/1848,algemeen,1191,838,836.0,0,1.0,418,791.0
3,3,L.T. Jorissen,,21.0,2.51%,Groningen,30/11/1848,algemeen,1191,838,836.0,0,1.0,418,791.0
4,4,jhr.mr. O.Q.J.J. van Swinderen,,14.0,1.67%,Groningen,30/11/1848,algemeen,1191,838,836.0,0,1.0,418,791.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8233,8233,mr. H. van der Vegte,,3440.0,43.79%,Ommen,28/10/1912,herstemming,8667,7909,7856.0,53,1.0,3928,7745.0
8234,8234,jhr.mr. D.J. de Geer,,3667.0,53.31%,Schiedam,22/10/1907,herstemming,8083,6916,6878.0,38,1.0,3439,6878.0
8235,8235,H.J. Versteeg,,3211.0,46.69%,Schiedam,22/10/1907,herstemming,8083,6916,6878.0,38,1.0,3439,6878.0
8236,8236,W.P.G. Helsdingen,,3340.0,52.55%,Franeker,15/10/1907,herstemming,7514,6409,6356.0,53,1.0,3178,6356.0


## Attention

In this function, pay attention how I define margin. Surprisingly, in almost no studies that use close elections, much attention is paid to this detail, presumably because it is all american and these people do not care much about elections with more than two candidates, so they always put the margin at 50%. 

My margin is defined as:

$$
\text{margin} = \begin{cases}
\text{Votes}_i - \text{# Votes Next Loser} &\mbox{ if } i \text{ is a winner} \\
\text{Votes}_i - \text{# Votes Next Winner} &\mbox{ if } i \text{ is a loser}
\end{cases}
$$


In [9]:
# Now in this function, we reproduce the dataset and create the margin:

#df is supposed to be all_candidates_elections

def get_margin(df):
    ## First, create a dataframe that contains only the number of seats per election
    zetels = all_candidates_elections.groupby(['District', 'Verkiezingdatum']).agg({'Aantal zetels': 'mean'})

    margins = []

    for i in tqdm(range(len(df))):
    
        try:
            distr = all_candidates_elections.loc[i,'District']
            date = all_candidates_elections.loc[i,'Verkiezingdatum']
            number_of_seats = zetels.loc[distr, date][0]
            
        # Filter the dataframe according to district and election date - and select the observation that is the marginal winner
    
            amountvotes_marginalwinner = (all_candidates_elections[(all_candidates_elections['District'] == distr) & (all_candidates_elections['Verkiezingdatum'] == date)].
             sort_values('Aantal stemmen', ascending = False).reset_index(drop=True).loc[number_of_seats-1,'Aantal stemmen'])
        
            amountvotes_marginalloser = (all_candidates_elections[(all_candidates_elections['District'] == distr) & (all_candidates_elections['Verkiezingdatum'] == date)].
             sort_values('Aantal stemmen', ascending = False).reset_index(drop=True).loc[number_of_seats,'Aantal stemmen'])
            
            # If I am the winner, I get a positive margin
            if all_candidates_elections.loc[i,'Aantal stemmen'] >= amountvotes_marginalwinner:
                margin = (all_candidates_elections.loc[i, 'Aantal stemmen'] - amountvotes_marginalloser) /all_candidates_elections.loc[i,'Opkomst']
            
            # If I am the loser, I get a negative margin
            else: 
                margin = (all_candidates_elections.loc[i,'Aantal stemmen'] - amountvotes_marginalwinner)/all_candidates_elections.loc[i,'Opkomst']
        
            margins.append(margin)
        
        except:
            
            margins.append(999)

    
    df['margin'] = pd.Series(margins)
    
    return(df)

In [10]:
df_margins = get_margin(all_candidates_elections)

100%|██████████| 8238/8238 [00:39<00:00, 210.83it/s]


In [16]:
# Now, convert a dataframe to winners and losers, and find out which ones are losers

winners = df_margins[df_margins['margin'] > 0.0]
winners = winners[winners['margin'] != 999]
losers = df_margins[df_margins['margin'] < 0.0]

## The next three chunks:

- Find out which one of these are politicians, which ones aren't

- Merge them with the datasets I have at present

- Find out which observations I missed in the datasets I have at present, and ameliorate them


In [29]:
# Which ones are close losers
closelosers = losers[(-losers['Naam'].isin(winners['Naam'])) & (abs(losers['margin']) < 0.3)]

In [31]:
# Find the close losers that are not included in the previous list:
previouslist = pd.read_csv("../Administration/new_data_entry_file.csv")

In [37]:
# Put the potential missing in a list, and export to .csv
potentialmissing = closelosers[-closelosers['Naam'].isin(previouslist['Naam'])]
potentialmissing.to_csv("../Administration/potentialmissing.csv")