# Exploring presidential candidate endorsements

In [1]:
import json
import csv
from collections import Counter

## 1. Loading the endorsement JSON file

In [2]:
# NOTE: if you are using windows, you need to mind two additional things:
#  1. You must double-escape your antislashes likewise: C:\\path\\to\\thing, else it will trigger syntax errors
#  2. You will need to pass the `encoding` kwarg set to "utf8" to the `open` function
with open('../data/parrainages.json') as f:
    ENDORSEMENTS_RAW_DATA = json.load(f)

In [3]:
ENDORSEMENTS_RAW_DATA[0]

{'Civilite': 'M.',
 'Nom': 'CORDIVAL',
 'Prenom': 'Gilles',
 'Mandat': 'Maire',
 'Circonscription': 'Mont-Saint-Père',
 'Departement': 'Aisne',
 'Candidat': 'ARTHAUD Nathalie',
 'DatePublication': '2022-02-01T00:00:00'}

In [4]:
len(ENDORSEMENTS_RAW_DATA), type(ENDORSEMENTS_RAW_DATA)

(13427, list)

In [5]:
# Reshaping the data
ENDORSEMENTS_DATA = []

for record in ENDORSEMENTS_RAW_DATA:
    ENDORSEMENTS_DATA.append({
        'name': record['Prenom'],
        'surname': record['Nom'],
        'mandate': record['Mandat'],
        'department': record['Departement'],
        'endorsement_date': record['DatePublication'],
        'gender': 'male' if record['Civilite'] == 'M.' else 'female',
        'candidate': record['Candidat']
    })

In [6]:
ENDORSEMENTS_DATA[0]

{'name': 'Gilles',
 'surname': 'CORDIVAL',
 'mandate': 'Maire',
 'department': 'Aisne',
 'endorsement_date': '2022-02-01T00:00:00',
 'gender': 'male',
 'candidate': 'ARTHAUD Nathalie'}

### Some stats about gender and candidates

In [7]:
gender_stats = {}

for record in ENDORSEMENTS_DATA:
    gender = record['gender']
    
    if gender in gender_stats:
        gender_stats[gender] += 1 # same as `gender_stats[gender] = gender_stats[gender] + 1`
    else:
        gender_stats[gender] = 1

In [8]:
gender_stats

{'male': 9889, 'female': 3538}

In [9]:
def ratio(d, n):
    ratios = {}
    
    for k, v in d.items():
        ratios[k] = v / n
        
    return ratios

In [10]:
ratio(gender_stats, len(ENDORSEMENTS_DATA))

{'male': 0.7365010799136069, 'female': 0.2634989200863931}

In [11]:
candidates_stats = Counter()

for record in ENDORSEMENTS_DATA:
    candidate = record['candidate']
    candidates_stats[candidate] += 1

In [12]:
sorted(candidates_stats.items(), key=lambda x: x[1], reverse=True)[:15]

[('PÉCRESSE Valérie', 2636),
 ('MACRON Emmanuel', 2098),
 ('HIDALGO Anne', 1440),
 ('MÉLENCHON Jean-Luc', 906),
 ('ZEMMOUR Éric', 741),
 ('JADOT Yannick', 712),
 ('LASSALLE Jean', 642),
 ('ROUSSEL Fabien', 626),
 ('LE PEN Marine', 622),
 ('DUPONT-AIGNAN Nicolas', 600),
 ('POUTOU Philippe', 596),
 ('ARTHAUD Nathalie', 576),
 ('ASSELINEAU François', 293),
 ('TAUBIRA Christiane', 274),
 ('KAZIB Anasse', 160)]

In [13]:
candidates_stats.most_common(5)

[('PÉCRESSE Valérie', 2636),
 ('MACRON Emmanuel', 2098),
 ('HIDALGO Anne', 1440),
 ('MÉLENCHON Jean-Luc', 906),
 ('ZEMMOUR Éric', 741)]

In [14]:
# Generator comprehension
Counter(record['candidate'] for record in ENDORSEMENTS_DATA).most_common(5)

[('PÉCRESSE Valérie', 2636),
 ('MACRON Emmanuel', 2098),
 ('HIDALGO Anne', 1440),
 ('MÉLENCHON Jean-Luc', 906),
 ('ZEMMOUR Éric', 741)]

In [15]:
# List comprehension
len([record for record in ENDORSEMENTS_DATA if record['gender'] == 'female'])

3538

In [16]:
# Same as above
female_endorsements = []

for record in ENDORSEMENTS_DATA:
    if record['gender'] == 'female':
        female_endorsements.append(record)

len(female_endorsements)

3538

### Goals so we can keep only the relevant endorsements:
  1. Find the proportion of mayors endorsements
  2. Find the running candidates (>= 500 endorsements)
  3. Filter out irrelevant endorsements:
      - From non running candidates
      - Endorsements from non mayors

In [17]:
# The mandate value is usually messy and "Maire" only won't cut it
Counter(record['mandate'] for record in ENDORSEMENTS_DATA).most_common(10)

[('Maire', 8712),
 ('Conseillère départementale', 854),
 ('Conseiller départemental', 740),
 ('Conseillère régionale', 581),
 ('Conseiller régional', 511),
 ("Maire délégué d'une commune associée ou d'une commune déléguée", 345),
 ('Député', 309),
 ('Députée', 221),
 ('Sénateur', 191),
 ("Membre d'une assemblée d'une collectivité territoriale d'outre-mer à statut particulier",
  150)]

In [18]:
# Some reminders about substring tests in python
string = 'Hello my friend'
'friend' in string

True

In [19]:
string.lower()

'hello my friend'

In [20]:
'maire' in string.lower()

False

In [21]:
# Defining a function to have a fuzzy condition of mayor
def is_endorsement_mayor(s):
    return 'maire' in s.lower()

In [22]:
# snake_case
# camelCase
# kebab-case
# CapitalizedCase
# CONSTANT_CASE

In [23]:
# Mayor ratio
mayor_endorsement_count = 0

for record in ENDORSEMENTS_DATA:
    if is_endorsement_mayor(record['mandate']):
        mayor_endorsement_count += 1

str(int((mayor_endorsement_count / len(ENDORSEMENTS_DATA)) * 100)) + '%'

'68%'

In [24]:
running_candidates = []
for name, endorsements_count in candidates_stats.items():
    if endorsements_count >= 500:
        running_candidates.append(name)
        
print('The running candidates are:')
for name in running_candidates:
    print('  -', name)
print()
print('We have', len(running_candidates), 'of them')

The running candidates are:
  - ARTHAUD Nathalie
  - DUPONT-AIGNAN Nicolas
  - HIDALGO Anne
  - JADOT Yannick
  - LASSALLE Jean
  - LE PEN Marine
  - MACRON Emmanuel
  - MÉLENCHON Jean-Luc
  - PÉCRESSE Valérie
  - POUTOU Philippe
  - ROUSSEL Fabien
  - ZEMMOUR Éric

We have 12 of them


In [25]:
# Test that something exists in a list
'MACRON Emmanuel' in running_candidates, 'PLIQUE Guillaume' in running_candidates

(True, False)

In [26]:
RELEVANT_ENDORSEMENTS_DATA = []

for record in ENDORSEMENTS_DATA:
    if (
        is_endorsement_mayor(record['mandate']) and
        record['candidate'] in running_candidates
    ):
        RELEVANT_ENDORSEMENTS_DATA.append(record)

len(RELEVANT_ENDORSEMENTS_DATA) / len(ENDORSEMENTS_DATA)

0.6045281894689805

In [27]:
# Updated mayor ratio
mayor_endorsement_count = 0

for record in RELEVANT_ENDORSEMENTS_DATA:
    if is_endorsement_mayor(record['mandate']):
        mayor_endorsement_count += 1

str(int((mayor_endorsement_count / len(ENDORSEMENTS_DATA)) * 100)) + '%'

'60%'

## 2. Loading the RNE CSV file

In [28]:
# `iso8859` and `latin1` are the same encoding but the former works on most windows, not the latter...
with open('../data/rne-maires.csv', encoding='iso8859') as f:
    reader = csv.DictReader(f, delimiter=';')
    RNE_RAW_DATA = list(reader)

In [29]:
len(RNE_RAW_DATA)

34921

In [30]:
RNE_RAW_DATA[0]

{'Code du département': '01',
 'Libellé du département': 'Ain',
 'Code de la collectivité à statut particulier': '',
 'Libellé de la collectivité à statut particulier': '',
 'Code de la commune': '01001',
 'Libellé de la commune': "L'Abergement-Clémenciat",
 "Nom de l'élu": 'BOULON',
 "Prénom de l'élu": 'Daniel',
 'Code sexe': 'M',
 'Date de naissance': '04/03/1951',
 'Code de la catégorie socio-professionnelle': '74',
 'Libellé de la catégorie socio-professionnelle': 'Ancien cadre',
 'Date de début du mandat': '18/05/2020',
 'Date de début de la fonction': '26/05/2020'}

In [31]:
# If you ever need to convert the file to JSON
# with open('../data/rne-maires.json', 'w') as f:
#    json.dump(RNE, f, indent=2, ensure_ascii=False)

In [32]:
RNE_DATA = []

for record in RNE_RAW_DATA:
    birth_year = record['Date de naissance'].split('/')[-1]
    # In our case, same as:
    birth_year = record['Date de naissance'][-4:]
    
    RNE_DATA.append({
        'surname': record["Nom de l'élu"],
        'name': record["Prénom de l'élu"],
        'department': record['Libellé du département'],
        'profession': record['Libellé de la catégorie socio-professionnelle'],
        'birth_year': birth_year,
        'gender': 'male' if record['Code sexe'] == 'M' else 'female',
        'age': 2022 - int(birth_year)
    })

In [33]:
RNE_DATA[0]

{'surname': 'BOULON',
 'name': 'Daniel',
 'department': 'Ain',
 'profession': 'Ancien cadre',
 'birth_year': '1951',
 'gender': 'male',
 'age': 71}

In [34]:
import statistics

In [35]:
ages = [record['age'] for record in RNE_DATA]
statistics.mean(ages)

61.33375332894247

In [36]:
sorted_ages = sorted(ages)
half = len(sorted_ages) // 2
sorted_ages[half]

63

In [37]:
4 in [1, 2, 3] # O(n)
4 in {1: 'one', 2: 'two', 3: 'three', 4: 'four'} # O(1)

True

In [38]:
def mayor_key(record):
    return (
        record['name'].lower(),
        record['surname'].lower(),
        record['department'].lower()
    )

In [39]:
print(RELEVANT_ENDORSEMENTS_DATA[0])
print(mayor_key(RELEVANT_ENDORSEMENTS_DATA[0]))
print(type(mayor_key(RELEVANT_ENDORSEMENTS_DATA[0])))

{'name': 'Gilles', 'surname': 'CORDIVAL', 'mandate': 'Maire', 'department': 'Aisne', 'endorsement_date': '2022-02-01T00:00:00', 'gender': 'male', 'candidate': 'ARTHAUD Nathalie'}
('gilles', 'cordival', 'aisne')
<class 'tuple'>


In [40]:
# This is how we would do in languages that don't have tuples
'+'.join(mayor_key(RELEVANT_ENDORSEMENTS_DATA[0]))

'gilles+cordival+aisne'

In [41]:
'cordival+gilles+aisne'.split('+')

['cordival', 'gilles', 'aisne']

In [42]:
# Reverse slice
'04/03/1951'[-4:]

'1951'

In [43]:
ENDORSEMENTS_INDEX = {}

for record in RELEVANT_ENDORSEMENTS_DATA:
    key = mayor_key(record)
    ENDORSEMENTS_INDEX[key] = record

In [44]:
# Trying to assess whether keys exists in a dictionary
dummy_dict = {1: 'one'}

if 2 not in dummy_dict:
    print('not found - in')

try:
    dummy_dict[2]
except KeyError:
    print('not found - except')
    
print(dummy_dict.get(2, 'not found - get'))

not found - in
not found - except
not found - get


In [45]:
# What we are doing schematically
left = [('Judy', 34), ('June', 55), ('Alix', 33)]
right = [('Michael', 'red'), ('Judy', 'yellow'), ('Alix', 'orange'), ('Roger', 'purple')]

# Using O(n²) loops
operations = 0
for name, age in left:
    for other_name, color in right:
        operations += 1
        if name == other_name:
            print('We matched', name, age, color)
            
print('We made %i operations' % operations)
            
# Using a dict (an order of magnitude faster)
print()
operations = 0
left_index = {}

for name, age in left:
    left_index[name] = age
    
for name, color in right:
    operations += 1
    other_age = left_index.get(name)
    
    if other_age is not None:
        print('We matched', name, other_age, color)
        
print('We made %i operations' % operations)

We matched Judy 34 yellow
We matched Alix 33 orange
We made 12 operations

We matched Judy 34 yellow
We matched Alix 33 orange
We made 4 operations


In [46]:
# Matching the RNE
matches_count = 0
for record in RNE_DATA:
    matching_endorsement = ENDORSEMENTS_INDEX.get(mayor_key(record))
    
    if matching_endorsement is not None:
        # Enriching the endorsement records with RNE ones (age, profession, etc.)
        matching_endorsement['age'] = record['age']
        matching_endorsement['birth_year'] = record['birth_year']
        matching_endorsement['profession'] = record['profession']
        matches_count += 1
        
matches_count, matches_count / len(RELEVANT_ENDORSEMENTS_DATA)

(7629, 0.9398792657385734)

In [47]:
RELEVANT_ENDORSEMENTS_DATA[0]

{'name': 'Gilles',
 'surname': 'CORDIVAL',
 'mandate': 'Maire',
 'department': 'Aisne',
 'endorsement_date': '2022-02-01T00:00:00',
 'gender': 'male',
 'candidate': 'ARTHAUD Nathalie',
 'age': 59,
 'birth_year': '1963',
 'profession': 'Professeur des écoles, instituteur et assimilé'}

### Research questions using RNE:

- Check the proportion of female mayors in the RNE vs. proportion of female mayors endorsing candidates.
- Check if there is a difference between candidates endorsed by male mayors vs. candidates endorsed by female mayors (*mind the ratio between both to avoid comparing absolute numbers in vain*)
- Compare mean, median, min, max and stdev of mayor age in RNE vs. endorsing mayors, i.e. are endorsing mayors a representative sample wrt. age of the total mayor population?
- Take a look at endorsing mayor professions, find what candidates are endorsed by teachers, for instance, or factory workers etc.

In [48]:
# Beware: some mayors were not matched in RNE, so some of the records don't have an age nor a profession
filtered_ages1 = []

for record in RELEVANT_ENDORSEMENTS_DATA:
    if 'age' in record:
        filtered_ages1.append(record['age'])
        
len(filtered_ages1), len(RELEVANT_ENDORSEMENTS_DATA), filtered_ages1[:5]

(7621, 8117, [59, 73, 59, 62, 66])

In [49]:
# Shorthand syntax for the above (filtered list comprehension)
filtered_ages2 = [record['age'] for record in RELEVANT_ENDORSEMENTS_DATA if 'age' in record]

In [50]:
filtered_ages1 == filtered_ages2

True

#### 1. Female mayor proportions

In [51]:
female_endorsements = [r for r in ENDORSEMENTS_DATA if r['gender'] == 'female']
female_relevant_endorsements = [r for r in RELEVANT_ENDORSEMENTS_DATA if r['gender'] == 'female']
female_mayors = [r for r in RNE_DATA if r['gender'] == 'female']

print('Endorsing', len(female_endorsements) / len(ENDORSEMENTS_DATA))
print('Endorsing candidates', len(female_relevant_endorsements) / len(RELEVANT_ENDORSEMENTS_DATA))
print('Mayors', len(female_mayors) / len(RNE_DATA))

Endorsing 0.2634989200863931
Endorsing candidates 0.1511642232351854
Mayors 0.20050972194381605


#### 2. Female endorsements proportion per candidate

In [52]:
candidates_to_genders = {}

for record in RELEVANT_ENDORSEMENTS_DATA:
    candidate = record['candidate']
    gender = record['gender']
    
    if candidate not in candidates_to_genders:
        candidates_to_genders[candidate] = Counter()
    
    candidates_to_genders[candidate][gender] += 1
        
    
for candidate, genders in candidates_to_genders.items():
    print(candidate, genders['female'] / genders['male'])

ARTHAUD Nathalie 0.24190064794816415
DUPONT-AIGNAN Nicolas 0.11299435028248588
HIDALGO Anne 0.26017699115044246
JADOT Yannick 0.233201581027668
LASSALLE Jean 0.10070671378091872
LE PEN Marine 0.19011406844106463
MACRON Emmanuel 0.16756756756756758
MÉLENCHON Jean-Luc 0.19112627986348124
PÉCRESSE Valérie 0.1826012058570198
POUTOU Philippe 0.20842572062084258
ROUSSEL Fabien 0.1791907514450867
ZEMMOUR Éric 0.12773109243697478


#### 3. Investigating age stats

In [53]:
def compute_age_stats(data):
    ages = [r['age'] for r in data if 'age' in r]
    
    return {
        'min': min(ages),
        'max': max(ages),
        'mean': statistics.mean(ages),
        'median': statistics.median(ages),
        'stdev': statistics.median(ages)
    }

In [54]:
rne_age_stats = compute_age_stats(RNE_DATA)
endorsement_age_stats = compute_age_stats(RELEVANT_ENDORSEMENTS_DATA)

print('For RNE:', rne_age_stats)
print('For relevant endorsements:', endorsement_age_stats)

For RNE: {'min': 21, 'max': 93, 'mean': 61.33375332894247, 'median': 63, 'stdev': 63}
For relevant endorsements: {'min': 22, 'max': 93, 'mean': 61.37317937278572, 'median': 63, 'stdev': 63}


#### 4. Investigating mayor professions

In [55]:
mayor_professions = Counter()

for record in RELEVANT_ENDORSEMENTS_DATA:
    profession = record.get('profession')
    
    if profession is None:
        continue
        
    mayor_professions[profession] += 1

mayor_professions.most_common(15)

[('Ancien cadre', 1298),
 ('Cadre de la fonction publique', 517),
 ('Ancien employé', 482),
 ('Agriculteur sur moyenne exploitation', 454),
 ("Cadre administratif et commercial d'entreprise", 414),
 ('Ancien agriculteur exploitant', 369),
 ("Ancien artisan, commerçant, chef d'entreprise", 341),
 ('Professeur, profession scientifique', 339),
 ('Profession libérale', 339),
 ('Ancienne profession intermédiaire', 333),
 ("Ingénieur et cadre technique d'entreprise", 272),
 ('Technicien', 188),
 ('Agriculteur sur petite exploitation', 178),
 ('Employé civil et agent de service de la fonction publique', 171),
 ('Artisan', 169)]

In [56]:
def could_be_teacher(label):
    return 'professeu' in label.lower()

In [57]:
could_be_teacher('Professeur des universités')

True

In [58]:
could_be_teacher('Magasinier professionnel')

False

In [59]:
teacher_labels = Counter()

for label, count in mayor_professions.items():
    if could_be_teacher(label):
        teacher_labels[label] = count
        
teacher_labels.most_common()

[('Professeur, profession scientifique', 339),
 ('Professeur des écoles, instituteur et assimilé', 125)]

In [60]:
# continue, break examples
for i in range(6):
    if i < 3:
        continue
        
    if i > 4:
        break
        
    print(i)

3
4
