# Analyze speakers

In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../data/euroleaks/parsed.csv')
df.head()

Unnamed: 0,speaker,speech,timestamp,date
0,Jeroen Dijsselbloem,… of your responses or questions. And can I fi...,1900-01-01 00:00:00,2015-02-24 00:00:00
1,Speaker 2,"Uh, yes, uh, thank you, Jeroen. Well, uh, comm...",1900-01-01 00:00:10,2015-02-24 00:00:00
2,Michael Noonan,Michael Noonan.,1900-01-01 00:01:27,2015-02-24 00:00:00
3,Speaker 2,"Uh, it is therefore regrettable that, uh-",1900-01-01 00:01:29,2015-02-24 00:00:00
4,Speaker 3,Has entered the conference.,1900-01-01 00:01:33,2015-02-24 00:00:00


### handle missing speaker

In [3]:
df[df.speaker.isnull()]

Unnamed: 0,speaker,speech,timestamp,date
1394,,"Jeroen Dijsselbloem\nNow, let’s see who is on ...",,2015-07-01 00:00:00


In [4]:
df.speaker.loc[df.speaker.isnull()] = 'jeroen dijsselbloem'

## inspect unique speakers

In [5]:
# strip and make lowercase
df.speaker = df.speaker.apply(lambda s: s.strip().lower() if not pd.isnull(s) else s)

In [6]:
# display all the names

for s in df.speaker.unique():
    if 'speaker' not in s:
        print(s)

jeroen dijsselbloem
michael noonan
pierre moscovici
mario draghi
wolfgang schäuble
christine lagarde
yanis varoufakis
yanis [not varoufakis]
luis de guindos
maria luís
marco buti
thomas wieser
declan costello
computer
benoit couré
paul thomsen
greek representative
thomas
benoit cœuré
nikos theocarakis
irina
irana
nabil
tooma
tropa
ricci
hans
paul
klaus regling
peter kažimír
martin
hans jörg schelling
dušan mramor
michel sapin
pier carlo padoan
edward scicluna
rimantas šadžius
poul thomsen
alexander stubb
inaudible
yanis varoufakis [privately]
johan van overtveldt
maria luís albuquerque
benoît cœuré
kian
male
group
johan
maria luis albuquerque
harris georgiades
translator
michel
luis pierre
luis
peter kazimir
wolfgang schauble
wolfgang


### drop some rows
For instance those which transcribe words said by computer.

In [7]:
#df[df.speech == 'Has entered the conference.']

In [8]:
df = df[df.speech != 'Has entered the conference.']

In [9]:
#df[df.speaker == 'group']

In [10]:
df = df[df.speaker != 'group']

In [11]:
#df[df.speaker == 'inaudible']

In [12]:
df = df[df.speaker != 'inaudible']

In [13]:
#for row in df[df.speaker == 'inaudible'].iterrows():
#    print(row[1].speech)
#    print()

Unidentified speakers:

In [14]:
search_term = 'speaker'

for speaker in df.speaker.apply(lambda s: s.strip().lower() if not pd.isnull(s) else s).unique():
    if not pd.isnull(speaker) and search_term in speaker:
        print(speaker)
        pass

speaker 2
speaker 5
speaker 9
speaker 10
speaker 6
speaker 7
speaker 8
speaker 11
speaker 12
speaker 13
speaker 14
speaker 19
speaker 1
speaker 3
unidentified speaker
speaker 16
speaker 20
speaker 4
speaker 17
speaker 18
speaker 21
speaker


### manually construct mapping of different version of the same name to that name
Note that there is Thomas Wieser and just Thomas, two distinct persons.

In [15]:
amend_names = {
    'wolfgang schäuble': [
        'wolfgang schäuble',
        'wolfgang schauble',
        'wolfgang'
    ],
    'peter kažimír': [
        'peter kažimír',
        'peter kazimir'
    ],
    'michel sapin': [
        'michel sapin',
        'michel'
    ],
    'maria luís albuquerque': [
        'maria luís albuquerque',
        'maria luís',
        'maria luis albuquerque'
    ],
    'johan van overtveldt': [
        'johan van overtveldt',
        'johan'
    ],
    'benoît cœuré': [
        'benoît cœuré',
        'benoit couré',
        'benoit cœuré'
    ],
    'hans jörg schelling': [
        'hans jörg schelling',
        'hans'
    ],
    'poul mathias thomsen': [
        'paul thomsen',
        'paul',
        'poul thomsen'
    ],
    'yanis varoufakis': [
        'yanis varoufakis',
        'yanis varoufakis [privately]'
    ],
    'luis de guindos': [
        'luis de guindos',
        'luis'
    ],
    'irina': [
        'irina',
        'irana'
    ],
    'jānis reirs': [
        'yanis [not varoufakis]'
    ],
    'luca antonio ricci': [
        'ricci'
    ],
    'thomas steffen': [
        'thomas'
    ]
}

In [20]:
# dump to json
import json

json = json.dumps(amend_names)
with open('../data/euroleaks/amend_names.json', 'w') as f:
    f.write(json)

In [16]:
# invert dict
amend_names_inv = {value: key for key,values in amend_names.items() for value in values}

In [17]:
# amend speaker names
df.speaker = df.speaker.apply(lambda s: amend_names_inv[s] if s in amend_names_inv.keys() else s)

In [18]:
for speaker in df.speaker.unique():
    if 'speaker' not in speaker:
        print(speaker)
        pass

jeroen dijsselbloem
michael noonan
pierre moscovici
mario draghi
wolfgang schäuble
christine lagarde
yanis varoufakis
jānis reirs
luis de guindos
maria luís albuquerque
marco buti
thomas wieser
declan costello
benoît cœuré
poul mathias thomsen
greek representative
thomas steffen
nikos theocarakis
irina
nabil
tooma
tropa
luca antonio ricci
hans jörg schelling
klaus regling
peter kažimír
martin
dušan mramor
michel sapin
pier carlo padoan
edward scicluna
rimantas šadžius
alexander stubb
johan van overtveldt
kian
male
harris georgiades
translator
luis pierre


Manually map speaker to entity.

In [32]:
# TODO maybe update after you get an answer to your email

# missing:
# estonia (Maris Lauri, Sven Sester)
# luxembourg (Pierre Gramegna)


speaker_to_entity = {
    'jeroen dijsselbloem': 'eurogroup president', # the netherlands
    'michael noonan': 'ireland',
    'pierre moscovici': 'european commission',
    'mario draghi': 'ecb',
    'wolfgang schäuble': 'germany',
    'thomas steffen': 'germany', # State Secretary at the Federal Ministry of Finance under Schauble
    'christine lagarde': 'imf',
    'yanis varoufakis': 'greece',
    'luis de guindos': 'spain',
    'maria luís albuquerque': 'portugal',
    'marco buti': 'european commission',
    'thomas wieser': 'efc', # economic and financial committee
    'declan costello': 'european commission', #dg ecfin
    'benoît cœuré': 'ecb',
    'poul mathias thomsen': 'imf',
    'greek representative': 'greece',
    'nikos theocarakis': 'greece',
    'hans jörg schelling': 'austria',
    'klaus regling': 'esm', # head of european stability mechanism
    'peter kažimír': 'slovakia',
    'dušan mramor': 'slovenia',
    'michel sapin': 'france',
    'translator': 'france',
    'pier carlo padoan': 'italy',
    'edward scicluna': 'malta',
    'rimantas šadžius': 'lithuania',
    'alexander stubb': 'finland', # from May 29
    'tooma': 'finland', # based on saying they have two and a half weeks until elections on April 1
    'johan van overtveldt': 'belgium',
    'harris georgiades': 'cyprus',
    'luis pierre': 'european commission',
    'jānis reirs': 'latvia',
    'luca antonio ricci': 'imf'
}

In [33]:
# dump to json
import json

json = json.dumps(speaker_to_entity)
with open('../data/euroleaks/name_to_entity.json', 'w') as f:
    f.write(json)

#### still don't know who these people represent...

In [34]:
for speaker in df.speaker.unique():
    if not ('speaker' in speaker or speaker in speaker_to_entity.keys()):
        print(speaker)
        print(df[df.speaker == speaker].wordcount.sum())
        print()
        pass

irina
27

nabil
87

tropa
254

martin
73

kian
165

male
639



In [35]:
df[df.speaker == 'nabil'].date.unique()

array(['2015-04-01 00:00:00'], dtype=object)

In [36]:
for row in df[df.speaker == 'nabil'].iterrows():
    print(row[1].speech)
    print()

[inaudible 00:45:06] It’s [Nabil 00:45:08]. Yes, uh, my questions, um, because, um, uh, we have been talking about Nikos measures which were submitted by the Greek side. Um, frankly they have been already news in a lot of m- media, so, um, it would be really very helpful to get this list of measures so that we actually know what we are talking about. And I understand from Nikos that he’s going to send to the members of the, the EWG which would be very helpful.

