## scrape Euroleaks

In [333]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re

In [334]:
# each group has a different html structure
leaks = {1: [
            'feb24eurogroup/',
            'mar17ewg/',
            'apr1ewg/',
            'apr24eurogroup/',
            'may11eurogroup/',
            'jun18eurogroup/',
            'jun22eurogroup/',
            'jun24eurogroup/',
            'jun25eurogroup-part1/',
            'jun27eurogroup/',
            'jun30eurogroup/'
            ],
         2: ['jul1eurogroup/'],
         3: ['jun25eurogroup-part-2/']
        }

URLs = {
    group: [f'https://euroleaks.diem25.org/leaks/{leak}'
        for leak in urls_] for group, urls_ in leaks.items() 
}

In [335]:
def format_timestamp(s):
    if re.compile('\(\d{2}:\d{2}:\d{2}\)').match(s):
        return pd.to_datetime(s, format='(%H:%M:%S)')
    elif re.compile('\(\d{2}:\d{2}\)').match(s):
        return pd.to_datetime(s, format='(%M:%S)', errors='coerce')
    else:
        return np.datetime64('NaT')

In [329]:
dfs = []

for group, URLs_ in URLs.items():
    
    for URL in URLs_:
    
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')

        date = soup.find(class_='post-date').find(class_='meta-text').text.strip()
    
        col_speaker = []
        col_speech = []
        col_timestamp = []
        
        if group == 1:

            interventions = soup.find(id='transcript_without_translation').find_all(class_='intervention')

            for intervention in interventions:
                # get speaker name
                col_speaker.append(intervention.find(class_='speaker-name').text)
                # get speech
                col_speech.append(intervention.find(class_='speech').text)
                # get timestamp
                timestamp_ = intervention.find(class_='timestamp')
                col_timestamp.append(format_timestamp(timestamp_.text) if timestamp_ else np.datetime64('NaT'))

        elif group == 2:
            
            paragraphs = soup.find(id='transcript_without_translation').find_all('p')
            
            last_speaker = np.nan

            for p in paragraphs: 
    
                have_speaker = False
                speech = ''
    
                timestamp_ = p.find(class_='timestamp')
    
                for line in p.text.split('\n'):
                    if line.endswith(':'):
                        if have_speaker:
                            col_speaker.append(last_speaker)
                            col_speech.append(speech.strip())
                            col_timestamp.append(format_timestamp(timestamp_.text) if timestamp_ else np.datetime64('NaT'))
                        else:
                            have_speaker = not have_speaker
                
                        last_speaker = re.search('^(\w* *)*', line).group()
                        speech = '' 
                    else:
                        speech = '\n'.join((speech, line.strip()))
            
                col_speaker.append(last_speaker)
                col_speech.append(speech.strip())
                col_timestamp.append(format_timestamp(timestamp_.text) if timestamp_ else np.datetime64('NaT'))

        elif group == 3:
            
            paragraphs = soup.find(id='transcript_without_translation').find_all('p')
                
            last_speaker = np.nan
            timestamp = np.datetime64('NaT')

            for p in paragraphs:
    
                if p.text.endswith(':'):
                    last_speaker = re.search('^(\w* *)*', p.text).group()
                    timestamp = re.search('\(\d{2}:\d{2}\)', p.text).group() if re.search('\(\d{2}:\d{2}\)', p.text) else np.nan
        
                else:
                    col_speaker.append(last_speaker)
                    col_speech.append(p.text)
                    if not pd.isnull(timestamp):
                        col_timestamp.append(format_timestamp(timestamp))
                        timestamp = np.datetime64('NaT')
                    else:
                        col_timestamp.append(np.datetime64('NaT'))
                
                
        dfs.append(pd.DataFrame({
            'speaker': col_speaker,
            'speech': col_speech,
            'timestamp': col_timestamp,
            'date': pd.to_datetime(np.repeat(date, len(col_timestamp)))
            }))

df = pd.concat(dfs, ignore_index=True)

In [338]:
df.head()

Unnamed: 0,speaker,speech,timestamp,date
0,Jeroen Dijsselbloem,… of your responses or questions. And can I fi...,1900-01-01 00:00:00,2015-02-24
1,Speaker 2,"Uh, yes, uh, thank you, Jeroen. Well, uh, comm...",1900-01-01 00:00:10,2015-02-24
2,Michael Noonan,Michael Noonan.,1900-01-01 00:01:27,2015-02-24
3,Speaker 2,"Uh, it is therefore regrettable that, uh-",1900-01-01 00:01:29,2015-02-24
4,Speaker 3,Has entered the conference.,1900-01-01 00:01:33,2015-02-24


In [339]:
df.tail()

Unnamed: 0,speaker,speech,timestamp,date
1517,Speaker 2,[inaudible].Jeroen Dijsselb,NaT,2015-06-25
1518,Speaker 2,"That will be my proposal for the coming hours,...",NaT,2015-06-25
1519,Yanis Varoufakis,They had to wait for half an hour for this ver...,1900-01-01 00:08:02,2015-06-25
1520,Speaker 4,Hmmm?,NaT,2015-06-25
1521,Yanis Varoufakis,They had to wait for half an hour for this ver...,1900-01-01 00:08:05,2015-06-25


## clean Euroleaks

### todo
- need to check that timestamp is monotonically increasing
- derive speech duration from timestamp (after checking and correcting if need be)
- fill missing timestamps by taking time between the next ones, then allocating to ones in between with missing based on word/character count
- check speakers (then maybe exclude "Computer")
- clean "[inaudible]", "[crosstalk]", "[foreign language]", also (*)
- check for special character to see for other things that need to be potentially cleaned


In [343]:
df.speaker.unique()

array(['Jeroen Dijsselbloem', 'Speaker 2', 'Michael Noonan', 'Speaker 3',
       'Pierre Moscovici', 'Mario Draghi', 'Wolfgang Schäuble',
       'Christine Lagarde', 'Yanis Varoufakis', 'Yanis [not Varoufakis]',
       'Luis de Guindos', 'Maria Luís', 'Marco Buti', 'Thomas Wieser',
       'Declan Costello', 'Computer', 'Speaker 5', 'Benoit Couré',
       'Paul Thomsen', 'Greek Representative', 'Speaker 9', 'Thomas',
       'Speaker 10', 'Speaker 1', 'Benoit Cœuré', 'Nikos Theocarakis',
       'Nikos Theocarakis ', 'Irina', 'Irana', 'Nabil', 'Speaker 4',
       'Tooma', 'Tropa', 'Marco Buti ', 'Speaker 6', 'Ricci', 'Speaker 7',
       'Speaker 8', 'Hans', 'Speaker 11', 'Speaker 12', 'Speaker 13',
       'Speaker 14', 'Paul', 'Paul:', 'Klaus Regling',
       'Yanis Varoufakis:', 'Peter Kažimír', 'Martin',
       'Hans Jörg Schelling', 'Luis de Guindos ', 'Dušan Mramor',
       'Michel Sapin', 'Pier Carlo Padoan', 'Speaker 19:',
       'Edward Scicluna', 'Rimantas Šadžius', 'Jeroen Dijsse

## auxiliary data

- map speaker to function (finance minister of country, ECB, IMF)
- map date to total duration of call, to be able to get speech duration for last speech entry for a date