# Preprocessing
Preprocessing needs to be done appropriately for each analysis separately. This notebook serves as a general template.
Also, I use it to compute an approximate number of stopowords removed and tokens after preprocessing is applied.

In [38]:
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Euroleaks

In [39]:
# read data
df = pd.read_csv('../data/euroleaks/cleaned.csv')

# stopwords
with open('../data/euroleaks/stopwords.json', 'r') as f:
    stopwords = json.load(f)
    
# collocations
def apply_trigram_colloc(s, set_colloc):
    res = s.lower()
    for b1,b2,b3 in set_colloc:
        res = res.replace(f'{b1} {b2} {b3}', f'{b1}_{b2}_{b3}')
    return res

def apply_bigram_colloc(s, set_colloc):
    res = s.lower()
    for b1,b2 in set_colloc:
        res = res.replace(f'{b1} {b2}', f'{b1}_{b2}')
    return res

with open('../data/collocations/trigrams.json', 'r') as f:
    trigram_colloc = json.load(f)

with open('../data/collocations/bigrams.json', 'r') as f:
    bigram_colloc = json.load(f)

In [40]:
# join speech of consecutive rows with same speaker

df_squeezed = pd.DataFrame(columns=['speaker','speech','date'])

previous_speaker = None
speech = None
previous_date = None

for index, data in df.iterrows():
    
    if not previous_speaker and not speech and not previous_date:
        previous_speaker = data.speaker
        speech = data.speech
        previous_date = data.date
    elif data.speaker == previous_speaker and previous_date == data.date:
        speech = ' '.join((speech, data.speech))
    else:
        df_squeezed.loc[len(df_squeezed.index)] = [
            previous_speaker,
            speech,
            previous_date
        ]
        previous_speaker = data.speaker
        speech = data.speech
        previous_date = data.date

In [41]:
with open('../data/euroleaks/name_to_entity.json', 'r') as f:
    speaker_to_entity = json.load(f)
    
# add column for entity, label unassigned as Unidentified
df_squeezed['entity'] = df_squeezed.speaker.apply(lambda s: speaker_to_entity[s] if s in speaker_to_entity.keys() else 'Unidentified')

In [42]:
df_squeezed.head()

Unnamed: 0,speaker,speech,date,entity
0,jeroen dijsselbloem,… of your responses or questions. And can I fi...,2015-02-24 00:00:00,EG President
1,speaker 2,"Uh, yes, uh, thank you, Jeroen. Well, uh, comm...",2015-02-24 00:00:00,Unidentified
2,michael noonan,Michael Noonan.,2015-02-24 00:00:00,Ireland
3,speaker 2,"Uh, it is therefore regrettable that, uh- … th...",2015-02-24 00:00:00,Unidentified
4,pierre moscovici,"Um, okay colleagues. Um, in general I would sa...",2015-02-24 00:00:00,European Commission


In [43]:
print(f'There are {df.speech.size} original speech entries, but after "squeezing" there are {df_squeezed.speech.size} speeches.')

There are 1489 original speech entries, but after "squeezing" there are 736 speeches.


### spacy

In [44]:
import spacy

nlp = spacy.load('en_core_web_sm', exclude=["ner"])

In [45]:
def filter_token(token):
    return token.pos_ in {'ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB'}\
        and not token.is_stop\
        and not token.lower_ in stopwords['names']\
        and not token.lower_ in stopwords['disfluency']\
        and not token.lemma_ in stopwords['courtesy']\
        and len(token.lemma_) > 1

In [46]:
text = ' '.join(df_squeezed.speech.values)

In [47]:
tokenized_text = nlp(text)

In [48]:
print(f'Tokens after tokenization: {len(tokenized_text)}')

Tokens after tokenization: 108345


In [49]:
words = [token.lemma_.lower() for token in tokenized_text if filter_token(token)]

In [51]:
# apply collocations
words = apply_bigram_colloc(apply_trigram_colloc(' '.join(words), trigram_colloc), bigram_colloc).split()

In [52]:
print(f'Tokens after filtering on POS, removing stopwords and punctuation, and joining collocations: {len(words)}')

Tokens after filtering on POS, removing stopwords and punctuation, and joining collocations: 28766


## Communiques

In [54]:
communiques = pd.read_csv('../data/communiques/cleaned.csv')

In [56]:
communiques.head()

Unnamed: 0,date,title,story
0,2015-02-12,Remarks by Jeroen Dijsselbloem at the press co...,"Good evening. Today, as you well know, we had ..."
1,2015-02-16,Remarks by Jeroen Dijsselbloem at the press co...,Good evening everyone and thanks for joining t...
2,2015-02-20,Eurogroup statement on Greece,The Eurogroup reiterates its appreciation for ...
3,2015-02-20,Remarks by Jeroen Dijsselbloem at the press co...,Good evening and welcome to this press confere...
4,2015-02-24,Eurogroup statement on Greece,The Eurogroup today discussed the first list o...


In [57]:
print(f'There are {communiques.title.size} press release.')

There are 18 press release.


### spacy

In [58]:
def filter_token(token):
    return token.pos_ in {'ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB'}\
        and not token.is_stop\
        and len(token.lemma_) > 1

In [59]:
text = ' '.join(communiques.story.values)

In [60]:
tokenized_text = nlp(text)

In [61]:
print(f'Tokens after tokenization: {len(tokenized_text)}')

Tokens after tokenization: 9574


In [62]:
words = [token.lemma_.lower() for token in tokenized_text if filter_token(token)]

In [63]:
# apply collocations
words = apply_bigram_colloc(apply_trigram_colloc(' '.join(words), trigram_colloc), bigram_colloc).split()

In [64]:
print(f'Tokens after filtering on POS, removing stopwords and punctuation, and joining collocations: {len(words)}')

Tokens after filtering on POS, removing stopwords and punctuation, and joining collocations: 3649
