In [1]:
import re
import time
import json
from pprint import pprint

import numpy as np
import pandas as pd
import spacy

In [2]:
# Load the cleaned Euroleaks data and the cleaned communiques data.
leaks = pd.read_csv('../data/euroleaks/cleaned.csv')
comms = pd.read_csv('../data/communiques/cleaned.csv')

# Build one corpus string: all speeches joined by spaces, then a single space,
# then all communique stories joined by spaces.
text = ' '.join([' '.join(leaks.speech), ' '.join(comms.story)])

In [3]:
# Load the small English spaCy pipeline, excluding the "ner" component.
nlp = spacy.load("en_core_web_sm", exclude=["ner"])

# Run the pipeline over the whole corpus string in a single call.
document = nlp(text)

# Read the Euroleaks-specific stopword lists from their JSON file.
with open('../data/euroleaks/stopwords.json', 'r') as stopwords_file:
    stopwords = json.loads(stopwords_file.read())

In [4]:
# tokenize, lemmatize, remove stopwords
#
# Keep the lemma of every token that
#   * carries one of the part-of-speech tags in `content_pos`,
#   * is not flagged as a stop word by spaCy (token.is_stop),
#   * does not literally match a Euroleaks-specific name or artifact, and
#   * is longer than one character (drops punctuation, but also 'I').
# Note: the filters test the surface form (token.text); the lemma is what is kept.
content_pos = {'ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB'}

# Merge both custom stopword collections into one set, built once, so that each
# token costs a single hash lookup instead of two linear scans.
# assumes stopwords['names'] and stopwords['artifacts'] are lists of strings — TODO confirm
custom_stopwords = set(stopwords['names']) | set(stopwords['artifacts'])

words = [token.lemma_ for sentence in document.sents for token in sentence
            if token.pos_ in content_pos
            and not token.is_stop
            and token.text not in custom_stopwords
            and len(token.text) > 1
        ]

- NLTK collocations how-to: https://www.nltk.org/howto/collocations.html
- `mi_like` score (used below to rank collocations): https://www.nltk.org/api/nltk.metrics.html#nltk.metrics.association.NgramAssocMeasures.mi_like

# trigrams

In [5]:
from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures

# Collect trigram candidates from the filtered lemma list.
finder = TrigramCollocationFinder.from_words(words)

# A trigram can score highly while being very rare: drop those seen fewer than 3 times.
finder.apply_freq_filter(3)

# Not every collocation is useful: filter on words containing 'valid' or 'thank'.
finder.apply_word_filter(lambda word: 'thank' in word or 'valid' in word)

# Map each remaining trigram to its mi_like score.
tgm = TrigramAssocMeasures()
collocations = dict(finder.score_ngrams(tgm.mi_like))

In [6]:
#collocations

In [7]:
# Select the trigrams returned by above_score for the mi_like measure with a
# 0.025 threshold, and list them in sorted order.
trigram_colloc = list(finder.above_score(tgm.mi_like, 0.025))
trigram_colloc.sort()
pprint(trigram_colloc)

[('euro', 'working', 'group'),
 ('non', 'performing', 'loan'),
 ('successful', 'conclusion', 'review')]


In [8]:
def apply_trigram_colloc(s, set_colloc):
    """Lower-case ``s`` and merge each trigram collocation into one token.

    Every occurrence of ``"w1 w2 w3"`` (single spaces) that starts and ends on a
    whitespace token boundary is rewritten as ``"w1_w2_w3"``. Occurrences that
    are only part of longer tokens (e.g. ``"neuro working groups"`` for the
    collocation ``("euro", "working", "group")``) are left untouched.

    Parameters
    ----------
    s : str
        Space-separated text; it is lower-cased before matching.
    set_colloc : iterable of 3-tuples of str
        The trigram collocations to merge. They are applied in iteration order.

    Returns
    -------
    str
        The lower-cased text with the collocations merged.
    """
    res = s.lower()
    for b1, b2, b3 in set_colloc:
        # Escape the words so they are matched literally, and require whitespace
        # (or the string edge) on both sides so only whole tokens are merged.
        pattern = r'(?<!\S)' + re.escape(f'{b1} {b2} {b3}') + r'(?!\S)'
        merged = f'{b1}_{b2}_{b3}'
        # A callable replacement keeps backslashes in `merged` from being
        # interpreted as regex group references.
        res = re.sub(pattern, lambda _match, merged=merged: merged, res)
    return res

In [9]:
# Re-join the lemmas into one space-separated string, merge each detected trigram
# into a single underscore-joined token, then split back into a token list.
# Note: apply_trigram_colloc lower-cases its input, so every entry of `words`
# is lower-case after this cell.
words = apply_trigram_colloc(' '.join(words), trigram_colloc).split()

In [10]:
# Sanity check: at least one trigram collocation was detected, and every detected
# collocation now occurs in `words` as a single underscore-joined token.
# The expected tokens are derived from trigram_colloc instead of being hard-coded,
# so the check stays valid when the corpus or the score threshold changes.
assert trigram_colloc, 'no trigram collocations were detected'
merged_tokens = set(words)
missing_trigrams = ['_'.join(trigram) for trigram in trigram_colloc
                    if '_'.join(trigram) not in merged_tokens]
assert not missing_trigrams, f'trigrams not merged into single tokens: {missing_trigrams}'

In [11]:
# Serialize the trigram collocations to a JSON string and write it to disk.
jsonized = json.dumps(trigram_colloc)
with open('../data/collocations/trigrams.json', 'w') as out_file:
    out_file.write(jsonized)

# bigrams

In [15]:
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

# Collect bigram candidates from the lemma list.
finder = BigramCollocationFinder.from_words(words)

# A bigram can score highly while being very rare: drop those seen fewer than 3 times.
finder.apply_freq_filter(3)

# Words handed to the word filter below (matched by exact equality).
word_filter = [
    'thank', 'start', 'starting', 'low', 'high', 'end',
    'stand', 'negative', 'gdp', 'little', 'floor',
]
finder.apply_word_filter(lambda word: word in word_filter)

# N-gram filter: fires when the first word is exactly 'take' or ends in 'ly'.
finder.apply_ngram_filter(lambda first, second: first == 'take' or first.endswith('ly'))

# Map each remaining bigram to its mi_like score.
bgm = BigramAssocMeasures()
collocations = dict(finder.score_ngrams(bgm.mi_like))

In [16]:
# Select the bigrams returned by above_score for the mi_like measure with a
# 0.75 threshold, and list them in sorted order.
bigram_colloc = list(finder.above_score(bgm.mi_like, 0.75))
bigram_colloc.sort()
pprint(bigram_colloc)

[('aide', 'memoire'),
 ('anti', 'corruption'),
 ('arm', 'length'),
 ('assistance', 'facility'),
 ('board', 'director'),
 ('brussel', 'group'),
 ('capital', 'control'),
 ('central', 'bank'),
 ('collective', 'bargaining'),
 ('common', 'ground'),
 ('current', 'arrangement'),
 ('debt', 'sustainability'),
 ('duration', 'mffa'),
 ('et', 'cetera'),
 ('euro', 'area'),
 ('financial', 'assistance'),
 ('financial', 'sector'),
 ('financial', 'stability'),
 ('govern', 'council'),
 ('greek', 'authority'),
 ('greek', 'government'),
 ('greek', 'people'),
 ('growth', 'friendly'),
 ('half', 'percent'),
 ('interest', 'rate'),
 ('labor', 'market'),
 ('member', 'state'),
 ('minimum', 'wage'),
 ('mission', 'chief'),
 ('monetary', 'union'),
 ('national', 'procedure'),
 ('press', 'conference'),
 ('primary', 'surplus'),
 ('prime', 'minister'),
 ('prior', 'action'),
 ('product', 'market'),
 ('quantitative', 'easing'),
 ('real', 'estate'),
 ('safety', 'net'),
 ('second', 'letter'),
 ('smp', 'bond'),
 ('state', '

In [18]:
#collocations

In [19]:
# Serialize the bigram collocations to a JSON string and write it to disk.
jsonized = json.dumps(bigram_colloc)
with open('../data/collocations/bigrams.json', 'w') as out_file:
    out_file.write(jsonized)

# Auxiliary data inspection

In [20]:
def search_term(term, df=None):
    """Print every speech that contains ``term``.

    For each matching row this prints a header line ``<speaker> (<dd/mm>):``,
    the full speech, and a blank line. Matching is a case-sensitive substring
    test on the ``speech`` column.

    Parameters
    ----------
    term : str
        Substring to look for.
    df : pandas.DataFrame, optional
        Frame with ``speaker``, ``speech`` and ``date`` columns. Defaults to the
        notebook-level ``leaks`` frame, so ``search_term(term)`` works as before.

    Returns
    -------
    None
    """
    if df is None:
        df = leaks
    for _, row in df.iterrows():
        speech = row['speech']
        # A missing speech is read by pandas as NaN (a float), on which `in`
        # raises TypeError; skip anything that is not a string.
        if not isinstance(speech, str) or term not in speech:
            continue
        date = pd.to_datetime(row['date']).strftime('%d/%m')
        print(f"{row['speaker']} ({date}):")
        print(speech)
        print()

In [22]:
#search_term('floor')