In [11]:
import re
import time
import json
from pprint import pprint

import numpy as np
import pandas as pd
import spacy

In [12]:
# Read the two cleaned.csv files (Euroleaks and communiques) into DataFrames.
leaks = pd.read_csv('../data/euroleaks/cleaned.csv')
comms = pd.read_csv('../data/communiques/cleaned.csv')

**REMARK**: to make sure that collocations are found separately (once for document=euroleaks and once for document=communiques), leaks and communiques are treated as two separate documents; so that the terms remain comparable, the collocations are joined at the end.
Also, some artifacts were filtered out (e.g. "thank thank"); for the rest, a threshold value is set.

# Euroleaks

In [13]:
# Concatenate every entry of the `speech` column into one space-separated string.
text = ' '.join(leaks.speech)

In [14]:
# Wall-clock start, used only for the timing report below.
t = time.time()

# Load the en_core_web_sm pipeline with the "ner" component excluded.
nlp = spacy.load("en_core_web_sm", exclude=["ner"])

# Elapsed time is converted from seconds to minutes and rounded to two decimals.
print(f'Time taken to load the spacy model: {round((time.time() - t) / 60, 2)} mins')

Time taken to load the spacy model: 0.01 mins


In [15]:
# Wall-clock start, used only for the timing report below.
t = time.time()

# Run the loaded pipeline over the whole concatenated Euroleaks text.
document = nlp(text)

# Elapsed time is converted from seconds to minutes and rounded to two decimals.
print(f'Time taken to run the spacy model: {round((time.time() - t) / 60, 2)} mins')

Time taken to run the spacy model: 0.4 mins


In [16]:
# import euroleaks-specific stopwords
# (the loaded object is indexed further down by the keys
# 'names', 'disfluency' and 'courtesy')
with open('../data/euroleaks/stopwords.json', 'r') as f:
    stopwords = json.load(f)

In [22]:
# Build the Euroleaks token list: lowercased lemmas of the tokens that
# survive the part-of-speech, stopword and length checks below.
def _keep_leaks_token(token):
    """Return True when `token` should contribute its lemma to `words`."""
    # only adjectives, adverbs, nouns, proper nouns and verbs are kept
    if token.pos_ not in {'ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB'}:
        return False
    # spaCy's own stopword flag
    if token.is_stop:
        return False
    # euroleaks-specific stopwords, matched on the lowercased surface form
    if (token.lower_ in stopwords['names']
            or token.lower_ in stopwords['disfluency']
            or token.lower_ in stopwords['courtesy']):
        return False
    # single-character lemmas are dropped
    return len(token.lemma_) > 1


words = [
    token.lemma_.lower()
    for sentence in document.sents
    for token in sentence
    if _keep_leaks_token(token)
]

- https://www.nltk.org/howto/collocations.html
- mi_like score: https://www.nltk.org/api/nltk.metrics.html#nltk.metrics.association.NgramAssocMeasures.mi_like

## trigrams

In [23]:
from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures


def _leaks_trigram_word_is_unwanted(w):
    """Return True when `w` contains one of the fragments excluded from trigrams."""
    # not every collocation is useful, so words containing these fragments are rejected
    return any(fragment in w for fragment in ('valid', 'thank', 'lack', 'particularly'))


finder = TrigramCollocationFinder.from_words(words)

# a trigram can be strongly associated yet very rare: drop those seen fewer than 3 times
finder.apply_freq_filter(3)
finder.apply_word_filter(_leaks_trigram_word_is_unwanted)

tgm = TrigramAssocMeasures()
# trigram -> mi_like score for everything that passed the filters
collocations = dict(finder.score_ngrams(tgm.mi_like))

In [24]:
#collocations

In [25]:
# Keep the trigrams that clear the hand-set mi_like threshold of 0.006,
# sorted alphabetically so the listing is easy to scan and compare.
leaks_trigram_colloc = sorted(finder.above_score(tgm.mi_like, 0.006))
pprint(leaks_trigram_colloc)

[('debt', 'sustainability', 'analysis'),
 ('euro', 'working', 'group'),
 ('international', 'monetary', 'fund'),
 ('low', 'interest', 'rate'),
 ('master', 'financial', 'assistance'),
 ('non', 'performing', 'loan'),
 ('sign', 'dotted', 'line'),
 ('successful', 'conclusion', 'review')]


In [26]:
def apply_trigram_colloc(s, set_colloc):
    """Merge every listed trigram in `s` into one underscore-joined token.

    Parameters
    ----------
    s : str
        Whitespace-separated tokens; the text is lowercased before matching.
    set_colloc : iterable of 3-tuples of str
        Trigrams to merge, e.g. ``('euro', 'working', 'group')``.

    Returns
    -------
    str
        The lowercased text in which each occurrence of ``'w1 w2 w3'`` is
        replaced by ``'w1_w2_w3'``.

    Only whole tokens are matched. A plain substring replace would also fire
    inside longer words (``'slow interest rates'`` would become
    ``'slow_interest_rates'``), creating tokens that never existed.
    """
    res = s.lower()
    for b1, b2, b3 in set_colloc:
        # (?<!\S) / (?!\S): the match must start and end at a token boundary
        # (whitespace or start/end of the string).
        pattern = rf'(?<!\S){re.escape(b1)} {re.escape(b2)} {re.escape(b3)}(?!\S)'
        joined = f'{b1}_{b2}_{b3}'
        # a callable replacement keeps any backslash in the words literal
        res = re.sub(pattern, lambda _match, _joined=joined: _joined, res)
    return res

In [27]:
# Join the tokens into one string, merge the selected trigrams into
# underscore-joined tokens, then split back into a token list.
words = apply_trigram_colloc(' '.join(words), leaks_trigram_colloc).split()

In [28]:
# Sanity check: two of the selected trigrams must now appear as single tokens.
assert 'euro_working_group' in words and 'successful_conclusion_review' in words

## bigrams

In [33]:
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

# Hand-picked words that must not take part in any Euroleaks bigram.
word_filter = [
    'thank', 'start', 'starting', 'low', 'high', 'end', 'stand',
    'negative', 'gdp', 'little', 'floor', 'think', 'staff', 'facility',
]


def _is_excluded_leaks_word(w):
    """Return True for spaCy stopwords, euroleaks stopwords and `word_filter` entries."""
    return (
        w in nlp.Defaults.stop_words
        or w in stopwords['names']
        or w in stopwords['disfluency']
        or w in stopwords['courtesy']
        or w in word_filter
    )


def _is_excluded_leaks_bigram(w1, w2):
    """Return True when the first word ends in 'ly' or is exactly 'take'."""
    return w1.endswith('ly') or w1 == 'take'


finder = BigramCollocationFinder.from_words(words)

# a bigram can be strongly associated yet very rare: drop those seen fewer than 3 times
finder.apply_freq_filter(3)
finder.apply_word_filter(_is_excluded_leaks_word)
finder.apply_ngram_filter(_is_excluded_leaks_bigram)

bgm = BigramAssocMeasures()
# bigram -> mi_like score for everything that passed the filters
collocations = dict(finder.score_ngrams(bgm.mi_like))

In [34]:
# Keep the bigrams that clear the hand-set mi_like threshold of 0.74,
# sorted alphabetically so the listing is easy to scan and compare.
leaks_bigram_colloc = sorted(finder.above_score(bgm.mi_like, 0.74))
pprint(leaks_bigram_colloc)

[('aide', 'memoire'),
 ('anti', 'corruption'),
 ('arm', 'length'),
 ('brussels', 'group'),
 ('capital', 'control'),
 ('central', 'bank'),
 ('collective', 'bargaining'),
 ('common', 'ground'),
 ('et', 'cetera'),
 ('financial', 'stability'),
 ('govern', 'council'),
 ('greek', 'authority'),
 ('greek', 'government'),
 ('greek', 'people'),
 ('growth', 'friendly'),
 ('half', 'percent'),
 ('interest', 'rate'),
 ('labor', 'market'),
 ('maximum', 'flexibility'),
 ('member', 'state'),
 ('minimum', 'wage'),
 ('mission', 'chief'),
 ('primary', 'surplus'),
 ('prime', 'minister'),
 ('prior', 'action'),
 ('product', 'market'),
 ('quantitative', 'easing'),
 ('real', 'estate'),
 ('safety', 'net'),
 ('second', 'letter'),
 ('smp', 'bond'),
 ('structural', 'reform'),
 ('technical', 'team'),
 ('uncharted', 'territory'),
 ('united', 'states')]


In [35]:
#collocations

# Communiques

In [45]:
# Concatenate every entry of the `story` column into one space-separated string.
# This rebinds `text`, which previously held the Euroleaks text.
text = ' '.join(comms.story)

In [46]:
# run spacy
# NOTE(review): the model is loaded a second time here with the same
# arguments as in the Euroleaks section; `nlp` and `document` are rebound.

nlp = spacy.load("en_core_web_sm", exclude=["ner"])

# Run the pipeline over the whole concatenated communiques text.
document = nlp(text)

In [47]:
# Build the communiques token list: lowercased lemmas of the tokens that
# survive the part-of-speech, stopword and length checks below.
def _keep_comms_token(token):
    """Return True when `token` should contribute its lemma to `words`."""
    # only adjectives, adverbs, nouns, proper nouns and verbs are kept
    if token.pos_ not in {'ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB'}:
        return False
    # spaCy's own stopword flag
    if token.is_stop:
        return False
    # single-character lemmas are dropped
    return len(token.lemma_) > 1


words = [
    token.lemma_.lower()
    for sentence in document.sents
    for token in sentence
    if _keep_comms_token(token)
]

- https://www.nltk.org/howto/collocations.html
- mi_like score: https://www.nltk.org/api/nltk.metrics.html#nltk.metrics.association.NgramAssocMeasures.mi_like

## trigrams

In [48]:
from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures


def _comms_trigram_word_is_unwanted(w):
    """Return True when `w` contains one of the fragments excluded from trigrams."""
    # not every collocation is useful, so words containing these fragments are rejected
    return any(fragment in w for fragment in ('valid', 'thank'))


finder = TrigramCollocationFinder.from_words(words)

# a trigram can be strongly associated yet very rare: drop those seen fewer than 3 times
finder.apply_freq_filter(3)
finder.apply_word_filter(_comms_trigram_word_is_unwanted)

tgm = TrigramAssocMeasures()
# trigram -> mi_like score for everything that passed the filters
collocations = dict(finder.score_ngrams(tgm.mi_like))

In [49]:
#collocations

In [50]:
# Keep the trigrams that clear the hand-set mi_like threshold of 0.125
# (a different cut-off from the one used for Euroleaks), sorted alphabetically.
comms_trigram_colloc = sorted(finder.above_score(tgm.mi_like, 0.125))
pprint(comms_trigram_colloc)

[('low', 'interest', 'rate'), ('successful', 'conclusion', 'review')]


In [51]:
def apply_trigram_colloc(s, set_colloc):
    """Merge every listed trigram in `s` into one underscore-joined token.

    Parameters
    ----------
    s : str
        Whitespace-separated tokens; the text is lowercased before matching.
    set_colloc : iterable of 3-tuples of str
        Trigrams to merge, e.g. ``('low', 'interest', 'rate')``.

    Returns
    -------
    str
        The lowercased text in which each occurrence of ``'w1 w2 w3'`` is
        replaced by ``'w1_w2_w3'``.

    Only whole tokens are matched. A plain substring replace would also fire
    inside longer words (``'slow interest rates'`` would become
    ``'slow_interest_rates'``), creating tokens that never existed.
    """
    res = s.lower()
    for b1, b2, b3 in set_colloc:
        # (?<!\S) / (?!\S): the match must start and end at a token boundary
        # (whitespace or start/end of the string).
        pattern = rf'(?<!\S){re.escape(b1)} {re.escape(b2)} {re.escape(b3)}(?!\S)'
        joined = f'{b1}_{b2}_{b3}'
        # a callable replacement keeps any backslash in the words literal
        res = re.sub(pattern, lambda _match, _joined=joined: _joined, res)
    return res

In [52]:
# Join the tokens into one string, merge the selected trigrams into
# underscore-joined tokens, then split back into a token list.
words = apply_trigram_colloc(' '.join(words), comms_trigram_colloc).split()

## bigrams

In [53]:
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

# Hand-picked words that must not take part in any communiques bigram.
word_filter = [
    'starting', 'stand', 'evening', 'mission',
    'main', 'like', 'kind', 'obligation',
]


def _is_excluded_comms_word(w):
    """Return True when `w` is one of the `word_filter` entries."""
    return w in word_filter


def _is_excluded_comms_bigram(w1, w2):
    """Return True when the first word ends in 'ly' or is exactly 'take'."""
    return w1.endswith('ly') or w1 == 'take'


finder = BigramCollocationFinder.from_words(words)

# a bigram can be strongly associated yet very rare: drop those seen fewer than 3 times
finder.apply_freq_filter(3)
finder.apply_word_filter(_is_excluded_comms_word)
finder.apply_ngram_filter(_is_excluded_comms_bigram)

bgm = BigramAssocMeasures()
# bigram -> mi_like score for everything that passed the filters
collocations = dict(finder.score_ngrams(bgm.mi_like))

In [54]:
# Keep the bigrams that clear the hand-set mi_like threshold of 1.1
# (a different cut-off from the one used for Euroleaks), sorted alphabetically.
comms_bigram_colloc = sorted(finder.above_score(bgm.mi_like, 1.1))
pprint(comms_bigram_colloc)

[('banking', 'union'),
 ('common', 'ground'),
 ('debt', 'sustainability'),
 ('duration', 'mffa'),
 ('euro', 'area'),
 ('european', 'semester'),
 ('greek', 'authority'),
 ('greek', 'government'),
 ('member', 'states'),
 ('monetary', 'union'),
 ('national', 'procedure'),
 ('press', 'conference'),
 ('prior', 'action'),
 ('state', 'play'),
 ('structural', 'reform'),
 ('track', 'record')]


In [55]:
#collocations

# Join collocations

In [56]:
# Report, in both directions, the trigrams selected for one document
# but not for the other.
for header, selected, other in (
    ('Trigrams that are in Euroleaks, but not in Communiques:',
     leaks_trigram_colloc, comms_trigram_colloc),
    ('\nTrigrams that are in Communiques, but not in Euroleaks:',
     comms_trigram_colloc, leaks_trigram_colloc),
):
    print(header)
    for colloc in selected:
        if colloc in other:
            continue  # shared by both documents, nothing to report
        print('\t', colloc)

Trigrams that are in Euroleaks, but not in Communiques:
	 ('debt', 'sustainability', 'analysis')
	 ('euro', 'working', 'group')
	 ('international', 'monetary', 'fund')
	 ('master', 'financial', 'assistance')
	 ('non', 'performing', 'loan')
	 ('sign', 'dotted', 'line')

Trigrams that are in Communiques, but not in Euroleaks:


In [57]:
# Union of both trigram lists; the set drops trigrams selected for both documents.
trigram_colloc = set(leaks_trigram_colloc + comms_trigram_colloc)

In [58]:
# Report, in both directions, the bigrams selected for one document
# but not for the other.
for header, selected, other in (
    ('Bigrams that are in Euroleaks, but not in Communiques:',
     leaks_bigram_colloc, comms_bigram_colloc),
    ('\nBigrams that are in Communiques, but not in Euroleaks:',
     comms_bigram_colloc, leaks_bigram_colloc),
):
    print(header)
    for colloc in selected:
        if colloc in other:
            continue  # shared by both documents, nothing to report
        print('\t', colloc)

Bigrams that are in Euroleaks, but not in Communiques:
	 ('aide', 'memoire')
	 ('anti', 'corruption')
	 ('arm', 'length')
	 ('brussels', 'group')
	 ('capital', 'control')
	 ('central', 'bank')
	 ('collective', 'bargaining')
	 ('et', 'cetera')
	 ('financial', 'stability')
	 ('govern', 'council')
	 ('greek', 'people')
	 ('growth', 'friendly')
	 ('half', 'percent')
	 ('interest', 'rate')
	 ('labor', 'market')
	 ('maximum', 'flexibility')
	 ('member', 'state')
	 ('minimum', 'wage')
	 ('mission', 'chief')
	 ('primary', 'surplus')
	 ('prime', 'minister')
	 ('product', 'market')
	 ('quantitative', 'easing')
	 ('real', 'estate')
	 ('safety', 'net')
	 ('second', 'letter')
	 ('smp', 'bond')
	 ('technical', 'team')
	 ('uncharted', 'territory')
	 ('united', 'states')

Bigrams that are in Communiques, but not in Euroleaks:
	 ('banking', 'union')
	 ('debt', 'sustainability')
	 ('duration', 'mffa')
	 ('euro', 'area')
	 ('european', 'semester')
	 ('member', 'states')
	 ('monetary', 'union')
	 ('nation

In [29]:
# Union of both bigram lists; the set drops bigrams selected for both documents.
bigram_colloc = set(leaks_bigram_colloc + comms_bigram_colloc)

## dump to json

In [35]:
# Serialise the merged trigram set as a sorted JSON list and write it to disk.
# Sorting makes the file content independent of set iteration order.
jsonized = json.dumps(sorted(trigram_colloc))
with open('../data/collocations/trigrams.json', 'w') as f:
    f.write(jsonized)

In [36]:
# Serialise the merged bigram set as a sorted JSON list and write it to disk.
# Sorting makes the file content independent of set iteration order.
jsonized = json.dumps(sorted(bigram_colloc))
with open('../data/collocations/bigrams.json', 'w') as f:
    f.write(jsonized)

# Auxiliary data inspection

In [37]:
def search_term(term, frame=None):
    """Print every speech containing `term`, preceded by its speaker and date.

    Parameters
    ----------
    term : str
        Substring to look for in the `speech` column (case-sensitive `in` test).
    frame : pandas.DataFrame, optional
        Table with `speaker`, `date` and `speech` columns. When omitted, the
        notebook-level `leaks` frame is searched, as before.

    Returns
    -------
    None
        For each match, the output is a ``speaker (dd/mm):`` header line,
        the speech text, and a blank line.
    """
    if frame is None:
        frame = leaks
    for _, row in frame.iterrows():
        speech = row.speech
        # A non-string cell (e.g. NaN from an empty CSV field) cannot contain
        # the term; skip it rather than raise TypeError on the `in` test.
        if not isinstance(speech, str) or term not in speech:
            continue
        date = pd.to_datetime(row.date).strftime('%d/%m')
        print(f'{row.speaker} ({date}):')
        print(speech)
        print()

In [41]:
#search_term('Alex')