# Stopwords

In [1]:
import numpy as np
import pandas as pd
import re
import json

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../data/euroleaks/cleaned.csv')

## make stopwords out of speaker names

In [3]:
with open('../data/euroleaks/amend_names.json', 'r') as f:
    amend_names = json.load(f)
    
# make stopwords out of names
stopnames = []

for names in amend_names.values():
    for name in names:
        if not re.search('\[.*\]', name):
            stopnames += name.split(' ')
            
for name in df.speaker.unique():
    if 'speaker' not in name:
        stopnames += name.split(' ')

stopnames = set(stopnames)
        
stopnames.remove('greek')
stopnames.remove('representative')
stopnames.remove('de')
stopnames.remove('van')
stopnames.add('alex')
stopnames.add('dusan')

In [4]:
stopnames

{'albuquerque',
 'alex',
 'alexander',
 'antonio',
 'benoit',
 'benoît',
 'buti',
 'carlo',
 'christine',
 'costello',
 'couré',
 'cœuré',
 'declan',
 'dijsselbloem',
 'draghi',
 'dusan',
 'dušan',
 'edward',
 'georgiades',
 'guindos',
 'hans',
 'harris',
 'irana',
 'irina',
 'jeroen',
 'johan',
 'jörg',
 'jānis',
 'kazimir',
 'kažimír',
 'kian',
 'klaus',
 'lagarde',
 'luca',
 'luis',
 'luís',
 'male',
 'marco',
 'maria',
 'mario',
 'martin',
 'mathias',
 'michael',
 'michel',
 'moscovici',
 'mramor',
 'nabil',
 'nikos',
 'noonan',
 'overtveldt',
 'padoan',
 'paul',
 'peter',
 'pier',
 'pierre',
 'poul',
 'regling',
 'reirs',
 'ricci',
 'rimantas',
 'sapin',
 'schauble',
 'schelling',
 'schäuble',
 'scicluna',
 'steffen',
 'stubb',
 'theocarakis',
 'thomas',
 'thomsen',
 'tooma',
 'translator',
 'tropa',
 'varoufakis',
 'wieser',
 'wolfgang',
 'yanis',
 'šadžius'}

In [5]:
# back of envelope calculation for how many stopnames there are
np.sum(pd.Series(' '.join(df.speech).lower().split()).apply(lambda s:
    re.sub(r'[^\w\s]','',s) in stopnames
))

758

## transription artifacts to remove

In [6]:
artifacts = [
    'erm', # synonym for hmm in https://euroleaks.diem25.org/leaks/mar17ewg/
]

In [7]:
# back of envelope calculation for how many artifacts there are
np.sum(pd.Series(' '.join(df.speech).lower().split()).apply(lambda s:
    re.sub(r'[^\w\s]','',s) == 'erm'
))

273

## weak speaker discriminants
These are words that, when speakres are considered as documents, have high document frequency. In other words (hehe), these are words that all speakers use and therefore we can put them to little use for distinguishing between speakers.

In [8]:
import spacy

nlp = spacy.load('en_core_web_sm')

spacy_stopwords = list(nlp.Defaults.stop_words)

In [9]:
grouped = df.drop(columns=['timestamp','date']).groupby('speaker').apply(lambda s: ' '.join(s.speech))

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    stop_words=list(stopnames)+artifacts+spacy_stopwords
) # does tokenozation under the hood

X = vectorizer.fit_transform(grouped)



In [11]:
X.shape

(59, 4591)

In [12]:
# most frequent words by document frequency

words = np.array(vectorizer.get_feature_names())
freqs = np.mean(X.toarray() > 0, axis=0)

sort_ix = np.argsort(freqs)[::-1] # descendingly
words = words[sort_ix]
freqs = freqs[sort_ix]

In [13]:
list(words[:10])

['uh',
 'thank',
 'um',
 'greek',
 'think',
 'said',
 'point',
 'institutions',
 'need',
 'know']

In [14]:
pd.Series(freqs).describe(percentiles=[.25,.5,.75,.90,.95,.99])

count    4591.000000
mean        0.052409
std         0.070798
min         0.016949
25%         0.016949
50%         0.016949
75%         0.050847
90%         0.118644
95%         0.186441
99%         0.355932
max         0.813559
dtype: float64

In [15]:
np.sum(freqs > 0.5)

14

In [16]:
np.sum(freqs > 0.1)

648

In [17]:
# back of envelope calculation for how many stopnames there are
np.mean(pd.Series(' '.join(df.speech).split()).apply(lambda s:
    re.sub(r'[^\w\s]','',s) in words[freqs > 0.1]
))

0.24737143738841008

In [18]:
stopwords = {
    'names': list(stopnames),
    'artifacts': list(artifacts),
    'weak_speaker_discriminants_0.5': list(words[freqs > 0.5]),
    'weak_speaker_discriminants_0.1': list(words[freqs > 0.1])
}

In [19]:
# dump to json
import json

json = json.dumps(stopwords)
with open('../data/euroleaks/stopwords.json', 'w') as f:
    f.write(json)