# Stopwords

In [1]:
import numpy as np
import pandas as pd
import re
import json

import matplotlib.pyplot as plt

In [2]:
# Load the cleaned Euroleaks table; the `speaker` and `speech` columns
# are the ones used in the cells below.
cleaned_csv_path = '../data/euroleaks/cleaned.csv'
df = pd.read_csv(cleaned_csv_path)

## make stopwords out of speaker names

In [3]:
# Load the name-amendment mapping; its values are iterated below as
# collections of name strings.
with open('../data/euroleaks/amend_names.json', 'r') as f:
    amend_names = json.load(f)

# make stopwords out of names: every whitespace-separated token of a name
stopnames = set()

for names in amend_names.values():
    for name in names:
        # Skip entries that contain a bracketed part ("[...]").
        # Raw string: "\[" must reach the regex engine as an escape instead
        # of being an invalid Python string escape.
        if not re.search(r'\[.*\]', name):
            # split() with no argument never yields empty tokens, unlike
            # split(' ') on repeated / leading / trailing spaces. An empty
            # string in the set would match every pure-punctuation token.
            stopnames.update(name.split())

for name in df.speaker.unique():
    # Labels containing 'speaker' are skipped
    # (presumably placeholders for unidentified speakers - TODO confirm).
    if 'speaker' not in name:
        stopnames.update(name.split())

# Tokens that came in through the names but should not be treated as
# name stopwords. Set difference does not raise if one of them is absent.
stopnames -= {'greek', 'representative', 'de', 'van'}

# Additional names added by hand.
stopnames |= {'alex', 'dusan', 'janis', 'joreon'}
stopnames.add('joreon')

In [4]:
# stopnames  # uncomment to inspect the resulting set of name stopwords

In [5]:
# Back-of-the-envelope count of name stopwords in the corpus:
# pool all speeches, lowercase, split on whitespace, strip punctuation
# from each token and test it for membership in `stopnames`.
corpus_tokens = pd.Series(' '.join(df.speech).lower().split())
is_stopname = corpus_tokens.map(
    lambda token: re.sub(r'[^\w\s]', '', token) in stopnames
)
is_stopname.sum()

761

## disfluency

In [6]:
# Hesitation sounds that appear in the transcripts.
disfluency = {
    'erm',  # used like "hmm", see https://euroleaks.diem25.org/leaks/mar17ewg/
    'um',
    'uh',
}

In [7]:
# Back-of-the-envelope count of disfluency tokens in the corpus:
# pool all speeches, lowercase, split on whitespace, strip punctuation
# from each token and test it for membership in `disfluency`.
corpus_tokens = pd.Series(' '.join(df.speech).lower().split())
is_disfluency = corpus_tokens.map(
    lambda token: re.sub(r'[^\w\s]', '', token) in disfluency
)
is_disfluency.sum()

3427

## common courtesy words

In [8]:
# Common courtesy words to be treated as stopwords.
courtesy = {
    'thank',
    'colleague',
    'comment',
    'ask',
    'let',
    'point',
    'say',
    'think',
}

In [9]:
# Back-of-the-envelope count of courtesy tokens in the corpus:
# pool all speeches, lowercase, split on whitespace, strip punctuation
# from each token and test it for membership in `courtesy`.
corpus_tokens = pd.Series(' '.join(df.speech).lower().split())
is_courtesy = corpus_tokens.map(
    lambda token: re.sub(r'[^\w\s]', '', token) in courtesy
)
is_courtesy.sum()

1347

## dump to json

In [10]:
# Group the three stopword sets under one dict, as lists so that they are
# JSON-serializable. Sets of strings have no stable iteration order between
# interpreter runs (string hashing is randomized), so each group is sorted
# to make the resulting structure reproducible.
stopwords = {
    'names': sorted(stopnames),
    'disfluency': sorted(disfluency),
    'courtesy': sorted(courtesy)
}

In [11]:
# Write the stopword groups to disk.
# Serialize straight into the file with json.dump rather than binding the
# text to a variable named `json`: that would shadow the json module and
# break any cell using json.load / json.dumps that is run afterwards.
# (json is already imported in the first cell of the notebook.)
with open('../data/euroleaks/stopwords.json', 'w') as f:
    json.dump(stopwords, f)