In [3]:
import os, re, logging
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from collections import defaultdict
from nltk.stem.snowball import SnowballStemmer
import seaborn as sns

In [9]:
# --- Configuration ---------------------------------------------------------
# NOTE(review): these are machine-specific locations; adjust them before running elsewhere.
BASE_DICT = '/Users/anne/repos/RPA/resources/'           # folder holding the topic dictionary
FILENAME_DICT = '20140718_dutchdictionary.txt'           # dictionary file parsed by parse_xml()
PATH_TO_DATA = '~/surfdrive/uva/projects/RPA_KeepingScore/pickle_files/'  # folder holding the pickled datasets

MINNUMBERMATCHES = 2 # min number of times a keyword should occur for a topic to be present

# The functions below call `logger.info(...)`, but no logger was defined in the
# notebook, so a fresh kernel failed with NameError. basicConfig only installs a
# handler when the root logger has none yet.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Module-level stemmer used by stem_sentences().
stemmer = SnowballStemmer("dutch")

In [None]:
def label_topic(x):
    """Translate a main-topic code into its topic label.

    Args:
        x: topic code; it is compared with ``==`` against the string codes
            listed below (``'1'`` ... ``'24'``, ``'29'``, ``'00'``).

    Returns:
        The label belonging to the first code equal to ``x``, or ``None`` when
        ``x`` equals none of them (this includes integer input such as ``1``).
    """
    code_labels = (
        ('1', 'Macro-economie en belastingen'),
        ('2', 'Burgerrechten en vrijheden'),
        ('3', 'Gezondheid'),
        ('4', 'Landbouw en Visserij'),
        ('5', 'Arbeid'),
        ('6', 'Onderwijs'),
        ('7', 'Milieu'),
        ('8', 'Energiebeleid'),
        ('9', 'Immigratie en integratie'),
        ('10', 'Verkeer en vervoer'),
        ('11', 'Unkown'),
        ('12', 'Justitie, Rechtspraak, Criminaliteit'),
        ('13', 'sociale Zaken'),
        ('14', 'Gemeenschapsontwikkeling, huisvestingsbeleid en stedelijke planning'),
        ('15', 'Ondernemingen, Bankwezen en binnenlandse handel '),
        ('16', 'Defensie'),
        ('17', 'Wetenschappelijk onderzoek, technologie en communicatie'),
        ('18', 'Buitenlandse handel'),
        ('19', 'Buitenlandse zaken en ontwikkelingssamenwerking'),
        ('20', 'Functioneren democratie en openbaar bestuur'),
        ('21', 'Ruimtelijke ordening, publiek natuur- en waterbeheer'),
        ('22', 'Unkown 2'),
        ('23', 'Kunst, cultuur en entertainment'),
        ('24', '*** Gemeentelijk en provinciaal bestuur'),
        ('29', '*** Sport'),
        ('00', 'Toegevoegde codes voor media'),
    )
    # Linear scan with `==` so that any input type is handled without hashing it.
    for code, label in code_labels:
        if x == code:
            return label
    return None
    
def parse_xml():
    """Read the topic dictionary file and return its keywords with their topic codes.

    The file at ``os.path.join(BASE_DICT, FILENAME_DICT)`` is scanned line by
    line (surrounding whitespace stripped):

    * a line starting with ``<cnode`` sets the current topic. The text after the
      first ``=`` is stripped of ``"``, ``>`` after a quote, and every ``t``;
      a 2-character code is used as is, a 3-character code is cut to its first
      character and a 4-character code to its first two characters. Codes of
      any other length leave the current topic unchanged.
    * a line starting with ``<pnode`` contributes one keyword (the text after
      the first ``=`` without quotes and the closing ``</pnode>`` tag) under
      the current topic.

    Returns:
        tuple: ``(words, topics)`` - two parallel lists of equal length.

    Raises:
        ValueError: if a ``<pnode`` line is met before any ``<cnode`` line set a
            topic (this used to surface as an UnboundLocalError).
    """
    words = []
    topics = []
    final_topic = None

    # `with` guarantees the handle is closed; the original left the file open.
    with open(os.path.join(BASE_DICT, FILENAME_DICT)) as handle:
        for raw_line in handle:
            l = raw_line.strip()
            if l.startswith('<cnode'):
                code = re.sub('">|"|t', '', l.split('=')[1])
                if len(code) == 2:
                    final_topic = code
                elif len(code) == 3:
                    final_topic = code[0]
                elif len(code) == 4:
                    final_topic = code[:2]
            elif l.startswith('<pnode'):
                if final_topic is None:
                    raise ValueError(
                        "Found a <pnode> entry before any <cnode> topic in {}".format(FILENAME_DICT))
                word = re.sub('">|</pnode>|"', '', l.split('=')[1])
                words.append(word)
                topics.append(final_topic)
    return words, topics

def get_dict():
    """Group the dictionary keywords by topic label.

    Returns:
        defaultdict(list): topic label (as produced by ``label_topic``) mapped to
        the list of keywords that ``parse_xml`` reported for that topic, in
        file order.
    """
    words_by_label = defaultdict(list)
    all_words, all_codes = parse_xml()
    for position in range(min(len(all_words), len(all_codes))):
        label = label_topic(all_codes[position])
        words_by_label[label].append(all_words[position])
    return words_by_label

def get_stemmed_dict():
    """Return the topic dictionary with every keyword stemmed.

    A fresh Dutch ``SnowballStemmer`` is created for the call.

    Returns:
        dict: topic label mapped to the list of stemmed keywords, in the same
        order as ``get_dict()`` delivers them.
    """
    dutch_stemmer = SnowballStemmer("dutch")
    return {
        label: [dutch_stemmer.stem(keyword) for keyword in keywords]
        for label, keywords in get_dict().items()
    }

def get_raw_data():
    """Load the newspaper and parliamentary-question pickles and stack them.

    Reads ``PATH_TO_DATA + 'VK_TEL'`` (text column ``text_title``) and
    ``PATH_TO_DATA + 'kamervragen'`` (text column ``questions``), keeps the text,
    ``main_topic`` and ``main_topic_label`` columns, and harmonises the names to
    ``text`` / ``topic``.

    Returns:
        pandas.DataFrame: columns ``text``, ``topic``, ``main_topic_label``,
        ``type`` ('newspaper' or 'parlementary question'), ``origin`` ('RPA')
        and ``documentnr`` (the fresh 0..n-1 row index).
    """
    # rename()/assign() return new frames, so the column slices are never
    # modified in place (the original renamed a slice with inplace=True).
    newspapers = (
        pd.read_pickle(PATH_TO_DATA + 'VK_TEL')[['text_title', 'main_topic', 'main_topic_label']]
        .rename(columns={'text_title': 'text', 'main_topic': 'topic'})
        .assign(type='newspaper')
    )
    questions = (
        pd.read_pickle(PATH_TO_DATA + 'kamervragen')[['questions', 'main_topic', 'main_topic_label']]
        .rename(columns={'questions': 'text', 'main_topic': 'topic'})
        .assign(type='parlementary question')
    )

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df = pd.concat([newspapers, questions], ignore_index=True, sort=False)

    df['origin'] = 'RPA'
    df['documentnr'] = df.index
    logger.info("Appended the kamervragen dataset to the newspaper dataset, resulting in a df with a len of {}".format(len(df)))
    return df

def get_bjorn_data():
    """Load ``dataset_burscher.pkl`` from PATH_TO_DATA and tag its provenance.

    Returns:
        pandas.DataFrame: the pickled frame with ``origin`` set to 'Bjorn' and
        ``type`` set to 'newspaper' for every row.
    """
    burscher = pd.read_pickle(PATH_TO_DATA + 'dataset_burscher.pkl')
    return burscher.assign(origin='Bjorn', type='newspaper')
    
def get_recode_data():
    """Collapse the gold-standard topic labels that Bjorn's coding lacks into 'Overige'.

    Takes the frame from ``get_raw_data()`` and rewrites ``main_topic_label``:
    the four labels listed below, as well as missing labels (None/NaN), become
    'Overige'. All other labels are left untouched.

    Returns:
        pandas.DataFrame: the raw data with the recoded ``main_topic_label``.
    """
    df = get_raw_data()
    to_overige = ['Buitenlandse handel', 'Kunst, cultuur en entertainment',
                  'Ruimtelijke ordening, publiek natuur- en waterbeheer',
                  'Toegevoegde codes voor media']
    overige_cat = dict.fromkeys(to_overige, 'Overige')

    # Assign the result back instead of `df[col].replace(..., inplace=True)`:
    # the in-place call works on a column selection and its effect is lost when
    # pandas Copy-on-Write is active.
    recoded = df['main_topic_label'].replace(overige_cat)
    # The original mapping carried a `None` key; handle missing labels explicitly.
    df['main_topic_label'] = recoded.fillna('Overige')

    logger.info("Recoded data according to Bjorn's dataset. New topic categories are: {}".format(df['main_topic_label'].unique()))
    return df

def get_data():
    """Stack the recoded RPA data on top of Bjorn's dataset.

    Returns:
        pandas.DataFrame: rows of ``get_recode_data()`` followed by the rows of
        ``get_bjorn_data()``, with a fresh 0..n-1 index that is also stored in
        the ``documentnr`` column. Columns present in only one of the two
        inputs are kept and hold NaN for the other input's rows.
    """
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df = pd.concat([get_recode_data(), get_bjorn_data()], ignore_index=True, sort=False)
    df['documentnr'] = df.index
    logger.info("Retrieved the recoded dataset, merged with Bjorn's data, containing {} cases".format(len(df)))
    return df

def stem_sentences(sentence):
    """Stem every whitespace-separated token of ``sentence`` with the module-level ``stemmer``.

    Args:
        sentence: the text to stem; normally a str.

    Returns:
        str: the stemmed tokens joined by single spaces, or the placeholder
        ``'NAN'`` when the input cannot be processed (for example a float NaN
        or None, which have no ``split``).
    """
    try:
        tokens = sentence.split()
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        return ' '.join(stemmed_tokens)
    # `except Exception` keeps the best-effort fallback but, unlike the bare
    # `except:`, lets KeyboardInterrupt/SystemExit through so a long stemming
    # run can still be interrupted.
    except Exception:
        return 'NAN'

def return_stemmed_text_columns():
    """Return the combined dataset with an extra ``stemmed_text`` column.

    The frame comes from ``get_data()``; ``stemmed_text`` holds
    ``stem_sentences`` applied to each value of the ``text`` column.
    """
    corpus = get_data()
    logger.info("Start stemming....")
    stemmed = corpus['text'].apply(stem_sentences)
    corpus['stemmed_text'] = stemmed
    return corpus

def _match_topics_per_document(documents, topic_words, prefix=''):
    """Match every document against every topic's keyword list.

    Args:
        documents: iterable of str, one entry per document; the position in the
            iteration becomes the ``documentnr``.
        topic_words: mapping of topic label -> list of keywords.
        prefix: string put in front of every result key except ``documentnr``.

    Returns:
        list of dict: one record per (document, topic) pair with the keys
        ``documentnr``, ``<prefix>topic_label_dictionary``,
        ``<prefix>index_words`` (token position of the first occurrence of each
        matched keyword), ``<prefix>smallest_index`` (minimum of those
        positions, NaN without a match), ``<prefix>len matches``,
        ``<prefix>words matches`` and ``<prefix>text`` (the lowercased document).
    """
    records = []
    for documentnr, document in enumerate(documents):
        lowered = document.lower()
        # Tokenise once per document and remember each token's first position,
        # instead of re-splitting the text for every keyword of every topic.
        first_position = {}
        for position, token in enumerate(lowered.split(' ')):
            first_position.setdefault(token, position)

        for topic, words in topic_words.items():
            # NOTE(review): keywords are compared as-is against lowercased
            # single tokens, so keywords containing capitals or spaces cannot
            # match - confirm that this is intended.
            match = [word for word in words if word in first_position]
            index = [first_position[word] for word in match]
            records.append({'documentnr': documentnr,
                            prefix + 'topic_label_dictionary': topic,
                            prefix + 'index_words': index,
                            prefix + 'smallest_index': min(index) if index else np.nan,
                            prefix + 'len matches': len(match),
                            prefix + 'words matches': match,
                            prefix + 'text': lowered})
    return records


def dictionary_topics():
    """Match the raw (unstemmed) text of every document against the topic dictionary.

    Returns:
        list of dict: one record per (document, topic) pair, see
        ``_match_topics_per_document`` (keys carry no prefix).
    """
    # NOTE(review): only the 'text' column is used here, yet the call below also
    # stems the whole corpus.
    df1 = return_stemmed_text_columns()
    # Parse the dictionary once; the original re-read it for every document.
    topic_words = get_dict()
    logger.info("Start word search....")
    return _match_topics_per_document(df1['text'], topic_words)


def dictionary_topics_stemmed():
    """Match the stemmed text of every document against the stemmed topic dictionary.

    Returns:
        list of dict: one record per (document, topic) pair, see
        ``_match_topics_per_document``; every key except ``documentnr`` is
        prefixed with ``stemmed_``.
    """
    df1 = return_stemmed_text_columns()
    # Build the stemmed dictionary once; the original rebuilt it for every document.
    topic_words = get_stemmed_dict()
    logger.info("Start word search on stemmed text...")
    return _match_topics_per_document(df1['stemmed_text'], topic_words, prefix='stemmed_')

def _earliest_topic_per_document(records, index_column):
    """Keep, per document, the topic whose first keyword hit comes earliest; drop documents without a hit."""
    frame = pd.DataFrame(records)
    frame = (frame.assign(to_sort=frame[index_column].abs())
                  # A stable sort keeps the dictionary order among topics that
                  # tie on position, so the chosen topic is reproducible.
                  .sort_values('to_sort', kind='mergesort')
                  .drop_duplicates('documentnr')
                  .drop(columns='to_sort'))
    return frame[np.isfinite(frame[index_column])]


def get_merged_df():
    """Attach the dictionary-based topic (plain and stemmed) to every document.

    For each document the topic whose keyword occurs earliest in the text is
    selected, once on the raw text and once on the stemmed text. Both
    selections are left-joined onto ``get_data()`` via ``documentnr``.
    Documents without any keyword hit get the label 'Overige' and a match
    count of 0.

    Returns:
        pandas.DataFrame: ``get_data()`` plus the columns produced by
        ``dictionary_topics()`` and ``dictionary_topics_stemmed()``. Because
        both sides carry a ``text`` column, the merge names them ``text_x`` /
        ``text_y``.
    """
    result = dictionary_topics()
    stemmed_results = dictionary_topics_stemmed()
    df2 = _earliest_topic_per_document(result, 'smallest_index')
    df3 = _earliest_topic_per_document(stemmed_results, 'stemmed_smallest_index')

    df1 = get_data()
    df = pd.merge(df1, df2, how='left', on='documentnr')
    df = pd.merge(df, df3, how='left', on='documentnr')

    # Assign the filled columns back: `df[col].fillna(..., inplace=True)` acts on
    # a column selection and its effect is lost when pandas Copy-on-Write is active.
    df['topic_label_dictionary'] = df['topic_label_dictionary'].fillna('Overige')
    df['len matches'] = df['len matches'].fillna(0)
    df['stemmed_topic_label_dictionary'] = df['stemmed_topic_label_dictionary'].fillna('Overige')
    df['stemmed_len matches'] = df['stemmed_len matches'].fillna(0)
    return df

def recode_dictionary():
    """Recode the topic labels so that they match Bjorn's scoring.

    In the frame from ``get_merged_df()`` the five labels listed below are
    replaced by 'Overige' in the gold-standard column (``main_topic_label``)
    and in both dictionary-based columns (``topic_label_dictionary``,
    ``stemmed_topic_label_dictionary``).

    Returns:
        pandas.DataFrame: the merged frame with the three recoded columns.
    """
    df = get_merged_df()
    to_overige = ['Buitenlandse handel', 'Kunst, cultuur en entertainment', '*** Sport',
                  'Ruimtelijke ordening, publiek natuur- en waterbeheer',
                  'Toegevoegde codes voor media']
    overige_cat = dict.fromkeys(to_overige, 'Overige')

    # Assign back instead of `df[col].replace(..., inplace=True)`: the in-place
    # call acts on a column selection and its effect is lost when pandas
    # Copy-on-Write is active.
    for column in ('main_topic_label', 'topic_label_dictionary', 'stemmed_topic_label_dictionary'):
        df[column] = df[column].replace(overige_cat)

    logger.info("the length of categories identified by dict is now: {} ".format(len(df['topic_label_dictionary'].unique()) ) )
    logger.info("...and the stemmed dict: {} ".format(len(df['stemmed_topic_label_dictionary'].unique()) ) )
    return df


def apply_minnummatches():
    """Apply the minimum-match threshold to the dictionary-based topic labels.

    Works on the frame from ``recode_dictionary()`` and adds two columns:
    ``topic_label_dictionary_minmatches`` and
    ``topic_label_dictionary_minmatches_stem``. Each copies the corresponding
    dictionary label, except that rows with fewer than ``MINNUMBERMATCHES``
    keyword matches get 'Overige'.
    """
    df = recode_dictionary()
    # (new column, match-count column, label column)
    column_plan = (
        ('topic_label_dictionary_minmatches', 'len matches', 'topic_label_dictionary'),
        ('topic_label_dictionary_minmatches_stem', 'stemmed_len matches', 'stemmed_topic_label_dictionary'),
    )
    for target, count_column, label_column in column_plan:
        too_few_matches = df[count_column] < MINNUMBERMATCHES
        df[target] = np.where(too_few_matches, 'Overige', df[label_column])
    return df

def get_tp_fp_fn():
    """Add per-topic true-positive, false-positive and false-negative indicator columns.

    Works on the frame from ``apply_minnummatches()``. For every distinct value
    of ``main_topic_label`` (the gold standard) six 0/1 columns are added:
    ``_tp <topic>``, ``_fp <topic>``, ``_fn <topic>`` for the plain dictionary
    label and ``st_tp <topic>``, ``st_fp <topic>``, ``st_fn <topic>`` for the
    stemmed one.

    * tp: gold standard and dictionary both say ``topic``
    * fp: dictionary says ``topic``, gold standard does not
    * fn: gold standard says ``topic``, dictionary does not
    """
    df = apply_minnummatches()
    gold = df['main_topic_label']
    # (column-name prefix, column holding the dictionary-based label)
    predictions = (
        ('_', 'topic_label_dictionary_minmatches'),
        ('st_', 'topic_label_dictionary_minmatches_stem'),
    )

    for topic in list(gold.unique()):
        suffix = str(topic)
        is_gold = gold == topic
        for prefix, predicted_column in predictions:
            is_predicted = df[predicted_column] == topic
            df[prefix + 'tp ' + suffix] = np.where(is_gold & is_predicted, 1, 0)
            df[prefix + 'fp ' + suffix] = np.where(~is_gold & is_predicted, 1, 0)
            df[prefix + 'fn ' + suffix] = np.where(is_gold & ~is_predicted, 1, 0)

    return df

In [959]:
# describe
#df['n'] = 1
#df.groupby(df.main_topic_label).agg({'n' :'sum'})

#fig = plt.figure(figsize=(8,6))
#df.groupby('topic_label_dictionary_minmatches').text_x.count().sort_values().plot.barh(ylim=0, title= 'N ANNOTATIONS PER CATEGORY\n')
#plt.show()

In [11]:
def get_recall_precision(data=None):
    """Compute per-topic and macro-averaged recall and precision of the dictionary coding.

    Args:
        data: DataFrame holding ``main_topic_label`` plus the indicator columns
            ``_tp/_fp/_fn <topic>`` and ``st_tp/st_fp/st_fn <topic>`` for every
            distinct label (as produced by ``get_tp_fp_fn()``). When omitted,
            the notebook-global ``df`` is used, which was the only behaviour
            before this parameter existed.

    Returns:
        tuple of four dicts ``(recall, precision, recall_stemmed,
        precision_stemmed)``. Keys are ``'recall <topic>'`` respectively
        ``'precision <topic>'`` plus ``'recall total'`` / ``'precision total'``
        (the unweighted mean over topics). A score whose denominator is zero is
        NaN, and so is a total that includes such a score.
    """
    if data is None:
        # Fall back to the frame bound to the global name `df`.
        data = df

    def _share(hits, misses):
        """hits / (hits + misses), NaN when both are zero."""
        total = hits + misses
        return hits / total if total else float('nan')

    recall = {}
    precision = {}
    recall_stemmed = {}
    precision_stemmed = {}

    for topic in list(data['main_topic_label'].unique()):
        name = str(topic)

        tp = data['_tp ' + name].sum(axis=0)
        fp = data['_fp ' + name].sum(axis=0)
        fn = data['_fn ' + name].sum(axis=0)
        recall['recall ' + name] = _share(tp, fn)
        precision['precision ' + name] = _share(tp, fp)

        st_tp = data['st_tp ' + name].sum(axis=0)
        st_fp = data['st_fp ' + name].sum(axis=0)
        st_fn = data['st_fn ' + name].sum(axis=0)
        recall_stemmed['recall ' + name] = _share(st_tp, st_fn)
        precision_stemmed['precision ' + name] = _share(st_tp, st_fp)

    # Macro average: every topic weighs the same, regardless of its frequency.
    recall['recall total'] = sum(recall.values()) / len(recall.values())
    precision['precision total'] = sum(precision.values()) / len(precision.values())
    recall_stemmed['recall total'] = sum(recall_stemmed.values()) / len(recall_stemmed.values())
    precision_stemmed['precision total'] = sum(precision_stemmed.values()) / len(precision_stemmed.values())

    return recall, precision, recall_stemmed, precision_stemmed

In [None]:
# Build the final evaluation frame: gold labels, dictionary labels and per-topic tp/fp/fn flags.
df = get_tp_fp_fn()
# Sanity check: first ten rows whose gold label is 'Overige', with the dictionary label and the 'Overige' flags.
df.loc[
    df['main_topic_label'] == 'Overige',
    ['main_topic_label', 'topic_label_dictionary_minmatches', '_tp Overige', '_fp Overige', '_fn Overige'],
].head(10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [385]:
df.to_pickle(PATH_TO_DATA + 'all_data.pkl')

In [424]:
# Recode missing gold labels (None/NaN) to 'Overige'.
# The previous version called replace(..., inplace=True) on the temporary Series
# returned by astype(str), so `df` was never changed; after the cast to str the
# `None` key could not match anything either.
# NOTE(review): the tp/fp/fn columns already present in `df` were computed from
# the old labels; re-run get_tp_fp_fn() if they have to reflect this recode.
df['main_topic_label'] = df['main_topic_label'].fillna('Overige')
# Check: rows that still have no gold label (expected to be empty).
df[df['main_topic_label'].isna()]

Unnamed: 0,documentnr,main_topic_label,origin,text_x,topic,type,topic_label_dictionary,index_words,smallest_index,len matches,...,"_fn Wetenschappelijk onderzoek, technologie en communicatie","st_tp Wetenschappelijk onderzoek, technologie en communicatie","st_fp Wetenschappelijk onderzoek, technologie en communicatie","st_fn Wetenschappelijk onderzoek, technologie en communicatie",_tp None,_fp None,_fn None,st_tp None,st_fp None,st_fn None


In [398]:
#get precision and recall for dictionary items
recall, precision, recall_stemmed, precision_stemmed = get_recall_precision()



In [399]:
# Print the four score tables, one "<name> -- <value>" line per entry, separated by a blank line.
for table_position, score_table in enumerate((recall, precision, recall_stemmed, precision_stemmed)):
    if table_position > 0:
        print()
    for score_name, score_value in score_table.items():
        print('{} -- {}'.format(score_name, score_value))

recall Onderwijs -- 0.41901408450704225
recall Burgerrechten en vrijheden -- 0.017057569296375266
recall Justitie, Rechtspraak, Criminaliteit -- 0.17144808743169399
recall Defensie -- 0.19556451612903225
recall Gezondheid -- 0.2334293948126801
recall Gemeenschapsontwikkeling, huisvestingsbeleid en stedelijke planning -- 0.060109289617486336
recall Functioneren democratie en openbaar bestuur -- 0.02122347066167291
recall Macro-economie en belastingen -- 0.15767634854771784
recall Buitenlandse zaken en ontwikkelingssamenwerking -- 0.032357473035439135
recall Ondernemingen, Bankwezen en binnenlandse handel  -- 0.1109375
recall Arbeid -- 0.1038961038961039
recall Verkeer en vervoer -- 0.10266159695817491
recall Overige -- 0.9448740828871703
recall sociale Zaken -- 0.01
recall Immigratie en integratie -- 0.10038610038610038
recall Landbouw en Visserij -- 0.345
recall Energiebeleid -- 0.1223021582733813
recall Milieu -- 0.07936507936507936
recall Wetenschappelijk onderzoek, technologie en co

In [None]:
# visualize 

In [1014]:
len(df['topic_label_dictionary_minmatches'].unique())

19

In [349]:
# Compare how often each topic is assigned by the manual coding and by the two
# dictionary variants (plain and stemmed).
# NOTE(review): the output recorded under this cell is a KeyError saying the two
# 'topic_label_dictionary_minmatches*' columns are missing, i.e. `df` was not the
# frame returned by get_tp_fp_fn() when this cell last ran - confirm which `df`
# is bound before running it.
df['id'] = df.index
# Long format: one row per (document id, labelling source) with the assigned topic.
melted = pd.melt(df, id_vars=['id'], value_vars=['main_topic_label', 'topic_label_dictionary_minmatches','topic_label_dictionary_minmatches_stem'], var_name='manual vs. dictionary', value_name='topic')

plt.rcParams['figure.figsize']=(10,10)
# Bars are ordered by overall topic frequency across the three sources.
ax = sns.countplot(y="topic", hue='manual vs. dictionary', 
                   order = melted['topic'].value_counts().index, data=melted)

KeyError: "The following 'value_vars' are not present in the DataFrame: ['topic_label_dictionary_minmatches', 'topic_label_dictionary_minmatches_stem']"

In [327]:
# Cast the join key `id_number` to str in both frames.
# NOTE(review): `pl` and `coder` are not defined in the visible part of this
# notebook - confirm where they are loaded.
# NOTE(review): the `coder['id_number']` output further down shows values such as
# 1306.0 with dtype object, so ids may end up as strings with a trailing ".0" -
# verify that they still match the ids in `pl`.
pl['id_number'] = pl['id_number'].astype(str)
coder['id_number'] = coder['id_number'].astype(str)

In [328]:
# Bring `year` to str in both frames so it can serve as a join key.
# In `coder`, missing years are filled with 0 before the cast to int and then to str.
coder['year'] = coder['year'].fillna(0).astype(int)
coder['year'] = coder['year'].astype(str)
pl['year'] = pl['year'].astype(str)

In [331]:
# Inner join: keeps only rows whose (year, id_number) pair occurs in both frames.
# NOTE(review): this rebinds the notebook-global `df`, replacing the frame
# produced by get_tp_fp_fn(); cells that expect the evaluation columns must not
# run after this one - consider a separate name.
df = pd.merge(coder, pl, how= 'inner', on = ['year', 'id_number'])

In [332]:
#coder[['year', 'id_number']]

In [334]:
len(df)

1750

In [335]:
coder['id_number']

0       1306.0
1       1391.0
2       1453.0
3        285.0
4        798.0
         ...  
3228    1695.0
3229     384.0
3230    1543.0
3231    1599.0
3232     825.0
Name: id_number, Length: 1917, dtype: object

In [74]:
pl[pl['counter'] == 'ah-tk-19971998-46.xml']

Unnamed: 0,counter,file,type,datum,naam,onderwerp,vraag,date,id_number,id_date,year


In [336]:
#pl[pl['year'] == 1996]

#pl[(pl['year'] == 1996) & (pl['id_number'] == '993')]

In [229]:
df[['year', 'doc_number', 'id_number']]

Unnamed: 0,year,doc_number,id_number
0,1996,1306,1306
1,1996,1391,1391
2,1996,1453,1453
3,1995,285,285
4,1996,798,798
5,1997,1624,1624
6,1997,1696,1696
7,1996,588,588
8,1996,588,588
9,1997,669,669


In [75]:
len(pl)
57892

28058

In [954]:
# print dataframe, transformed: grouped by document and only the max length matches
#idx = df2.groupby(['documentnr'], sort=False)['len matches'].transform(max) == df2['len matches']
#df2[idx].head()