### Default Imports

In [16]:
import spacy
import json
import pandas as pd
import hunspell
from tabulate import tabulate
import os
import pickle


def tprint(df: pd.DataFrame, head=0):
    """
    Print a DataFrame as a pipe-formatted table followed by an empty line.

    :param df:   the DataFrame to print
    :param head: positive -> print only the first ``head`` rows,
                 negative -> print only the last ``-head`` rows,
                 0 (default) -> print all rows
    """
    if head > 0:
        shown = df.head(head)
    elif head < 0:
        shown = df.tail(-head)
    else:
        shown = df
    table = tabulate(shown, headers="keys", tablefmt="pipe")
    print(table + '\n')

In [None]:
# German spaCy pipeline shared by all cells below.
# NOTE(review): process_phrases() reads doc.noun_chunks, which presumably needs the
# dependency parser — confirm before switching to the parser-less variant.
nlp = spacy.load('de')  # <-- load with dependency parser (slower)
# nlp = spacy.load('de', disable=['parser'])

# IWNLP lemmatizer used by lemmatize(); the lookup file is read from a path
# relative to the current working directory.
from iwnlp.iwnlp_wrapper import IWNLPWrapper
lemmatizer = IWNLPWrapper(lemmatizer_path='./IWNLP.Lemmatizer_20170501.json')

In [None]:
# --- function definitions ---


def process_phrases(doc_):
    """
    Extract the noun phrases of a doc as rows for the document DataFrame.

    This function is based on spacy's noun chunk detection. Punctuation and
    leading determiners / stop words are stripped from every chunk, and only
    chunks that still contain more than one token are kept as phrases.

    :param doc_: spacy doc; only ``doc_.noun_chunks`` is read
    :return:     pandas.DataFrame with one row per phrase and the columns
                 Text (tokens joined by blanks), IWNLP (lemmatized phrase),
                 POS (always 'PHRASE'), Index and Start (token index and
                 character offset of the first phrase token).
                 An empty DataFrame without columns if no phrase was found.
    """

    # clean the noun chunks from spacy first
    noun_chunks = []
    for chunk in doc_.noun_chunks:
        tokens = []
        for token in chunk:
            # exclude punctuation
            if token.pos_ == 'PUNCT':
                continue
            # exclude leading determiners / stop words (only while nothing was kept yet)
            if not tokens and (token.pos_ == 'DET' or token.is_stop):
                continue
            tokens.append(token)
        if len(tokens) > 1:
            noun_chunks.append(tokens)

    # the remaining, adjusted noun chunks will be lemmatized and indexed
    rows = []
    for chunk in noun_chunks:
        lemmas = []
        for token in chunk:
            lemma, _ = lemmatize(token.text, token.pos_)
            # fall back to the surface form if no lemma could be derived
            lemmas.append(lemma if lemma else token.text)

        rows.append({
            'Text': ' '.join(t.text for t in chunk),
            'IWNLP': ' '.join(lemmas),
            'POS': 'PHRASE',
            'Index': chunk[0].i,
            'Start': chunk[0].idx,
        })

    # TODO: add a global phrase lookup table (lemmatized phrase, original phrase,
    #       spacy tokens) and return it together with the doc dataframe
    return pd.DataFrame(rows)


def lemmatize(token: str, pos: str) -> (str, bool):
    """
    Lemmatize a single token with IWNLP, with fallbacks for nouns.

    The token is first looked up as is. For nouns and proper nouns two more
    attempts follow: the token in default noun capitalization, and then every
    suffix of the token (to resolve composite nouns), where the untouched
    prefix is glued back in front of the suffix lemma.
    Do not use this function to lemmatize phrases; a warning is printed if
    ``pos`` is 'PHRASE'.

    :param token: white space stripped single token (str)
    :param pos:   string constant, one of Universal tagset.
    :return: tuple of type (str, bool)
           value[0]: The lemma of the token if a lemma can be derived, else None.
           value[1]: True if the default IWNLP lookup of the unchanged token succeeded, else False.
    """
    if pos == 'PHRASE':
        print("Don't lemmatize Phrases with this function!")

    # default lemmatization ok?
    direct_hits = lemmatizer.lemmatize(token, pos)
    if direct_hits:
        return direct_hits[0], True

    # the rules below derive a lemma from the original token (nouns only)
    # TODO: define rules for hyphenated nouns
    if pos not in ('NOUN', 'PROPN'):
        return None, False

    # first try default noun capitalization
    capitalized_hits = lemmatizer.lemmatize(token.title(), pos)
    if capitalized_hits:
        return capitalized_hits[0], False

    # still no results: try noun suffixes, longest suffix first
    for split in range(1, len(token) - 1):
        prefix, suffix = token[:split], token[split:]
        suffix_hits = lemmatizer.lemmatize_plain(suffix.title(), ignore_case=True)
        if suffix_hits:
            return prefix.title() + suffix_hits[0].lower(), False

    # sorry, no results found:
    return None, False


def df_from_doc(doc_):
    """
    Creates a pandas DataFrame from a given spacy.doc that contains only nouns and noun phrases.

    Every token becomes a row with its spacy attributes plus the IWNLP lemma
    (see lemmatize). The noun phrases from process_phrases are added as extra
    rows, everything is ordered by character offset and finally reduced to the
    rows tagged NOUN, PROPN or PHRASE.

    :param doc_: spacy.doc
    :return:     pandas.DataFrame with the columns Text, Lemma, POS, Tag, Stop,
                 Index, Start, Ent_type, Ent_iob, IWNLP and Known. Phrase rows
                 only fill Text, IWNLP, POS, Index and Start.
    """
    columns = ["Text", "Lemma", "POS", "Tag", "Stop",
               "Index", "Start",
               "Ent_type", "Ent_iob"]
    rows = [
        (token.text, token.lemma_, token.pos_, token.tag_, token.is_stop,
         token.i, token.idx,
         token.ent_type_, token.ent_iob_)
        for token in doc_
    ]
    # naming the columns up front keeps the frame well-formed for an empty doc
    df = pd.DataFrame(rows, columns=columns)

    # add IWNLP lemmatization
    lemmas = [lemmatize(text, pos) for text, pos in zip(df['Text'], df['POS'])]
    df['IWNLP'] = [lemma for lemma, _ in lemmas]
    df['Known'] = [known for _, known in lemmas]

    # add phrases (of the doc passed in, not of some global variable)
    df_phrases = process_phrases(doc_)
    if not df_phrases.empty:
        # DataFrame.append was removed in pandas 2.0; concat is the replacement
        df = pd.concat([df, df_phrases], sort=False)

    # stable sort: a phrase row always follows the token row it starts at
    df = df.sort_values('Start', kind='mergesort')

    return df[df.POS.isin(['NOUN', 'PROPN', 'PHRASE'])]


In [None]:
def process_doc(doc):
    """
    Reduce a spacy doc to the list of its nouns and noun phrases.

    :param doc: spacy.doc
    :return:    list of str in document order: the IWNLP lemma of each noun /
                phrase row of df_from_doc, or the original text where no lemma
                is available. An empty list if the doc contains no such rows.
    """
    doc_df = df_from_doc(doc)
    # tprint(doc_df)
    # Iterate the columns directly: DataFrame.apply(axis=1) on an empty frame
    # hands back the frame itself, so list() would yield the column names.
    return [
        lemma if lemma else text
        for lemma, text in zip(doc_df['IWNLP'], doc_df['Text'])
    ]

In [15]:
### --- preprocess corpora data to a standard format ---


# root folder of all corpora, relative to the current working directory
data_base = "../../master_cloud/corpora"

# standard meta data fields
# (keys of the per-document meta data dictionaries built in participation_data)
DATASET = 'dataset'
SUBSET = 'subset'
ID = 'doc_id'
ID2 = 'doc_subid'
TITLE = 'doc_title'
CAT = 'doc_category'


def participation_data():
    """
    Read the OnlineParticipation data files and convert them to the standard format.

    Walks the download folder below ``data_base`` and processes every file whose
    name ends in ``flat`` plus a five character extension (i.e. ``*flat.json``).
    Files that cannot be opened and files without content are skipped.

    yields: one tuple ``(meta_data, text_data)`` per data file, where
            meta_data is a list of ``(metahash, dict)`` with the standard meta
            data fields and text_data is a list of ``(metahash, text)``.
    """

    dataset = "OnlineParticipation"
    print("process", dataset)

    def process_subset(orig_data, subset):
        """
        :param orig_data: list of dictionaries in original key/value format
        :param subset:    name of the data subset, taken from the file name
        :return: tuple (meta_data, text_data) in standard key/value format
        """
        # suggestion id -> category; records without a category of their own
        # look it up here, so they must come after the record that defines it
        category_lookup = {}
        print('process', subset)

        meta_data = []
        text_data = []
        for d in orig_data:
            nd = dict()
            nd[ID] = d['suggestion_id']
            print(subset, nd[ID])
            nd[DATASET] = dataset
            nd[SUBSET] = subset
            nd[TITLE] = d['title']

            # wuppertal has a different data scheme
            if subset == 'wuppertal2017':
                if 'tags' in d:
                    nd[CAT] = tuple(d['tags'])
                    category_lookup[nd[ID]] = nd[CAT]
                else:
                    nd[CAT] = category_lookup[nd[ID]]
                nd[ID2] = 0
                # further fields that are currently not part of the text:
                #   'Eigene Rolle bei der Projektidee'
                #   'Geschätzte Umsetzungsdauer und Startschuss'
                #   'Kostenschätzung der Ideeneinreicher'
                text = d['title'] + ' .\n' \
                    + d['content'] + ' .\n' \
                    + d['Voraussichtliche Rolle für die Stadt Wuppertal'] + ' .\n' \
                    + d['Mehrwert der Idee für Wuppertal'] + ' .\n'

            else:
                if 'category' in d:
                    nd[CAT] = d['category']
                    category_lookup[nd[ID]] = nd[CAT]
                else:
                    nd[CAT] = category_lookup[nd[ID]]
                nd[ID2] = d['comment_id'] if ('comment_id' in d) else 0
                # ignore if no content
                if not d['content']:
                    continue
                text = d['title'] + ' .\n' + d['content'] if d['title'] else d['content']

            # NOTE(review): hash() of strings is salted per interpreter process
            # (PYTHONHASHSEED), so these keys only link meta and text data within
            # one run; they are not stable if the results are persisted.
            metahash = hash((nd[DATASET], nd[SUBSET], nd[ID], nd[ID2], nd[TITLE]))
            meta_data.append((metahash, nd))
            text_data.append((metahash, text))

        return meta_data, text_data

    # --- open files ---
    local_path = "OnlineParticipationDatasets/downloads"
    full_path = os.path.join(data_base, local_path)

    for root, dirs, files in os.walk(full_path, topdown=False):
        for name in files:
            if name[-9:-5] != 'flat':
                continue
            fpath = os.path.join(root, name)
            try:
                # JSON is UTF-8 by specification; don't depend on the locale default
                with open(fpath, 'r', encoding='utf-8') as fp:
                    print('open:', fpath)
                    data = json.load(fp)
            except IOError:
                print("Could not open", fpath)
                continue
            if not data:
                continue
            # strip the 6 character prefix and the 10 character suffix of the
            # file name, e.g. 'items_bonn2017_flat.json' -> 'bonn2017'
            subset = name[6:-10]

            yield process_subset(data, subset)


# Collect the results of all data files. Each item yielded by participation_data()
# is the (meta_data, text_data) pair of one file, so both lists hold one
# sub-list per file.
meta_data = []
text_data = []
for item in participation_data():
    meta_data.append(item[0])
    text_data.append(item[1])
    
    
# data set, data subset, doc id, doc title, doc category


process OnlineParticipation
open: ../../master_cloud/corpora/OnlineParticipationDatasets/downloads/items_bonn2017_flat.json
process bonn2017
bonn2017 985
bonn2017 988
bonn2017 986
bonn2017 987
bonn2017 983
bonn2017 979
bonn2017 980
bonn2017 982
bonn2017 984
bonn2017 990
bonn2017 941
bonn2017 948
bonn2017 948
bonn2017 942
bonn2017 942
bonn2017 957
bonn2017 940
bonn2017 949
bonn2017 975
bonn2017 959
bonn2017 959
bonn2017 978
bonn2017 977
bonn2017 977
bonn2017 921
bonn2017 921
bonn2017 925
bonn2017 925
bonn2017 925
bonn2017 923
bonn2017 923
bonn2017 923
bonn2017 930
bonn2017 930
bonn2017 930
bonn2017 930
bonn2017 922
bonn2017 922
bonn2017 928
bonn2017 928
bonn2017 928
bonn2017 928
bonn2017 928
bonn2017 929
bonn2017 929
bonn2017 929
bonn2017 929
bonn2017 929
bonn2017 929
bonn2017 929
bonn2017 929
bonn2017 929
bonn2017 929
bonn2017 929
bonn2017 929
bonn2017 933
bonn2017 932
bonn2017 934
bonn2017 934
bonn2017 934
bonn2017 913
bonn2017 909
bonn2017 909
bonn2017 909
bonn2017 909
bonn2017 909
b

badgodesberg 2697
badgodesberg 2697
badgodesberg 2702
badgodesberg 2702
badgodesberg 2703
badgodesberg 2704
badgodesberg 2699
badgodesberg 2699
badgodesberg 2695
badgodesberg 2691
badgodesberg 2687
badgodesberg 2687
badgodesberg 2689
badgodesberg 2690
badgodesberg 2690
badgodesberg 2694
badgodesberg 2692
badgodesberg 2693
badgodesberg 2686
badgodesberg 2688
badgodesberg 2685
badgodesberg 2685
badgodesberg 2685
badgodesberg 2685
badgodesberg 2685
badgodesberg 2685
badgodesberg 2685
badgodesberg 2681
badgodesberg 2680
badgodesberg 2678
badgodesberg 2676
badgodesberg 2677
badgodesberg 2682
badgodesberg 2683
badgodesberg 2683
badgodesberg 2683
badgodesberg 2684
badgodesberg 2679
badgodesberg 2679
badgodesberg 2675
badgodesberg 2670
badgodesberg 2670
badgodesberg 2670
badgodesberg 2666
badgodesberg 2666
badgodesberg 2668
badgodesberg 2668
badgodesberg 2665
badgodesberg 2665
badgodesberg 2669
badgodesberg 2673
badgodesberg 2673
badgodesberg 2673
badgodesberg 2672
badgodesberg 2671
badgodesbe

bonn2019 2954
bonn2019 2954
bonn2019 2954
bonn2019 2954
bonn2019 2966
bonn2019 2966
bonn2019 2966
bonn2019 2966
bonn2019 2966
bonn2019 2965
bonn2019 2965
bonn2019 2965
bonn2019 2965
bonn2019 2965
bonn2019 2965
bonn2019 2965
bonn2019 2965
bonn2019 2965
bonn2019 2950
bonn2019 2950
bonn2019 2950
bonn2019 2950
bonn2019 2950
bonn2019 2950
bonn2019 2950
bonn2019 2950
bonn2019 2950
bonn2019 2950
bonn2019 2950
bonn2019 2952
bonn2019 2952
bonn2019 2951
bonn2019 2951
bonn2019 2951
bonn2019 2951
bonn2019 2972
bonn2019 2972
bonn2019 2972
bonn2019 2969
bonn2019 2969
bonn2019 2969
bonn2019 2969
bonn2019 2969
bonn2019 2969
bonn2019 2969
bonn2019 2969
bonn2019 2969
bonn2019 2969
bonn2019 2969
bonn2019 2937
bonn2019 2937
bonn2019 2937
bonn2019 2937
bonn2019 2943
bonn2019 2943
bonn2019 2943
bonn2019 2943
bonn2019 2943
bonn2019 2943
bonn2019 2943
bonn2019 2943
bonn2019 2943
bonn2019 2943
open: ../../master_cloud/corpora/OnlineParticipationDatasets/downloads/items_bonn2011_flat.json
open: ../../master_clo

koeln2012 541
koeln2012 543
koeln2012 543
koeln2012 543
koeln2012 572
koeln2012 540
koeln2012 540
koeln2012 540
koeln2012 540
koeln2012 540
koeln2012 540
koeln2012 540
koeln2012 540
koeln2012 540
koeln2012 540
koeln2012 540
koeln2012 545
koeln2012 545
koeln2012 545
koeln2012 547
koeln2012 546
koeln2012 546
koeln2012 546
koeln2012 546
koeln2012 546
koeln2012 548
koeln2012 550
koeln2012 550
koeln2012 532
koeln2012 532
koeln2012 532
koeln2012 549
koeln2012 549
koeln2012 551
koeln2012 551
koeln2012 551
koeln2012 530
koeln2012 530
koeln2012 531
koeln2012 531
koeln2012 533
koeln2012 534
koeln2012 536
koeln2012 536
koeln2012 535
koeln2012 535
koeln2012 537
koeln2012 538
koeln2012 538
koeln2012 539
koeln2012 539
koeln2012 539
koeln2012 539
koeln2012 519
koeln2012 519
koeln2012 521
koeln2012 521
koeln2012 521
koeln2012 521
koeln2012 524
koeln2012 523
koeln2012 522
koeln2012 525
koeln2012 525
koeln2012 526
koeln2012 527
koeln2012 527
koeln2012 527
koeln2012 527
koeln2012 527
koeln2012 527
koeln2

koeln2012 204
koeln2012 186
koeln2012 186
koeln2012 186
koeln2012 186
koeln2012 186
koeln2012 186
koeln2012 187
koeln2012 188
koeln2012 188
koeln2012 188
koeln2012 189
koeln2012 189
koeln2012 189
koeln2012 189
koeln2012 189
koeln2012 189
koeln2012 190
koeln2012 190
koeln2012 190
koeln2012 190
koeln2012 190
koeln2012 191
koeln2012 193
koeln2012 193
koeln2012 193
koeln2012 193
koeln2012 193
koeln2012 192
koeln2012 192
koeln2012 194
koeln2012 194
koeln2012 195
koeln2012 195
koeln2012 195
koeln2012 206
koeln2012 206
koeln2012 206
koeln2012 206
koeln2012 206
koeln2012 206
koeln2012 206
koeln2012 206
koeln2012 206
koeln2012 176
koeln2012 176
koeln2012 176
koeln2012 176
koeln2012 176
koeln2012 176
koeln2012 176
koeln2012 176
koeln2012 176
koeln2012 176
koeln2012 176
koeln2012 178
koeln2012 178
koeln2012 178
koeln2012 177
koeln2012 177
koeln2012 177
koeln2012 177
koeln2012 177
koeln2012 177
koeln2012 180
koeln2012 180
koeln2012 180
koeln2012 180
koeln2012 179
koeln2012 179
koeln2012 179
koeln2

koeln2016 829
koeln2016 829
koeln2016 143
koeln2016 143
koeln2016 143
koeln2016 143
koeln2016 143
koeln2016 754
koeln2016 754
koeln2016 754
koeln2016 754
koeln2016 754
koeln2016 754
koeln2016 754
koeln2016 618
koeln2016 618
koeln2016 618
koeln2016 155
koeln2016 155
koeln2016 155
koeln2016 155
koeln2016 155
koeln2016 155
koeln2016 155
koeln2016 155
koeln2016 155
koeln2016 155
koeln2016 155
koeln2016 155
koeln2016 155
koeln2016 155
koeln2016 834
koeln2016 190
koeln2016 190
koeln2016 190
koeln2016 190
koeln2016 190
koeln2016 167
koeln2016 167
koeln2016 570
koeln2016 570
koeln2016 570
koeln2016 570
koeln2016 570
koeln2016 570
koeln2016 570
koeln2016 570
koeln2016 570
koeln2016 570
koeln2016 355
koeln2016 18
koeln2016 18
koeln2016 18
koeln2016 18
koeln2016 706
koeln2016 706
koeln2016 706
koeln2016 706
koeln2016 706
koeln2016 706
koeln2016 833
koeln2016 635
koeln2016 635
koeln2016 635
koeln2016 635
koeln2016 635
koeln2016 140
koeln2016 140
koeln2016 140
koeln2016 140
koeln2016 651
koeln2016 

koeln2016 93
koeln2016 175
koeln2016 175
koeln2016 614
koeln2016 132
koeln2016 759
koeln2016 759
koeln2016 759
koeln2016 397
koeln2016 397
koeln2016 397
koeln2016 397
koeln2016 215
koeln2016 215
koeln2016 168
koeln2016 327
koeln2016 327
koeln2016 327
koeln2016 327
koeln2016 551
koeln2016 219
koeln2016 309
koeln2016 172
koeln2016 91
koeln2016 91
koeln2016 91
koeln2016 91
koeln2016 91
koeln2016 91
koeln2016 91
koeln2016 91
koeln2016 91
koeln2016 91
koeln2016 91
koeln2016 91
koeln2016 91
koeln2016 91
koeln2016 91
koeln2016 745
koeln2016 510
koeln2016 732
koeln2016 732
koeln2016 839
koeln2016 24
koeln2016 509
koeln2016 154
koeln2016 154
koeln2016 254
koeln2016 254
koeln2016 403
koeln2016 403
koeln2016 403
koeln2016 426
koeln2016 426
koeln2016 5
koeln2016 572
koeln2016 488
koeln2016 488
koeln2016 488
koeln2016 488
koeln2016 488
koeln2016 488
koeln2016 57
koeln2016 542
koeln2016 700
koeln2016 305
koeln2016 305
koeln2016 82
koeln2016 82
koeln2016 82
koeln2016 82
koeln2016 82
koeln2016 549
koe

bonn2015 539
bonn2015 539
bonn2015 539
bonn2015 539
bonn2015 539
bonn2015 539
bonn2015 539
bonn2015 539
bonn2015 539
bonn2015 539
bonn2015 539
bonn2015 539
bonn2015 538
bonn2015 538
bonn2015 538
bonn2015 538
bonn2015 514
bonn2015 514
bonn2015 514
bonn2015 514
bonn2015 514
bonn2015 514
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 512
bonn2015 516
bonn2015 516
bonn2015 516
bonn2015 516
bonn2015 516
bonn2015 517
bonn2015 517
bonn2015 517
bonn2015 517
bonn2015 517
bonn2015 520
bonn2015 522
bonn2015 522
bonn2015 522
bonn2015 526
bonn2015 526
bonn2015 526
bonn2015 526
bonn2015 526
bonn2015 523
bonn2015 523
bonn2015 523
bonn2015 524
bonn2015 524
bonn2015 524
bonn2015 524
bonn2015 525
bonn2015 525
bonn2015 525
bonn2015 525
bonn2015 525
bonn2015 525
bonn2015 525
bonn2015 525
bonn2015 525
bonn2015 525
bonn2015 525

bonn2015 354
bonn2015 354
bonn2015 354
bonn2015 354
bonn2015 354
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 355
bonn2015 361
bonn2015 361
bonn2015 361
bonn2015 361
bonn2015 361
bonn2015 361
bonn2015 361
bonn2015 361
bonn2015 361
bonn2015 361
bonn2015 361
bonn2015 361
bonn2015 341
bonn2015 341
bonn2015 341
bonn2015 341
bonn2015 341
bonn2015 341
bonn2015 341
bonn2015 347
bonn2015 347
bonn2015 347
bonn2015 347
bonn2015 346
bonn2015 346
bonn2015 346
bonn2015 346
bonn2015 346
bonn2015 346
bonn2015 346
bonn2015 346
bonn2015 346
bonn2015 346
bonn2015 346
bonn2015 348
bonn2015 348
bonn2015 348
bonn2015 348
bonn2015 348
bonn2015 348
bonn2015 348
bonn2015 348
bonn2015 348
bonn2015 348
bonn2015 348
bonn2015 348
bonn2015 348
bonn2015 348
bonn2015 348
bonn2015 344
bonn2015 344
bonn2015 344
bonn2015 344
bonn2015 344
bonn2015 345
bonn2015 345

bonn2015 214
bonn2015 214
bonn2015 214
bonn2015 214
bonn2015 214
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 218
bonn2015 217
bonn2015 217
bonn2015 217
bonn2015 217
bonn2015 217
bonn2015 217
bonn2015 217
bonn2015 217
bonn2015 217
bonn2015 217
bonn2015 217
bonn2015 217

maengelmelder-braunschweig 3967
maengelmelder-braunschweig 3966
maengelmelder-braunschweig 3965
maengelmelder-braunschweig 3964
maengelmelder-braunschweig 3963
maengelmelder-braunschweig 3962
maengelmelder-braunschweig 3961
maengelmelder-braunschweig 3960
maengelmelder-braunschweig 3959
maengelmelder-braunschweig 3958
maengelmelder-braunschweig 3957
maengelmelder-braunschweig 3956
maengelmelder-braunschweig 3955
maengelmelder-braunschweig 3954
maengelmelder-braunschweig 3953
maengelmelder-braunschweig 3952
maengelmelder-braunschweig 3951
maengelmelder-braunschweig 3950
maengelmelder-braunschweig 3949
maengelmelder-braunschweig 3948
maengelmelder-braunschweig 3947
maengelmelder-braunschweig 3945
maengelmelder-braunschweig 3944
maengelmelder-braunschweig 3943
maengelmelder-braunschweig 3942
maengelmelder-braunschweig 3941
maengelmelder-braunschweig 3940
maengelmelder-braunschweig 3939
maengelmelder-braunschweig 3938
maengelmelder-braunschweig 3937
maengelmelder-braunschweig 3936
maengelm

maengelmelder-braunschweig 2618
maengelmelder-braunschweig 2617
maengelmelder-braunschweig 2616
maengelmelder-braunschweig 2615
maengelmelder-braunschweig 2614
maengelmelder-braunschweig 2613
maengelmelder-braunschweig 2612
maengelmelder-braunschweig 2611
maengelmelder-braunschweig 2610
maengelmelder-braunschweig 2609
maengelmelder-braunschweig 2608
maengelmelder-braunschweig 2607
maengelmelder-braunschweig 2606
maengelmelder-braunschweig 2605
maengelmelder-braunschweig 2604
maengelmelder-braunschweig 2603
maengelmelder-braunschweig 2602
maengelmelder-braunschweig 2601
maengelmelder-braunschweig 2600
maengelmelder-braunschweig 2599
maengelmelder-braunschweig 2598
maengelmelder-braunschweig 2596
maengelmelder-braunschweig 2595
maengelmelder-braunschweig 2594
maengelmelder-braunschweig 2593
maengelmelder-braunschweig 2592
maengelmelder-braunschweig 2591
maengelmelder-braunschweig 2590
maengelmelder-braunschweig 2589
maengelmelder-braunschweig 2588
maengelmelder-braunschweig 2587
maengelm

koeln2013 B-106
koeln2013 B-106
koeln2013 B-106
koeln2013 B-106
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-116
koeln2013 B-142
koeln2013 B-142
koeln2013 B-142
koeln2013 B-142
koeln2013 B-142
koeln2013 B-142
koeln2013 B-142
koeln2013 B-142
koeln2013 B-142
koeln2013 B-142
koeln2013 B-142
koeln2013 B-142
koeln2013 B-142
koeln2013 B-47
koeln2013 B-47
koeln2013 B-47
koeln2013 B-47
koeln2013 B-47
koeln2013 B-47
koeln2013 B-47
koeln2013 B-47
koeln2013 B-47
koeln2013 B-47
ko

koeln2013 B-156
koeln2013 B-156
koeln2013 B-592
koeln2013 B-592
koeln2013 B-592
koeln2013 B-172
koeln2013 B-172
koeln2013 B-172
koeln2013 B-172
koeln2013 B-172
koeln2013 B-172
koeln2013 B-172
koeln2013 B-543
koeln2013 B-543
koeln2013 B-543
koeln2013 B-543
koeln2013 B-543
koeln2013 B-543
koeln2013 B-543
koeln2013 B-538
koeln2013 B-538
koeln2013 B-538
koeln2013 B-542
koeln2013 B-125
koeln2013 B-125
koeln2013 B-125
koeln2013 B-300
koeln2013 B-300
koeln2013 B-300
koeln2013 B-300
koeln2013 B-300
koeln2013 B-300
koeln2013 B-300
koeln2013 B-300
koeln2013 B-300
koeln2013 B-367
koeln2013 B-367
koeln2013 B-367
koeln2013 B-367
koeln2013 B-367
koeln2013 B-367
koeln2013 B-635
koeln2013 B-635
koeln2013 B-635
koeln2013 B-635
koeln2013 B-232
koeln2013 B-232
koeln2013 B-232
koeln2013 B-232
koeln2013 B-232
koeln2013 B-232
koeln2013 B-389
koeln2013 B-389
koeln2013 B-389
koeln2013 B-642
koeln2013 B-642
koeln2013 B-591
koeln2013 B-591
koeln2013 B-285
koeln2013 B-285
koeln2013 B-285
koeln2013 B-285
koeln201

koeln2013 B-97
koeln2013 B-97
koeln2013 B-97
koeln2013 B-97
koeln2013 B-97
koeln2013 B-97
koeln2013 B-29
koeln2013 B-29
koeln2013 B-29
koeln2013 B-202
koeln2013 B-202
koeln2013 B-202
koeln2013 B-211
koeln2013 B-211
koeln2013 B-211
koeln2013 B-557
koeln2013 B-558
koeln2013 B-558
koeln2013 B-253
koeln2013 B-253
koeln2013 B-253
koeln2013 B-253
koeln2013 B-253
koeln2013 B-253
koeln2013 B-253
koeln2013 B-253
koeln2013 B-253
koeln2013 B-610
koeln2013 B-610
koeln2013 B-610
koeln2013 B-117
koeln2013 B-117
koeln2013 B-117
koeln2013 B-117
koeln2013 B-117
koeln2013 B-117
koeln2013 B-220
koeln2013 B-220
koeln2013 B-220
koeln2013 B-220
koeln2013 B-89
koeln2013 B-89
koeln2013 B-89
koeln2013 B-89
koeln2013 B-89
koeln2013 B-89
koeln2013 B-89
koeln2013 B-89
koeln2013 B-89
koeln2013 B-89
koeln2013 B-89
koeln2013 B-89
koeln2013 B-89
koeln2013 B-76
koeln2013 B-76
koeln2013 B-76
koeln2013 B-76
koeln2013 B-76
koeln2013 B-76
koeln2013 B-76
koeln2013 B-76
koeln2013 B-76
koeln2013 B-76
koeln2013 B-76
koeln2013

koeln2015 145
koeln2015 145
koeln2015 145
koeln2015 252
koeln2015 252
koeln2015 252
koeln2015 252
koeln2015 252
koeln2015 252
koeln2015 252
koeln2015 252
koeln2015 79
koeln2015 79
koeln2015 79
koeln2015 79
koeln2015 79
koeln2015 79
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 177
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 29
koeln2015 200
koeln2015 200
koeln2015 200
koeln2015 200
koeln2015 200
koeln2015 275
koeln2015 275
koeln2015 275
koeln2015 275
koeln2015 275
koeln2015 275
koeln2015 275
koeln2015 275
koeln2015 275
koeln2015 275
koeln2015 275
koeln2015 275
koeln2015 275
koeln2015 275
koeln2015 275
koeln2015 275
ko

koeln2015 492
koeln2015 551
koeln2015 551
koeln2015 551
koeln2015 551
koeln2015 602
koeln2015 602
koeln2015 602
koeln2015 436
koeln2015 506
koeln2015 170
koeln2015 170
koeln2015 170
koeln2015 127
koeln2015 560
koeln2015 563
koeln2015 193
koeln2015 346
koeln2015 650
koeln2015 241
koeln2015 241
koeln2015 241
koeln2015 241
koeln2015 479
koeln2015 479
koeln2015 604
koeln2015 604
koeln2015 476
koeln2015 150
koeln2015 434
koeln2015 434
koeln2015 274
koeln2015 274
koeln2015 274
koeln2015 240
koeln2015 240
koeln2015 38
koeln2015 505
koeln2015 505
koeln2015 505
koeln2015 505
koeln2015 505
koeln2015 505
koeln2015 505
koeln2015 505
koeln2015 505
koeln2015 505
koeln2015 505
koeln2015 543
koeln2015 543
koeln2015 569
koeln2015 157
koeln2015 389
koeln2015 541
koeln2015 144
koeln2015 356
koeln2015 356
koeln2015 580
koeln2015 580
koeln2015 361
koeln2015 455
koeln2015 455
koeln2015 459
koeln2015 459
koeln2015 459
koeln2015 483
koeln2015 627
koeln2015 630
koeln2015 630
koeln2015 249
koeln2015 249
koeln20

raddialog-moers 1758
raddialog-moers 1760
raddialog-moers 1762
raddialog-moers 1757
raddialog-moers 1772
raddialog-moers 1772
raddialog-moers 1773
raddialog-moers 1774
raddialog-moers 1771
raddialog-moers 1771
raddialog-moers 1771
raddialog-moers 1768
raddialog-moers 1770
raddialog-moers 1770
raddialog-moers 1769
raddialog-moers 1769
raddialog-moers 1775
raddialog-moers 1776
raddialog-moers 1767
raddialog-moers 1777
raddialog-moers 1787
raddialog-moers 1786
raddialog-moers 1783
raddialog-moers 1784
raddialog-moers 1785
raddialog-moers 1785
raddialog-moers 1779
raddialog-moers 1779
raddialog-moers 1780
raddialog-moers 1778
raddialog-moers 1795
raddialog-moers 1794
raddialog-moers 1796
raddialog-moers 1790
raddialog-moers 1788
raddialog-moers 1792
raddialog-moers 1798
raddialog-moers 1798
raddialog-moers 1791
raddialog-moers 1793
raddialog-moers 1793
raddialog-moers 1789
raddialog-moers 1789
raddialog-moers 1782
raddialog-moers 1782
raddialog-moers 1807
raddialog-moers 1808
raddialog-moe

raddialog-bonn 1792
raddialog-bonn 1792
raddialog-bonn 1792
raddialog-bonn 1794
raddialog-bonn 1793
raddialog-bonn 1793
raddialog-bonn 1793
raddialog-bonn 1795
raddialog-bonn 1795
raddialog-bonn 1798
raddialog-bonn 1796
raddialog-bonn 1796
raddialog-bonn 1796
raddialog-bonn 1797
raddialog-bonn 1797
raddialog-bonn 1797
raddialog-bonn 1797
raddialog-bonn 1802
raddialog-bonn 1810
raddialog-bonn 1810
raddialog-bonn 1809
raddialog-bonn 1811
raddialog-bonn 1804
raddialog-bonn 1804
raddialog-bonn 1805
raddialog-bonn 1805
raddialog-bonn 1805
raddialog-bonn 1803
raddialog-bonn 1803
raddialog-bonn 1803
raddialog-bonn 1803
raddialog-bonn 1803
raddialog-bonn 1806
raddialog-bonn 1806
raddialog-bonn 1807
raddialog-bonn 1807
raddialog-bonn 1808
raddialog-bonn 1812
raddialog-bonn 1812
raddialog-bonn 1821
raddialog-bonn 1817
raddialog-bonn 1818
raddialog-bonn 1815
raddialog-bonn 1815
raddialog-bonn 1814
raddialog-bonn 1814
raddialog-bonn 1816
raddialog-bonn 1816
raddialog-bonn 1816
raddialog-bonn 1816


raddialog-bonn 2508
raddialog-bonn 2508
raddialog-bonn 2508
raddialog-bonn 2505
raddialog-bonn 2505
raddialog-bonn 2503
raddialog-bonn 2503
raddialog-bonn 2502
raddialog-bonn 2502
raddialog-bonn 2502
raddialog-bonn 2504
raddialog-bonn 2504
raddialog-bonn 2515
raddialog-bonn 2515
raddialog-bonn 2515
raddialog-bonn 2511
raddialog-bonn 2511
raddialog-bonn 2518
raddialog-bonn 2519
raddialog-bonn 2519
raddialog-bonn 2516
raddialog-bonn 2516
raddialog-bonn 2520
raddialog-bonn 2520
raddialog-bonn 2520
raddialog-bonn 2517
raddialog-bonn 2517
raddialog-bonn 2517
raddialog-bonn 2513
raddialog-bonn 2513
raddialog-bonn 2514
raddialog-bonn 2512
raddialog-bonn 2521
raddialog-bonn 2521
raddialog-bonn 2529
raddialog-bonn 2526
raddialog-bonn 2526
raddialog-bonn 2526
raddialog-bonn 2526
raddialog-bonn 2531
raddialog-bonn 2528
raddialog-bonn 2528
raddialog-bonn 2528
raddialog-bonn 2528
raddialog-bonn 2530
raddialog-bonn 2530
raddialog-bonn 2530
raddialog-bonn 2530
raddialog-bonn 2530
raddialog-bonn 2530


raddialog-bonn 3336
raddialog-bonn 3333
raddialog-bonn 3334
raddialog-bonn 3334
raddialog-bonn 3342
raddialog-bonn 3342
raddialog-bonn 3350
raddialog-bonn 3350
raddialog-bonn 3351
raddialog-bonn 3353
raddialog-bonn 3353
raddialog-bonn 3352
raddialog-bonn 3345
raddialog-bonn 3349
raddialog-bonn 3346
raddialog-bonn 3346
raddialog-bonn 3344
raddialog-bonn 3343
raddialog-bonn 3343
raddialog-bonn 3343
raddialog-bonn 3363
raddialog-bonn 3363
raddialog-bonn 3363
raddialog-bonn 3363
raddialog-bonn 3364
raddialog-bonn 3364
raddialog-bonn 3354
raddialog-bonn 3357
raddialog-bonn 3357
raddialog-bonn 3355
raddialog-bonn 3362
raddialog-bonn 3356
raddialog-bonn 3361
raddialog-bonn 3361
raddialog-bonn 3359
raddialog-bonn 3358
raddialog-bonn 3358
raddialog-bonn 3365
raddialog-bonn 3369
raddialog-bonn 3377
raddialog-bonn 3376
raddialog-bonn 3376
raddialog-bonn 3376
raddialog-bonn 3376
raddialog-bonn 3374
raddialog-bonn 3374
raddialog-bonn 3374
raddialog-bonn 3375
raddialog-bonn 3373
raddialog-bonn 3368


In [None]:
# build spacy doc
# Runs the first 20 records through spacy and reduces each of them to its list
# of noun / noun phrase lemmas (see process_doc).
# NOTE(review): `data` is not assigned at module level in the visible cells (it
# is only a local inside participation_data) — presumably left over from an
# earlier notebook state; verify before running.
documents = []

for d in data[:20]:
    raw = d['title'] + ' .\n' + d['content']
    doc = nlp(raw)
    doc.user_data['title'] = d['title']
    # print(doc)

    doc_proc = process_doc(doc)
    # print(doc_proc)
    # print()
    documents.append(doc_proc)

In [None]:
# gensim phrase extraction --- example
# Stand-alone demo on gensim's bundled test corpus; it does not touch the
# participation data processed above.

from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser

sentences = Text8Corpus(datapath('testcorpus.txt'))
for s in sentences:
    print(s)
phrases = Phrases(sentences, min_count=1, threshold=1)  # train model
phrases[[u'trees', u'graph', u'minors']]  # apply model to sentence

phrases.add_vocab([["hello", "world"], ["meow"]])  # update model with new sentences
bigram = Phraser(phrases)  # construct faster model (this is only a wrapper)
bigram[[u'trees', u'graph', u'minors']]  # apply model to sentence

for sent in bigram[sentences]:  # apply model to text corpus
     pass

### Named Entity Recognition
Not sure if I want to use these.

In [None]:
# one (text, start offset, end offset, label) tuple per named entity of `doc`
ents = [
    (ent.text, ent.start_char, ent.end_char, ent.label_)
    for ent in doc.ents
]

# name the positional columns; the mapping for column 4 has no effect because
# the tuples above only have four fields
column_names = {0: "Text", 1: "Start", 2: "End", 3: "Label",
                4: "Description"}
df_ent = pd.DataFrame(ents).rename(columns=column_names)

tprint(df_ent, 10)

### Noun chunks
May be replaced with phrases.

In [None]:
# Export the LOWER, POS and LEMMA attributes of the tokens of `doc` (the last
# doc created in an earlier cell) via doc.to_array.
from spacy.attrs import LOWER, POS, LEMMA
doc_array = doc.to_array([LOWER, POS, LEMMA])
#doc_array

In [None]:
# similarity between the first two tokens of `doc` (displayed as cell output)
doc[0].similarity(doc[1])

In [None]:
# spellchecking
# may be used to correct case errors -> very limited usage

spellchecker = hunspell.HunSpell('/usr/share/hunspell/de_DE.dic', 
                                 '/usr/share/hunspell/de_DE.aff')
enc = spellchecker.get_dic_encoding()  # 'ISO8859-1' might be an issue

# NOTE(review): `df_doc` is not assigned in the visible cells — presumably the
# result of df_from_doc() from an earlier notebook state; verify before running.
df_noun = df_doc[df_doc.POS == 'NOUN'].copy()
# 'Spell': result of the dictionary check, 'Suggest': proposed corrections
df_noun['Spell'] = \
    df_noun['Text'].map(lambda noun: spellchecker.spell(noun))
df_noun['Suggest'] = \
    df_noun['Text'].map(lambda noun: spellchecker.suggest(noun))

tprint(df_noun[['Text', 'Spell', 'Suggest']])

## Alternative:
from spacy_hunspell import spaCyHunSpell

# NOTE: this replaces the German pipeline in `nlp` with an English one for all
# cells that run afterwards.
nlp = spacy.load('en_core_web_sm')
# Do not call this variable `hunspell`: that would shadow the imported hunspell
# module and break `hunspell.HunSpell(...)` above when the cell is run again.
hunspell_pipe = spaCyHunSpell(nlp, 'mac')
nlp.add_pipe(hunspell_pipe)
doc = nlp('I can haz cheezeburger.')
haz = doc[2]
haz._.hunspell_spell  # False
haz._.hunspell_suggest

In [None]:
# Keyword based entity lookup with spacy_lookup --- example
from spacy_lookup import Entity

# NOTE(review): uses whatever pipeline `nlp` currently holds; the English
# example sentence presumably expects the English model loaded in the previous
# cell — confirm.
#nlp = spacy.load('en')
entity = Entity(nlp, keywords_list=['python', 'java platform'])
nlp.add_pipe(entity, last=True)

doc = nlp(u"I am a product manager for a java and python.")
# the extension attributes flag entities on doc, span and token level
assert doc._.has_entities == True
assert doc[2:5]._.has_entities == True
assert doc[0]._.is_entity == False
assert doc[3]._.is_entity == True
print(doc._.entities)

In [None]:
# Phrase