### Default Imports

In [73]:
import spacy
import pandas as pd
import sys
import re
from os import listdir, makedirs
from os.path import isfile, join, exists
# from iwnlp.iwnlp_wrapper import IWNLPWrapper
from spacy_iwnlp import spaCyIWNLP
import gensim
from time import time

In [2]:
### --- default constants definitions ---

# Directory layout. All paths are relative to the notebook's working directory.
DATA_BASE = "../../master_cloud/corpora"
# Sub-directory of DATA_BASE whose files are listed and read by the "run notebook" cell.
ETL_BASE = "preprocessed"
ETL_PATH = join(DATA_BASE, ETL_BASE)
NLP_BASE = "preprocessed/nlp"
NLP_PATH = join(DATA_BASE, NLP_BASE)
SPACY_PATH = join(NLP_PATH, 'spacy_model')
# If this path exists, the spaCy vocab is read from it right after the model is loaded.
VOCAB_PATH = join(SPACY_PATH, 'vocab')

# standard meta data fields (column names of the corpus dataframes)
DATASET = 'dataset'
SUBSET = 'subset'
ID = 'doc_id'
ID2 = 'doc_subid'
TITLE = 'title'
TAGS = 'tags'
TIME = 'date_time'
# Further candidate fields that are not defined yet:
# AUTHOR
# SUBTITLE
# CATEGORY
META = [DATASET, SUBSET, ID, ID2, TITLE, TAGS, TIME]
# Column holding the raw document text; this is what gets sent through the spaCy pipeline.
TEXT = 'text'
# Name of the document hash column (used as a column of the sentence table).
HASH = 'hash'

### --- additional constants

# tags
# NOTE(review): presumably tag values for punctuation, determiners and detected phrases -
# they are not referenced in the visible cells, confirm against their consumers.
PUNCT = 'PUNCT'
DET = 'DET'
PHRASE = 'PHRASE'

# keys
# NOUN and PROPN are compared against `token.pos_` in `iwnlp_plus`.
# NOTE(review): the remaining keys are not referenced in the visible cells.
IWNLP = 'IWNLP'
POS = 'POS'
INDEX = 'index'
START = 'start'
NOUN = 'NOUN'
PROPN = 'PROPN'
LEMMA = 'lemma'
TAG = 'tag'
STOP = 'stop'
ENT_TYPE = 'ent_type'
ENT_IOB = 'ent_iob'
KNOWN = 'known'

In [75]:
### --- load spacy and iwnlp ---

# Model to load: the 'de' shortcut by default, or the explicit model directory
# when the first command line argument is '--hpc'.
de = 'de'
if sys.argv[1:2] == ['--hpc']:
    print('on hpc')
    de = '/home/funkea/.local/lib/python3.4/site-packages/de_core_news_sm/de_core_news_sm-2.0.0'

print("loading spacy")
# The full pipeline including the dependency parser is loaded (slower); later cells
# iterate over `doc.sents`. Faster alternative without the parser:
#   nlp = spacy.load(de, disable=['parser'])
nlp = spacy.load(de)

# Re-use a previously stored vocab if one is available on disk.
if exists(VOCAB_PATH):
    print("reading vocab from", VOCAB_PATH)
    nlp.vocab.from_disk(VOCAB_PATH)

print("loading IWNLPWrapper")
# The spaCyIWNLP component replaces the former direct use of the wrapper:
#   lemmatizer = IWNLPWrapper(lemmatizer_path='../data/IWNLP.Lemmatizer_20170501.json')
iwnlp = spaCyIWNLP(lemmatizer_path='../data/IWNLP.Lemmatizer_20170501.json')
nlp.add_pipe(iwnlp)

loading spacy
reading vocab from ../../master_cloud/corpora/preprocessed/nlp/spacy_model/vocab
loading IWNLPWrapper


In [None]:
# Token extension attributes written by the `iwnlp_plus` pipeline component.
# Guarded so that re-running this cell does not fail on an already registered extension.
if not spacy.tokens.Token.has_extension('iwnlp_plus_lemma'):
    spacy.tokens.Token.set_extension('iwnlp_plus_lemma', default=None)
if not spacy.tokens.Token.has_extension('iwnlp_plus_known'):
    spacy.tokens.Token.set_extension('iwnlp_plus_known', default=False)


def _iwnlp_plus_lemma(token, pos, lemmatizer):
    """
    Lemmatizes a single token with the IWNLP lemmatizer, with a few enhancements for compound
    nouns and nouns with uncommon capitalization. Can also be used to lemmatize tokens with
    different POS-tags. Do not use this function to lemmatize phrases.
    :param token: white space stripped single token (str)
    :param pos:   string constant, one of Universal tagset.
    :param lemmatizer: object offering `lemmatize(word, pos)` and
                       `lemmatize_plain(word, ignore_case=...)`
    :return: tuple of type (str, bool)
           value[0]: The lemma of the token if a lemma can be derived, else None.
           value[1]: True if the default lemmatization succeeded for the token as is, else False.
    """
    # whitespace-only tokens are empty after stripping; nothing to look up
    if not token:
        return None, False

    # default lemmatization ok?
    lemm = lemmatizer.lemmatize(token, pos)
    if lemm:
        return lemm[0], True

    # some rules to derive a lemma from the original token (nouns only)
    # TODO: define rules for hyphenated nouns
    if pos == NOUN or pos == PROPN:
        # first try default noun capitalization
        lemm = lemmatizer.lemmatize(token.title(), pos)
        if lemm:
            return lemm[0], False

        # still no results: try noun suffixes, longest suffix first; the lemma of the
        # suffix is appended in lower case to the untouched head of the compound
        for i in range(1, len(token) - 1):
            lemm = lemmatizer.lemmatize_plain(token[i:].title(), ignore_case=True)
            if lemm:
                return token[:i].title() + lemm[0].lower(), False

    return None, False


def iwnlp_plus(doc: spacy.tokens.Doc) -> spacy.tokens.Doc:
    """
    spaCy pipeline component: applies the enhanced IWNLP lemmatization to every token of `doc`.
    The result is stored per token in two extension attributes:
        token._.iwnlp_plus_lemma: the derived lemma (str) or None if no lemma could be derived
        token._.iwnlp_plus_known: True if the default lemmatization succeeded, else False
    spaCy's own `token.lemma_` is left untouched.
    :param doc: the document to annotate; its tokens must already carry POS tags
    :return: the same `doc` object, as required for a pipeline component
    """
    # NOTE(review): relies on the `spaCyIWNLP` component exposing its wrapped IWNLP lemmatizer
    # as `.lemmatizer` - confirm against the installed spacy_iwnlp version.
    lemmatizer = iwnlp.lemmatizer
    for item in doc:
        lemma, known = _iwnlp_plus_lemma(item.text.strip(), item.pos_, lemmatizer)
        item._.iwnlp_plus_lemma = lemma
        item._.iwnlp_plus_known = known
    return doc


nlp.add_pipe(iwnlp_plus, name='iwnlp_plus')

In [52]:
# Stop words from the defaults of the loaded spaCy language;
# passed to gensim's phrase detection as `common_terms` further down.
stopwords = nlp.Defaults.stop_words

In [36]:
# --- run notebook ---

# Directory that is scanned for corpus files.
LOCAL_PATH = ETL_BASE
FULL_PATH = join(DATA_BASE, LOCAL_PATH)

# Sorted names of all regular files directly inside FULL_PATH (sub-directories are skipped).
files = sorted(entry for entry in listdir(FULL_PATH) if isfile(join(FULL_PATH, entry)))

def read(f):
    """
    Loads a pickled pandas object from disk and announces the path on stdout beforehand.

    :param f: path of the pickle file
    :return:  the object restored by `pd.read_pickle` (a dataframe in this notebook)
    """
    # NOTE: unpickling can execute arbitrary code - only read files written by the own ETL step.
    print("reading corpus from", f)
    corpus_frame = pd.read_pickle(f)
    return corpus_frame

def process_docs(series, size=None):
    """
    main function for sending the dataframes from the ETL pipeline to the NLP pipeline

    NOTE(review): this generator looks unfinished - it only prepares some progress
    bookkeeping and then yields two names that are not defined in the visible source.
    It is not called by any of the visible cells.

    :param series: sized collection of documents; only its length is read here
    :param size:   not used by the visible body
    """
    length = len(series)
    # progress bookkeeping: `percent` is the number of items per one of `steps` parts
    steps = 100
    step_len = 100//steps
    percent = length//steps
    done = 0
    # NOTE(review): `essential_token` and `phrase_lookup` are neither parameters nor locals;
    # unless they exist as globals elsewhere, advancing the generator raises a NameError.
    yield essential_token, phrase_lookup

# Read every corpus file whose name starts with 'Onl'; `df` holds the last one read.
for name in files:
    if name.startswith('Onl'):
        # corpus name = file name up to the first '.' or '_'
        corpus = re.split(r'\.|_', name)[0]
        fname = join(FULL_PATH, name)
        df = read(fname)
    

reading corpus from ../../master_cloud/corpora/preprocessed/OnlineParticipation.pickle


In [35]:
# Send every text through the spaCy pipeline and keep the parsed docs next to the raw text.
t0 = time()
docs = [nlp(text) for text in df[TEXT]]
df['doc'] = docs
t1 = int(time() - t0)
# hh:mm:ss - the minutes field must be taken modulo the hour, otherwise it keeps
# growing past 59 for runs that take longer than one hour
print("finished in {:02d}:{:02d}:{:02d}".format(t1//3600, (t1 % 3600)//60, t1 % 60))
df
# finished in 00:05:36 minutes

finished in 00:05:36 minutes


Unnamed: 0_level_0,dataset,subset,doc_id,doc_subid,title,tags,date_time,text,doc
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
492819066359988780,OnlineParticipation,bonn2017,985,0,Nebentätigkeiten von OB und Kommunalpoilter - ...,,2016-10-06 00:00:00,Nebentätigkeiten von OB und Kommunalpoilter - ...,"(Nebentätigkeiten, von, OB, und, Kommunalpoilt..."
4402470584618071095,OnlineParticipation,bonn2017,988,0,Wache (Gabi) im Bahnhof,,2016-10-06 00:00:00,Wache (Gabi) im Bahnhof .\nDiese wurde vor ca....,"(Wache, (, Gabi, ), im, Bahnhof, ., \n, Diese,..."
6280350261461137971,OnlineParticipation,bonn2017,986,0,Monetäre Sparmaßnahme,Finanzen und Beteiligung,2016-10-06 00:00:00,Monetäre Sparmaßnahme .\nAblösung aller Kredit...,"(Monetäre, Sparmaßnahme, ., \n, Ablösung, alle..."
-3466298431331063864,OnlineParticipation,bonn2017,987,0,Kosteneinsparung bei Friedhof-/Parkanlagen in ...,Finanzen und Beteiligung,2016-10-06 00:00:00,Kosteneinsparung bei Friedhof-/Parkanlagen in ...,"(Kosteneinsparung, bei, Friedhof-/Parkanlagen,..."
-7971046904978124907,OnlineParticipation,bonn2017,983,0,Leerstand,,2016-10-06 00:00:00,Leerstand .\nSiehe Leerstandsmelder und eigene...,"(Leerstand, ., \n, Siehe, Leerstandsmelder, un..."
-5464475141879722540,OnlineParticipation,bonn2017,979,0,Minusstunden bei der Feuerwehr; Öffnungszeiten...,,2016-10-06 00:00:00,Minusstunden bei der Feuerwehr; Öffnungszeiten...,"(Minusstunden, bei, der, Feuerwehr, ;, Öffnung..."
-3377659701997023543,OnlineParticipation,bonn2017,980,0,Stadtverwaltung und Einsparmöglichkeiten,,2016-10-06 00:00:00,Stadtverwaltung und Einsparmöglichkeiten .\nÜb...,"(Stadtverwaltung, und, Einsparmöglichkeiten, ...."
-9081569078624867654,OnlineParticipation,bonn2017,982,0,Aufsichtsrat Stadtwerke Bonn,,2016-10-06 00:00:00,Aufsichtsrat Stadtwerke Bonn .\nIst es wirklic...,"(Aufsichtsrat, Stadtwerke, Bonn, ., \n, Ist, e..."
-6681834770025936415,OnlineParticipation,bonn2017,984,0,Sparkasse Köln Bonn und Ausschüttung der Gewinne,,2016-10-06 00:00:00,Sparkasse Köln Bonn und Ausschüttung der Gewin...,"(Sparkasse, Köln, Bonn, und, Ausschüttung, der..."
5437548212583122831,OnlineParticipation,bonn2017,990,0,Beendigung der Unterstützung der Kunst!Rasen-V...,Freizeit und Sport,2016-10-06 00:00:00,Beendigung der Unterstützung der Kunst!Rasen-V...,"(Beendigung, der, Unterstützung, der, Kunst, !..."


In [77]:
# Build one row per sentence: (document hash, sentence index within the document, lemma list).
lst = []
# Address the parsed docs by column name instead of by tuple position, so the cell
# does not depend on the number and order of the meta data columns in `df`.
for doc_hash, doc in zip(df.index, df['doc']):
    for i, sent in enumerate(doc.sents):
        # prefer the first IWNLP lemma; fall back to spaCy's lemma if IWNLP has none
        sent_list = [
            token._.iwnlp_lemmas[0] if token._.iwnlp_lemmas else token.lemma_
            for token in sent
        ]
        lst.append((doc_hash, i, sent_list))

df_new = pd.DataFrame(lst, columns=[HASH, 'sent_idx', 'sentence'])
df_new

Unnamed: 0,hash,sent_idx,sentence
0,492819066359988780,0,"[Nebentätigkeit, von, OB, und, Kommunalpoilter..."
1,492819066359988780,1,"[Durch, der, sogenannte""öffentliche, Ehrenamt,..."
2,492819066359988780,2,"[Von, dies, Geld, müssen, kein, Cent, an, der,..."
3,492819066359988780,3,"[Das, gehören, abschaffen, .....]"
4,4402470584618071095,0,"[Wache, (, Gabi, ), im, Bahnhof, ., \n]"
5,4402470584618071095,1,"[Diese, werden, vor, ca.5, Jahr, für, sehr, vi..."
6,4402470584618071095,2,"[Jetzt, sollen, einen, neu, Wache, bauen, werd..."
7,4402470584618071095,3,"[Läßt, sich, der, nicht, vermeiden, ?]"
8,4402470584618071095,4,[?]
9,4402470584618071095,5,[?]


In [70]:
# Learn collocations from the lemmatized sentences. Stop words are passed as common terms
# and a candidate must occur at least 3 times.
# Alternative scoring that was tried: threshold=0, scoring='npmi'
phrases = gensim.models.phrases.Phrases(
    df_new['sentence'],
    min_count=3,
    common_terms=stopwords,
)
# Phraser wraps the trained model for the transformation step below.
bigram = gensim.models.phrases.Phraser(phrases)
# Apply the phrase model to every sentence.
texts = [bigram[tokens] for tokens in df_new['sentence']]
texts

[['Nebentätigkeiten',
  'von',
  'OB',
  'und',
  'Kommunalpoilter',
  '-',
  'öffentliches',
  'Ehrenamt',
  '.',
  '\n'],
 ['Durch',
  'das',
  'sogenannte"öffentliche',
  'Ehrenamt',
  '"',
  ' ',
  'verdienen',
  'manche',
  'Komunalpolitiker',
  'und',
  'OB`s',
  ' ',
  'in',
  'den',
  'Verwaltungsräten',
  'und',
  'Aufsichtsräten',
  'viel',
  'Geld',
  '(',
  'bis',
  'zu',
  '5-stellig',
  ')',
  '\n'],
 ['Von',
  'diesem',
  'Geld',
  'muss',
  'kein',
  'Cent',
  'an',
  'die',
  'Stadtkasse',
  'abgeführt',
  'werden',
  'da',
  'es',
  'sich',
  'um',
  'ein',
  'sogenanntes',
  'öffentliches',
  'Ehrenamt',
  'handelt',
  '.',
  '\n'],
 ['Das', 'gehört', 'abgeschafft', '.....'],
 ['Wache', '(', 'Gabi', ')', 'im', 'Bahnhof', '.', '\n'],
 ['Diese',
  'wurde',
  'vor',
  'ca.5',
  'Jahren',
  'für',
  'sehr',
  'viel',
  'Geld',
  'umgebaut',
  'und',
  'galt',
  'als',
  'Vorzeigemodell',
  'in',
  'NRW',
  '\n'],
 ['Jetzt',
  'soll',
  'eine',
  'neue',
  'Wache',
  'geb