### Default Imports

In [1]:
import spacy
import pandas as pd
import sys
import re
from os import listdir, makedirs
from os.path import isfile, join, exists
from iwnlp.iwnlp_wrapper import IWNLPWrapper

In [2]:
### --- default constants definitions ---

# Base names: DATA_BASE is the corpus root, the *_BASE values are
# sub-directories given relative to it.
DATA_BASE = "../../master_cloud/corpora"
ETL_BASE = "preprocessed"
NLP_BASE = "preprocessed/nlp"

# Derived paths, each built by joining onto the previous level.
ETL_PATH = join(DATA_BASE, ETL_BASE)
NLP_PATH = join(DATA_BASE, NLP_BASE)
SPACY_PATH = join(NLP_PATH, "spacy_model")
VOCAB_PATH = join(SPACY_PATH, "vocab")

# Standard meta data field names.
DATASET = "dataset"
SUBSET = "subset"
ID = "doc_id"
ID2 = "doc_subid"
TITLE = "title"
TAGS = "tags"
TIME = "date_time"
# Not (yet) defined as constants: AUTHOR, SUBTITLE, CATEGORY
META = [DATASET, SUBSET, ID, ID2, TITLE, TAGS, TIME]

# Field names kept outside of META.
TEXT = "text"
HASH = "hash"

### --- additional constants

# tags
PUNCT = "PUNCT"
DET = "DET"
PHRASE = "PHRASE"

# keys
IWNLP = "IWNLP"
POS = "POS"
INDEX = "index"
START = "start"
NOUN = "NOUN"
PROPN = "PROPN"
LEMMA = "lemma"
TAG = "tag"
STOP = "stop"
ENT_TYPE = "ent_type"
ENT_IOB = "ent_iob"
KNOWN = "known"

In [3]:
### --- load spacy and iwnlp ---

# A first command-line argument of '--hpc' switches to an explicit model
# directory; in every other case the model is addressed by the name 'de'.
if sys.argv[1:2] == ['--hpc']:
    print('on hpc')
    de = '/home/funkea/.local/lib/python3.4/site-packages/de_core_news_sm/de_core_news_sm-2.0.0'
else:
    de = 'de'

print("loading spacy")
# Loaded with the dependency parser, which is slower. The sentence iteration
# (`.sents`) used further down presumably relies on it -- TODO confirm before
# switching to spacy.load(de, disable=['parser']).
nlp = spacy.load(de)

# Restore a previously stored vocabulary, but only when one is present on disk.
vocab_on_disk = exists(VOCAB_PATH)
if vocab_on_disk:
    print("reading vocab from", VOCAB_PATH)
    nlp.vocab.from_disk(VOCAB_PATH)

print("loading IWNLPWrapper")
# NOTE: the lemmatizer file is addressed relative to the working directory.
lemmatizer = IWNLPWrapper(
    lemmatizer_path='../data/IWNLP.Lemmatizer_20170501.json'
)


loading spacy
reading vocab from ../../master_cloud/corpora/preprocessed/nlp/spacy_model/vocab
loading IWNLPWrapper


In [9]:
# Stop word collection exposed by the defaults of the loaded spaCy pipeline
# (`nlp` is created in the loading cell above).
stopwords = nlp.Defaults.stop_words

In [24]:
# --- run notebook ---

# The input directory of this notebook is the ETL sub-directory below DATA_BASE.
LOCAL_PATH = ETL_BASE
FULL_PATH = join(DATA_BASE, LOCAL_PATH)

# Sorted names of all regular files directly inside FULL_PATH
# (entries that are not files, e.g. sub-directories, are skipped).
files = sorted(
    entry
    for entry in listdir(FULL_PATH)
    if isfile(join(FULL_PATH, entry))
)

def process_docs(series, size=None):
    """Generator intended as the main hand-over from the ETL dataframes to the NLP pipeline.

    NOTE(review): only a skeleton of this function is visible here.

    :param series: collection of documents; it must support ``len()``.
    :param size: accepted but not used anywhere in the visible body.
    :yields: a single ``(essential_token, phrase_lookup)`` tuple. Neither name
        is defined inside this function or in the cells shown so far, so unless
        they exist as globals the first iteration raises ``NameError``
        -- TODO confirm where they are supposed to come from.
    """
    length = len(series)
    # The following locals are assigned but never read in the visible body;
    # presumably left over from (or meant for) progress reporting -- verify.
    steps = 100
    step_len = 100//steps
    percent = length//steps
    done = 0
    # Because of this `yield`, none of the statements above run before the
    # generator is iterated for the first time.
    yield essential_token, phrase_lookup

for name in files:
    # Only files whose name begins with 'Onl' are loaded. If several match,
    # `df` is overwritten on each pass, so the last one in sorted order wins.
    if name.startswith('Onl'):
        # Corpus name = file name up to the first '.' or '_'.
        corpus = re.split(r'\.|_', name)[0]
        fname = join(FULL_PATH, name)
        # NOTE(review): `read` is not defined in the cells visible here --
        # confirm it exists before running on a fresh kernel.
        df = read(fname)

# Nested structure for the first ten texts: document -> sentence -> token.
# The leaves are spaCy token objects; use `token.text` instead if plain
# strings are wanted.
# NOTE(review): `df` only exists if at least one file matched above.
docs = [
    [list(sent) for sent in nlp(text).sents]
    for text in df[TEXT][:10]
]
docs

reading corpus from ../../master_cloud/corpora/preprocessed/OnlineParticipation.pickle


[[[Nebentätigkeiten,
   von,
   OB,
   und,
   Kommunalpoilter,
   -,
   öffentliches,
   Ehrenamt,
   .,
   ],
  [Durch,
   das,
   sogenannte"öffentliche,
   Ehrenamt,
   ",
    ,
   verdienen,
   manche,
   Komunalpolitiker,
   und,
   OB`s,
    ,
   in,
   den,
   Verwaltungsräten,
   und,
   Aufsichtsräten,
   viel,
   Geld,
   (,
   bis,
   zu,
   5-stellig,
   ),
   ],
  [Von,
   diesem,
   Geld,
   muss,
   kein,
   Cent,
   an,
   die,
   Stadtkasse,
   abgeführt,
   werden,
   da,
   es,
   sich,
   um,
   ein,
   sogenanntes,
   öffentliches,
   Ehrenamt,
   handelt,
   .,
   ],
  [Das, gehört, abgeschafft, .....]],
 [[Wache, (, Gabi, ), im, Bahnhof, ., ],
  [Diese,
   wurde,
   vor,
   ca.5,
   Jahren,
   für,
   sehr,
   viel,
   Geld,
   umgebaut,
   und,
   galt,
   als,
   Vorzeigemodell,
   in,
   NRW,
   ],
  [Jetzt,
   soll,
   eine,
   neue,
   Wache,
   gebaut,
   werden,
   ,,
   da,
   der,
   Bahnhofsumbau,
   beginnen,
   soll,
   .,
   ],
  [Läßt, sich, das, n