### Default Imports

In [None]:
import spacy
import json
import pandas as pd
import hunspell
from tabulate import tabulate
from os import listdir, makedirs
from os.path import isfile, join
import pickle


In [None]:
### --- default constants definitions ---

# base directory of the corpora and the sub-directories that are joined onto it
DATA_BASE = "../../master_cloud/corpora"
ETL_BASE = "preprocessed"
NLP_BASE = "preprocessed/nlp"
ETL_PATH = join(DATA_BASE, ETL_BASE)
NLP_PATH = join(DATA_BASE, NLP_BASE)

# standard meta data fields
DATASET = "dataset"
SUBSET = "subset"
ID = "doc_id"
ID2 = "doc_subid"
TITLE = "title"
TAGS = "tags"
TIME = "date_time"
# not defined (yet): AUTHOR, SUBTITLE, CATEGORY
META = [
    DATASET,
    SUBSET,
    ID,
    ID2,
    TITLE,
    TAGS,
    TIME,
]
TEXT = "text"
HASH = "hash"

### --- additional constants

# tags
PUNCT = "PUNCT"
DET = "DET"
PHRASE = "PHRASE"

# keys
IWNLP = "IWNLP"
POS = "POS"
INDEX = "index"
START = "start"
NOUN = "NOUN"
PROPN = "PROPN"
LEMMA = "lemma"
TAG = "tag"
STOP = "stop"
ENT_TYPE = "ent_type"
ENT_IOB = "ent_iob"
KNOWN = "known"


In [None]:
### --- load spacy and iwnlp

from iwnlp.iwnlp_wrapper import IWNLPWrapper

# Full 'de' pipeline including the dependency parser (slower).
# Alternative without the parser: spacy.load('de', disable=['parser'])
nlp = spacy.load('de')

# IWNLP lemmatizer, initialised from the JSON dump given by its relative path
lemmatizer = IWNLPWrapper(
    lemmatizer_path='../data/IWNLP.Lemmatizer_20170501.json'
)

In [None]:
# --- definitions and reading for certain corpus

LOCAL_PATH = ETL_BASE
FULL_PATH = join(DATA_BASE, LOCAL_PATH)

# other corpora that can be selected here: "OnlineParticipation", "PoliticalSpeeches"
CORPUS = "FAZ"

# a file belongs to the corpus when the first three characters of its name
# equal the first three characters of the corpus name
corpus_prefix = CORPUS[:3]
files = sorted(
    f for f in listdir(FULL_PATH)
    if isfile(join(FULL_PATH, f)) and f[:3] == corpus_prefix
)

# Fail early: with no matching file the loop below would not run at all and
# `df` would be undefined -- or, in a notebook session, silently still hold
# the data frame of a previously loaded corpus.
if not files:
    raise FileNotFoundError(
        "no files for corpus '{}' found in '{}'".format(CORPUS, FULL_PATH)
    )

for name in files:
    fname = join(FULL_PATH, name)
    # change here for multi-file corpora
    # (`df` is overwritten on every iteration, so only the last file is kept)
    df = read(fname)
    print(fname)
    

In [None]:
# process_docs() is consumed completely and its items are transposed with
# zip(*...): the first elements of all items form one group, the second
# elements the other. Each group is concatenated into a single frame with a
# fresh 0..n-1 index.
docs, phrase_lookups = (
    pd.concat(frames).reset_index(drop=True)
    for frames in zip(*process_docs(df[TEXT], size=None))
)
print("done")

In [None]:
# persist both result frames; `fname` keeps the return value of the last store() call
for suffix, frame in (('_nlp', docs), ('_phrase_lookups', phrase_lookups)):
    fname = store(CORPUS + suffix, frame)

# write the spaCy pipeline next to the NLP results
# (storing only the vocab would be: nlp.vocab.to_disk(join(NLP_PATH, 'vocab')))
model_path = join(NLP_PATH, 'spacy_model')
nlp.to_disk(model_path)