### Default Imports

In [15]:
import json
import pickle
from os import listdir, makedirs
from os.path import isfile, join
from typing import Optional, Tuple

import hunspell
import pandas as pd
import spacy
from tabulate import tabulate


# pretty print of dataframes on the console (slightly better in PyCharm notebooks than default print)
#def tprint(df: pd.DataFrame, head=0):
#    if head > 0:
#        df = df.head(head)
#    elif head < 0:
#        df = df.tail(-head)
#    print(tabulate(df, headers="keys", tablefmt="pipe") + '\n')

In [207]:
### --- default constants definitions ---

DATA_BASE = "../../master_cloud/corpora"
ETL_BASE = "preprocessed"
ETL_PATH = join(DATA_BASE, ETL_BASE)
NLP_BASE = "preprocessed/nlp"
NLP_PATH = join(DATA_BASE, NLP_BASE)

# standard meta data fields
DATASET = 'dataset'
SUBSET = 'subset'
ID = 'doc_id'
ID2 = 'doc_subid'
TITLE = 'title'
TAGS = 'tags'
TIME = 'date_time'
#AUTHOR
#SUBTITLE
#CATEGORY
META = [DATASET, SUBSET, ID, ID2, TITLE, TAGS, TIME]
TEXT = 'text'
HASH = 'hash'

### --- additional constants

# tags
PUNCT = 'PUNCT'
DET = 'DET'
PHRASE = 'PHRASE'

# keys
IWNLP = 'IWNLP'
POS = 'POS'
INDEX = 'index'
START = 'start'
NOUN = 'NOUN'
PROPN = 'PROPN'
LEMMA = 'lemma'
TAG = 'tag'
STOP = 'stop'
ENT_TYPE = 'ent_type'
ENT_IOB = 'ent_iob'
KNOWN = 'known'


In [11]:
### --- load spacy and iwnlp

# `nlp` is the global pipeline used by process_docs(); process_phrases() reads
# doc.noun_chunks, which presumably requires the dependency parser -- TODO confirm
# NOTE(review): 'de' is a model shortcut name; confirm the installed spaCy version still resolves it
nlp = spacy.load('de')  # <-- load with dependency parser (slower)
# nlp = spacy.load('de', disable=['parser'])

# `lemmatizer` is the global IWNLP instance used by lemmatize();
# the JSON dump is expected at the given relative path
from iwnlp.iwnlp_wrapper import IWNLPWrapper
lemmatizer = IWNLPWrapper(lemmatizer_path='../data/IWNLP.Lemmatizer_20170501.json')

In [194]:
### --- function definitions ---

def process_phrases(doc_):
    """
    Extract the noun phrases contained in a doc, based on spacy's noun chunk detection.

    Every noun chunk is cleaned first: punctuation tokens are dropped and leading
    determiners / stop words are stripped. Only chunks with more than one remaining
    token are kept. The tokens of a kept chunk are lemmatized one by one via
    lemmatize(); a token without a lemma keeps its original text.

    :param doc_: spacy.doc (only ``doc_.noun_chunks`` is read)
    :return: tuple of type (pandas.DataFrame, pandas.DataFrame)
           value[0]: one row per phrase for the document dataframe with the columns
                     TEXT (original phrase), IWNLP (lemmatized phrase), POS (always PHRASE),
                     INDEX and START (``.i`` and ``.idx`` of the first kept token).
           value[1]: one row per phrase for a global phrase lookup table with the
                     columns 'lemmatized' and 'original' (currently not used).
           Both frames are empty and without columns if the doc contains no phrase.
    """
    # rows are collected as plain dicts: an empty ``pd.Series()`` without dtype is
    # deprecated and growing a Series key by key is needlessly slow
    doc_rows = []
    lookup_rows = []

    for chunk in doc_.noun_chunks:
        # clean the noun chunk from spacy first
        tokens = []
        for token in chunk:
            # exclude punctuation
            if token.pos_ == PUNCT:
                continue
            # exclude leading determiners and stop words (only while nothing was kept yet)
            if not tokens and (token.pos_ == DET or token.is_stop):
                continue
            tokens.append(token)

        # a single remaining token is not a phrase
        if len(tokens) < 2:
            continue

        # lemmatize token by token, fall back to the original text
        lemmas = []
        for token in tokens:
            lemma, _ = lemmatize(token.text, token.pos_)
            lemmas.append(lemma or token.text)
        phrase = ' '.join(lemmas)
        text = ' '.join(token.text for token in tokens)

        # add to phrase collection of corpus
        lookup_rows.append({'lemmatized': phrase, 'original': text})

        # add to document dataframe
        doc_rows.append({
            TEXT: text,
            IWNLP: phrase,
            POS: PHRASE,
            INDEX: tokens[0].i,
            START: tokens[0].idx,
        })

    # return the dataframes for the doc dataframe and for the global phrase lookup table
    return pd.DataFrame(doc_rows), pd.DataFrame(lookup_rows)


def lemmatize(token: str, pos: str) -> Tuple[Optional[str], bool]:
    """
    This function uses the IWNLP lemmatizer with a few enhancements for compound nouns and nouns
    with uncommon capitalization. Can also be used to lemmatize tokens with different POS-tags.
    Do not use this function to lemmatize phrases.
    :param token: white space stripped single token (str)
    :param pos:   string constant, one of Universal tagset.
    :return: tuple of type (str or None, bool)
           value[0]: The lemma of the token if a lemma can be derived, else None.
           value[1]: True if the default IWNLP lookup of the token (as is) succeeded, else False.
    """

    if pos == PHRASE:
        # best effort: only warn, the lookup below is still attempted
        print("Don't lemmatize Phrases with this function!")

    lemm = lemmatizer.lemmatize(token, pos)
    # default lemmatization ok?
    if lemm:
        return lemm[0], True

    # some rules to derive a lemma from the original token (nouns only)
    # TODO: define rules for hyphenated nouns
    if pos in (NOUN, PROPN):
        # first try default noun capitalization
        lemm = lemmatizer.lemmatize(token.title(), pos)
        if lemm:
            return lemm[0], False

        # still no results: try noun suffixes (compound nouns), longest suffix first;
        # the shortest suffix tried has two characters
        for i in range(1, len(token) - 1):
            suffix_lemm = lemmatizer.lemmatize_plain(token[i:].title(), ignore_case=True)
            if suffix_lemm:
                # keep the unmatched head of the token and attach the lemma of the suffix
                return token[:i].title() + suffix_lemm[0].lower(), False

    # sorry, no results found:
    return None, False


def essence_from_doc(doc_, key):
    """
    Creates a pandas DataFrame from a given spacy.doc that contains only nouns,
    proper nouns and noun phrases.
    :param doc_: spacy.doc
    :param key:  value written to the HASH column of every returned row
    :return: tuple of type (pandas.DataFrame, pandas.DataFrame)
           value[0]: the reduced token dataframe with the columns HASH, INDEX, TEXT, POS.
                     TEXT holds the IWNLP lemmatization where one exists, else the original text.
           value[1]: the phrase lookup dataframe as returned by process_phrases().
    """
    columns = [
        TEXT, LEMMA, POS, TAG, STOP, INDEX, START, ENT_TYPE, ENT_IOB,
        # "Dep", "Shape", "alpha", "Ent_id"  # currently not used :(
    ]
    tags = [
        (
            token.text, token.lemma_, token.pos_, token.tag_, token.is_stop,
            token.i, token.idx,
            token.ent_type_, token.ent_iob_,  # token.ent_id_,
        )
        for token in doc_
    ]
    # naming the columns here keeps the schema intact even for a doc without tokens
    df_ = pd.DataFrame(tags, columns=columns)

    # add IWNLP lemmatization
    # (two plain lists instead of zip(*...) unpacking, which fails on an empty doc)
    lemmas = [lemmatize(text, pos) for text, pos in zip(df_[TEXT], df_[POS])]
    df_[IWNLP] = [lemma for lemma, _ in lemmas]
    df_[KNOWN] = [known for _, known in lemmas]

    # add phrases (DataFrame.append was removed from pandas, concat is the replacement)
    df_phrases, phrase_lookup = process_phrases(doc_)
    df_ = pd.concat([df_, df_phrases], sort=False).sort_values(START)
    df_ = df_[df_[POS].isin([NOUN, PROPN, PHRASE])].reset_index(drop=True)

    # replace Text with lemmatization, if lemmatization exists
    mask = df_[IWNLP].notnull()
    df_.loc[mask, TEXT] = df_.loc[mask, IWNLP]

    # add hash-key
    df_[HASH] = key

    return df_[[HASH, INDEX, TEXT, POS]], phrase_lookup


def process_docs(series, size=None):
    """
    Main function for sending the dataframes from the ETL pipeline to the NLP pipeline.
    :param series: pandas.Series of raw texts; the index value of a text becomes its key
    :param size:   number of leading texts to process, None processes all of them
    :return: generator yielding one tuple (token dataframe, phrase lookup dataframe)
             per text, as returned by essence_from_doc()
    """
    # .iloc makes the positional slicing explicit;
    # .items() replaces Series.iteritems(), which was removed from pandas
    for key, text in series.iloc[:size].items():
        # build spacy doc
        doc = nlp(text)
        yield essence_from_doc(doc, key=key)
        
        
def store(corpus, df):
    """
    Pickles a dataframe into the NLP_PATH directory, which is created if it is missing.
    :param corpus: file name without the '.pickle' extension
    :param df:     the dataframe to save
    :return: the file path where the dataframe was stored
    """
    makedirs(NLP_PATH, exist_ok=True)
    target = join(NLP_PATH, corpus + '.pickle')
    print('saving to', target)
    df.to_pickle(target)
    return target


def read(f):
    """
    Loads a pickled dataframe.
    Unpickling can execute code, so only pass files from a trusted source.
    :param f: path of the pickle file
    :return: the unpickled object
    """
    frame = pd.read_pickle(f)
    return frame


In [212]:
# --- corpus selection and loading of the preprocessed dataframe

LOCAL_PATH = ETL_BASE
FULL_PATH = join(DATA_BASE, LOCAL_PATH)

#CORPUS = "OnlineParticipation"
#CORPUS = "PoliticalSpeeches"
CORPUS = "FAZ"

# all regular files in FULL_PATH whose first three characters equal
# the first three characters of the corpus name, in sorted order
files = sorted(
    entry
    for entry in listdir(FULL_PATH)
    if isfile(join(FULL_PATH, entry)) and entry[:3] == CORPUS[:3]
)

# every file is read, but `df` only keeps the last one
# -> change here for multi-file corpora
for name in files:
    fname = join(FULL_PATH, name)
    df = read(fname)
    print(fname)
    

../../master_cloud/corpora/preprocessed/FAZ.pickle


In [213]:
# run the NLP pipeline over all texts: process_docs() yields one
# (token frame, phrase lookup frame) pair per text; the pairs are transposed
# and each group is concatenated into a single, freshly indexed dataframe
docs, phrase_lookups = (
    pd.concat(frames).reset_index(drop=True)
    for frames in zip(*process_docs(df[TEXT], size=None))
)
print("done")

In [214]:
# persist both result frames below NLP_PATH; store() prints and returns the written path
fname = store(CORPUS+'_nlp', docs)
# fname is overwritten here and afterwards holds the path of the phrase lookup pickle
fname = store(CORPUS+'_phrase_lookups', phrase_lookups)
# additionally save the loaded spaCy pipeline next to the pickles
nlp.to_disk(join(NLP_PATH, 'spacy_model'))
#nlp.vocab.to_disk(join(NLP_PATH, 'vocab'))

saving to ../../master_cloud/corpora/preprocessed/nlp/FAZ_nlp.pickle
saving to ../../master_cloud/corpora/preprocessed/nlp/FAZ_phrase_lookups.pickle


### Named Entity Recognition
Not sure whether I want to use these.

In [None]:
# collect the named entities of a spaCy doc as (text, start char, end char, label)
# NOTE(review): `doc` must have been created by an earlier cell -- it is not
# defined at module level before this point in the visible notebook
ents = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

# every tuple has exactly four fields, so four column names are sufficient
df_ent = pd.DataFrame(ents, columns=["Text", "Start", "End", "Label"])

# the tprint() helper only exists as commented-out code, so the first
# 10 rows are printed with tabulate directly, in the same format
print(tabulate(df_ent.head(10), headers="keys", tablefmt="pipe") + '\n')

In [None]:
# keyword based entity lookup with the spacy_lookup extension
from spacy_lookup import Entity

#nlp = spacy.load('en')
# NOTE(review): the component is appended to the global `nlp` pipeline, so every
# later nlp(...) call runs it as well; the keywords and the sample sentence are
# English while `nlp` was loaded with the 'de' model -- confirm this is intended
entity = Entity(nlp, keywords_list=['python', 'java platform'])
# NOTE(review): passing a component object to add_pipe looks like the spaCy v2 API -- confirm version
nlp.add_pipe(entity, last=True)

doc = nlp(u"I am a product manager for a java and python.")
assert doc._.has_entities == True
# NOTE(review): doc[2:5] and doc[3] presumably cover "a product manager" / "product";
# neither is in keywords_list, so the next assert and the last one may fail -- verify
assert doc[2:5]._.has_entities == True
assert doc[0]._.is_entity == False
assert doc[3]._.is_entity == True
print(doc._.entities)

In [None]:
# spellchecking
# may be used to correct case errors -> very limited usage

spellchecker = hunspell.HunSpell('/usr/share/hunspell/de_DE.dic',
                                 '/usr/share/hunspell/de_DE.aff')
enc = spellchecker.get_dic_encoding()  # 'ISO8859-1' might be an issue

# NOTE(review): `df_doc` is not defined in the visible notebook, and the column is
# addressed as 'Text' while the TEXT constant is 'text' -- confirm the frame's schema
df_noun = df_doc[df_doc.POS == 'NOUN'].copy()
# spell check result and suggestions for every noun
df_noun['Spell'] = df_noun['Text'].map(spellchecker.spell)
df_noun['Suggest'] = df_noun['Text'].map(spellchecker.suggest)

# the tprint() helper only exists as commented-out code, so the frame
# is printed with tabulate directly, in the same format
print(tabulate(df_noun[['Text', 'Spell', 'Suggest']], headers="keys", tablefmt="pipe") + '\n')

## Alternative: spell checking as a spaCy pipeline component
from spacy_hunspell import spaCyHunSpell

# NOTE(review): this rebinds the global `nlp` (loaded with the 'de' model further up)
# to an English model; process_docs() reads the global `nlp`, so running it after
# this cell would use the English pipeline -- confirm this is intended
nlp = spacy.load('en_core_web_sm')
# NOTE(review): this rebinds the name `hunspell`, which the file imports as a module,
# so hunspell.HunSpell(...) can no longer be called afterwards;
# 'mac' presumably selects platform specific dictionary paths -- TODO confirm
hunspell = spaCyHunSpell(nlp, 'mac')
nlp.add_pipe(hunspell)
doc = nlp('I can haz cheezeburger.')
# token at position 2 of the sample sentence
haz = doc[2]
# bare expressions: they only show a value as notebook cell output
haz._.hunspell_spell  # False
haz._.hunspell_suggest