In [1]:
from os.path import join
import numpy as np
import pandas as pd
import re
from spacy.symbols import IDS
from constants import *

In [2]:
# Corpus to process. Other corpus names that were used here:
#   Europarl, FAZ, FOCUS, PoliticalSpeeches
corpus = 'OnlineParticipation'

# Load the pickled DataFrame '<corpus>_nlp.pickle' from NLP_PATH.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
fpath = join(NLP_PATH, '{}_nlp.pickle'.format(corpus))
df = pd.read_pickle(fpath)

In [3]:
def concat_entities(column):
    """Collapse one column of a token group into a single value.

    Used as the aggregation callable of ``groupby(...).agg(...)``. The rule
    is selected by the name of the column:

    * HASH, SENT_IDX, ENT_IDX, ENT_TYPE -> first value of the group
    * 'tok_idx', NOUN_PHRASE, 'np'      -> all values of the group as a tuple
    * TEXT, TOKEN                       -> the strings joined with '_'
    * every other column                -> False
    """
    name = column.name
    if name in {HASH, SENT_IDX, ENT_IDX, ENT_TYPE}:
        # these are taken from the first token of the group
        aggregated = column.values[0]
    elif name in {'tok_idx', NOUN_PHRASE, 'np'}:
        # keep every member so the original tokens can be located later
        aggregated = tuple(column.values)
    elif name in {TEXT, TOKEN}:
        # glue the surface forms into one phrase token
        aggregated = column.str.cat(sep='_')
    else:
        aggregated = False
    return aggregated

def get_length(tpl):
    """Return the number of elements in ``tpl``.

    The previous body was a bare ``return`` and therefore always produced
    ``None``; the length is now actually computed.

    Parameters
    ----------
    tpl : sized container (e.g. a tuple of token indexes)

    Returns
    -------
    int
        ``len(tpl)``
    """
    return len(tpl)

def get_removable_tokens(df_in):
    """Expand per-phrase token collections into one row per token.

    ``df_in`` must consist of exactly two columns: a sentence index followed
    by an iterable of token indexes. Every (sentence index, token index)
    combination becomes one row of the result, whose columns are named
    SENT_IDX and TOK_IDX.
    """
    pairs = [
        (sentence, token)
        # itertuples yields (index, sentence index, token collection)
        for _, sentence, tokens in df_in.itertuples()
        for token in tokens
    ]
    return pd.DataFrame.from_records(pairs, columns=[SENT_IDX, TOK_IDX])

# add phrases and replace overlapping tokens
def insert_phrases(df_orig, df_insert):
    """Replace the single tokens covered by phrases with the phrase rows.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Token-level frame; must contain the SENT_IDX, TOK_IDX and HASH columns.
    df_insert : pandas.DataFrame
        Phrase rows to insert; must contain SENT_IDX, TOK_IDX and a 'tok_set'
        column holding the token indexes each phrase covers.

    Returns
    -------
    pandas.DataFrame
        ``df_orig`` without the covered tokens, plus the rows of ``df_insert``,
        sorted by SENT_IDX and TOK_IDX. The original index labels are kept,
        so the index of the result is not guaranteed to be unique.

    Notes
    -----
    Uses ``pd.concat`` instead of ``DataFrame.append``, which was deprecated
    in pandas 1.4 and removed in pandas 2.0.
    """
    # this DataFrame contains all token-idx we want to replace with phrases
    df_removable_tokens = get_removable_tokens(df_insert[[SENT_IDX, 'tok_set']])

    # remove original unigram tokens: stacking the removable (sent, tok) pairs
    # onto the original rows makes each covered token a duplicate, and
    # keep=False drops *both* copies. Helper rows without a counterpart
    # survive that step but have no HASH, so dropna removes them.
    df_unigrams = (
        pd.concat([df_orig, df_removable_tokens])
        .drop_duplicates(subset=[SENT_IDX, TOK_IDX], keep=False)
        .dropna(subset=[HASH])
    )

    # insert concatenated phrase tokens
    df_combined = (
        pd.concat([df_unigrams, df_insert])
        .sort_values([SENT_IDX, TOK_IDX])
    )
    return df_combined

In [4]:
# multi-token named entities from spaCy's NER
df_ent = (
    df
    # tokens inside an entity have an ent-index > 0; whitespace tokens are ignored
    .query('ent_idx > 0 & POS != "SPACE"')
    # we only care about entities that span more than one token
    .groupby(ENT_IDX).filter(lambda group: len(group) > 1)
    # collapse each entity into a single row
    .groupby(ENT_IDX, as_index=False).agg(concat_entities)
    .assign(
        # number of tokens per entity
        length=lambda frame: frame['tok_idx'].apply(len),
        # annotation markers for the concatenated rows
        POS='NER',
        ent_iob='P',
    )
    # annotation columns are stored as categoricals for memory savings
    .astype({
        POS: "category",
        ENT_IOB: "category",
        ENT_TYPE: "category",
    })
)

In [5]:
# multi-token noun chunks from spaCy
df_np = (
    df
    # tokens inside a noun chunk have a noun_phrase index > 0;
    # whitespace, numbers, determiners and symbols are excluded
    .query('noun_phrase > 0 & POS not in ["SPACE", "NUM", "DET", "SYM"]')
    # keep only chunks with more than one remaining token
    .groupby(NOUN_PHRASE).filter(lambda group: len(group) > 1)
    # collapse each chunk into a single row
    .groupby(NOUN_PHRASE, as_index=False).agg(concat_entities)
    .assign(
        # number of tokens per chunk
        length=lambda frame: frame['tok_idx'].apply(len),
        # annotation markers for the concatenated rows
        POS='NPHRASE',
        ent_iob='P',
    )
    # annotation columns are stored as categoricals for memory savings
    .astype({
        POS: "category",
        ENT_IOB: "category",
        ENT_TYPE: "category",
    })
)

In [6]:
# intersecting both extraction methods returns very nice results
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
df_phrases = pd.concat([df_ent, df_np])
# duplicated() flags the later occurrence of each (hash, sentence, token tuple)
# key, so the rows kept are those also present earlier in the stacked frame;
# since df_ent is stacked first, the surviving rows come from df_np
mask = df_phrases.duplicated([HASH, SENT_IDX, TOK_IDX])
# explicit copy: the columns assigned below must not write into a slice view
df_phrases = df_phrases[mask].copy()
# keep the full tuple of token indexes in a new column 'tok_set' and
# set the token-index column to the start of the phrase
df_phrases['tok_set'] = df_phrases[TOK_IDX]
df_phrases[TOK_IDX] = df_phrases[TOK_IDX].apply(lambda x: x[0])

Unnamed: 0,POS,ent_idx,ent_iob,ent_type,hash,length,noun_phrase,sent_idx,text,tok_idx,token,tok_set
4,NPHRASE,7,P,MISC,-1741504268962868006,2,21,4,ca.5_Jahren,10,ca.5_Jahr,"(10, 11)"
19,NPHRASE,26,P,LOC,1112788597393434588,2,74,18,Stadt_Bonn,8,Stadt_Bonn,"(8, 9)"
36,NPHRASE,43,P,PER,-2263838526400175399,2,160,34,Bsp_Altweiber,55,BSP_Altweib,"(55, 56)"
41,NPHRASE,52,P,MISC,-2263838526400175399,2,184,39,SGB_II-Bezieher,141,SGB_Ii-Bezieher,"(141, 142)"
50,NPHRASE,62,P,ORG,3346603037676320710,2,222,46,Sparkasse_Köln,0,Sparkasse_Köln,"(0, 1)"
63,NPHRASE,72,P,LOC,6030682432064371412,2,263,59,Stadt_Bonn,12,Stadt_Bonn,"(12, 13)"
67,NPHRASE,74,P,PER,6030682432064371412,2,270,61,Martin_Nötzel,35,Martin_Nötzel,"(35, 36)"
71,NPHRASE,76,P,LOC,6030682432064371412,2,277,63,Stadt_Bonn,85,Stadt_Bonn,"(85, 86)"
78,NPHRASE,88,P,LOC,-4698369755082196440,2,327,72,Stadt_Bonn,10,Stadt_Bonn,"(10, 11)"
90,NPHRASE,92,P,LOC,-805735878397083349,2,351,76,Stadt_Bonn,8,Stadt_Bonn,"(8, 9)"


In [7]:
# based on Philipp Grawes approach on extracting and normalizing street names
# Suffixes that mark a token as (part of) a street name. The dot of the
# abbreviation "str." is escaped: unescaped it matched any character, so
# words such as "Astra" were treated as street names.
# NOTE(review): 'platz' is the only entry without a '$' anchor and therefore
# also matches inside words (e.g. "Platzhalter") -- confirm this is intended.
STREET_NAME_LIST = [r'strasse$', r'straße$', r'str$', r'str\.$', r'platz', r'gasse$', r'allee$', r'ufer$', r'weg$']
# case-insensitive alternation of all suffixes
STREET_NAMES = re.compile(r'(' + '|'.join(STREET_NAME_LIST) + ')', re.IGNORECASE)
# spelling variants of "straße" ("str", "str.", "strasse", "straße") at a word boundary
STREET_PATTERN = re.compile(r"str(\.|a(ss|ß)e)?\b", re.IGNORECASE)
# runs of characters that are neither word characters nor '&' or '/'
SPECIAL_CHAR = re.compile(r'[^\w&\/]+')

def aggregate_streets(column):
    """Aggregate one column of an entity group, normalizing street names.

    Used as the aggregation callable of ``groupby(...).agg(...)``. The rule
    is selected by the name of the column:

    * HASH, SENT_IDX, ENT_IDX      -> first value of the group
    * 'tok_idx', NOUN_PHRASE, 'np' -> all values of the group as a tuple
    * TEXT                         -> the strings joined with '_'
    * TOKEN                        -> the normalized street name, or False
      if no token matches STREET_NAMES or the group is a single token that
      consists of nothing but a street suffix
    * every other column           -> False

    Normalization joins the tokens with spaces, rewrites the spelling
    variants matched by STREET_PATTERN to 'straße', replaces every run of
    special characters with '_', strips surrounding underscores and
    title-cases the result.

    ``Series.iteritems()`` (removed in pandas 2.0) is no longer used; the
    column is iterated directly.
    """
    if column.name in {HASH, SENT_IDX, ENT_IDX}:
        return column.values[0]
    if column.name in {'tok_idx', NOUN_PHRASE, 'np'}:
        return tuple(column.values)
    if column.name == TEXT:
        return column.str.cat(sep='_')
    if column.name == TOKEN:
        # a group is a candidate as soon as one of its tokens looks like a street
        is_street_candidate = any(STREET_NAMES.search(token) for token in column)
        if not is_street_candidate:
            return False
        # a lone suffix such as "Straße" or "Weg" is not a street name
        if len(column) == 1 and STREET_NAMES.fullmatch(column.values[0]):
            return False
        street_name = column.str.cat(sep=' ')
        street_name = STREET_PATTERN.sub('straße', street_name)
        street_name = SPECIAL_CHAR.sub('_', street_name)
        return street_name.strip('_').title()
    return False

# street names found among the entities
df_loc = (
    df
    # entity tokens only, without whitespace tokens
    # NOTE(review): SPACE is a bare name here while other cells compare
    # against the literal 'SPACE' -- confirm where SPACE is defined
    .loc[(df[ENT_IDX] > 0) & (df.POS != SPACE)]
    # collapse each entity into one row; the token becomes False for non-streets
    .groupby(ENT_IDX, as_index=False).agg(aggregate_streets)
    # drop everything that was not recognized as a street
    .query('token != False')
    .assign(
        # number of tokens per street name
        length=lambda frame: frame['tok_idx'].apply(len),
        # keep the full tuple of token indexes before tok_idx is overwritten
        tok_set=lambda frame: frame['tok_idx'],
        # the street row takes the position of its first token
        tok_idx=lambda frame: frame['tok_idx'].apply(lambda indexes: indexes[0]),
        # annotation markers for the street rows
        POS='PROPN',
        ent_iob='L',
        ent_type='STREET',
    )
    # annotation columns are stored as categoricals
    .astype({
        POS: "category",
        ENT_IOB: "category",
        ENT_TYPE: "category",
    })
)

In [9]:
# glue phrases and streets into the token stream, then simplify and store
df_glued = (
    df
    # insert phrases to original tokens
    .pipe(insert_phrases, df_phrases)
    # insert locations / streets
    .pipe(insert_phrases, df_loc)
    # drop whitespace tokens and keep only the columns needed downstream
    .loc[lambda frame: frame.POS != 'SPACE', [HASH, POS, SENT_IDX, TOK_IDX, TOKEN]]
    # compact dtypes
    .astype({
        HASH: np.int64,
        POS: "category",
        SENT_IDX: np.int32,
        TOK_IDX: np.int32,
    })
)
# write the result to the 'simple' folder below ETL_PATH
write_path = join(ETL_PATH + '/simple', corpus + '_simple.pickle')
df_glued.to_pickle(write_path)

In [11]:
# Read the pickle at write_path back from disk; as the last expression of the
# cell, the loaded frame is displayed.
pd.read_pickle(write_path)

Unnamed: 0,hash,POS,sent_idx,tok_idx,token
0,8109220208026988544,NOUN,0,0,Nebentätigkeit
1,8109220208026988544,ADP,0,1,von
2,8109220208026988544,NOUN,0,2,OB
3,8109220208026988544,CONJ,0,3,und
4,8109220208026988544,NOUN,0,4,Kommunalpoilter
5,8109220208026988544,PUNCT,0,5,-
6,8109220208026988544,ADJ,0,6,öffentlich
7,8109220208026988544,NOUN,0,7,Ehrenamt
9,8109220208026988544,ADP,1,9,durch
10,8109220208026988544,DET,1,10,der
