In [1]:
from constants import DATA_BASE, ETL_PATH, DATASETS, NOUN_PATTERN, DSETS, POS_N
from utils import load, tprint, multiload
import pandas as pd
from os import listdir
from os.path import isfile, join
import json
import re
import gc
from etl_wikipedia import split_title
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()
pd.options.display.max_rows = 2000

-----

### Create minimized lemmatization maps for all datasets

The resulting maps are reduced to a 1:1 mapping (exactly one text per token) and are ready to use.

In [2]:
def get_best_text(grp):
    """Pick the single best ``text`` for the token represented by one group.

    Meant to be applied per group of a ``groupby('token')``: ``grp.name`` is the
    token, ``grp.text`` holds the candidate texts and ``grp.counts`` how often
    each candidate was seen.

    Selection rules, in order:
      1. A single candidate is returned as is.
      2. A fully uppercase token that is itself a candidate is returned
         (assumed to be an abbreviation or named entity such as 'ABBA').
      3. Otherwise only the candidates with the highest count are kept; among
         those the token itself is preferred, else the first one is returned.

    :param grp: group object with a ``name`` attribute and ``text`` / ``counts`` columns.
    :return: the chosen text.
    """
    token: str = grp.name
    texts = grp.text.values
    counts = grp.counts.values
    # If there is only one option, return it (avoid unnecessary checks).
    if len(texts) == 1:
        return texts[0]
    # If the token is fully uppercase and contained in texts we assume it is a common abbreviation
    # or named entity like 'ABBA' which is supposed to be written uppercase: return the token.
    if token.isupper() and token in texts:
        return token
    # Else: remove all options less than max-count. The maximum is computed explicitly
    # so the result does not depend on the rows arriving sorted by descending count.
    texts = texts[counts == counts.max()]
    # Another shortcut to avoid unnecessary checks.
    if len(texts) == 1:
        return texts[0]
    # If the token is contained in the remaining options, return it.
    if token in texts:
        return token
    # Else: all remaining options are equally likely, return the first.
    return texts[0]

# The following regular expressions mainly serve to reduce the vocabulary size of the dewac corpus.
# They are combined below into one anchored, case-insensitive pattern (`pat`); tokens matching it
# are removed before the lemmatization maps are built.
# tokens made up solely of digits and the listed punctuation characters
digits = r'[0-9.,/=:;&#\!\?\*"\'\-\(\)\[\]]+'
# tokens containing a URL fragment ('http:', 'www.', or one of the listed file / domain suffixes)
web =    r'.*?(http:|www\.|\.html|\.htm|\.php|\.de|\.net|\.com|\.at|\.org|\.info).*'
# tokens starting with one or more of the listed punctuation / symbol characters
start =  r'[/&\-ï",\'\$\(\)\*\.]+.*'
# a single arbitrary character, optionally followed only by '.', '(', ')' or '¬'
end =    r'.[\.\(\)¬]*'
badasc = r'.*?[].*'
# Build one anchored alternation from all fragments: the pattern matches a token
# when any single fragment matches it as a whole (case-insensitively).
pat = re.compile(
    '^({})$'.format('|'.join((digits, web, start, end, badasc))),
    flags=re.IGNORECASE,
)

def generate_and_save_map(df, dataset, save=True):
    """Reduce per-token text counts to a 1:1 token -> text map and optionally pickle it.

    :param df: Series of counts indexed by (token, text), e.g. the result of
        ``groupby('token').value_counts()`` on a Series named 'text'.
    :param dataset: dataset key; it is translated via ``DSETS`` (falling back to
        the key itself) to build the output file name.
    :param save: if True the map is pickled below ``ETL_PATH`` and nothing is
        returned; if False the map is returned and nothing is written.
    :return: the map (one text per token) if ``save`` is False, otherwise None.
    """
    # Name the counts explicitly instead of renaming a column called 'text':
    # the name pandas gives a value_counts() result differs between versions,
    # and get_best_text relies on the column being called 'counts'.
    df = df.rename('counts').reset_index()
    df = df.groupby('token').progress_apply(get_best_text)
    if save:
        file = f'{DSETS.get(dataset, dataset)}_lemmatization_map.pickle'
        print(f'Writing {file}')
        df.to_pickle(join(ETL_PATH, file))
    else:
        return df

In [None]:
# Build the filtered (token, text) counts for one dataset only, e.g. to inspect them.
dataset = 'dewa1'
df = (
    load(dataset, 'nlp')
    .loc[lambda frame: frame.POS.isin(POS_N)]        # keep rows whose POS is listed in POS_N
    .loc[lambda frame: ~frame.token.str.match(pat)]  # drop tokens matching the filter pattern
    .set_index('token')
    .text
    .groupby('token')
    .value_counts()
    .loc[lambda counts: counts > 1]                  # ignore (token, text) pairs seen only once
)

In [3]:
df
#generate_and_save_map(df, dataset)

Reading ../data/preprocessed/nlp/dewac_01_nlp.pickle


token                                    text                                    
0,08mg/l                                 0,08mg/l                                       2
0,2µT                                    0,2µT                                          5
0,5-L-Dose                               0,5-l-Dosen                                    2
0,5-Promille-Grenze                      0,5-Promille-Grenze                            2
0,5l                                     0,5l                                           6
0-Ton                                    0-Ton                                          4
000-Euro-Darlehen                        000-Euro-Darlehen                              2
0049/6731/94700&ndash;0                  0049/6731/94700&ndash;0                        2
0049/6731/94700&ndash;33                 0049/6731/94700&ndash;33                       3
00GEN                                    00GEN                                          6
01.August         

In [None]:
# Generate map for single file datasets (+ news and speeches)
datasets = ['dewa1', 'E', 'FA', 'FO', 'O', 'P', 'N', 'S']
for dataset in datasets:
    df = (
        load(dataset, 'nlp')
        .loc[lambda frame: frame.POS.isin(POS_N)]        # keep rows whose POS is listed in POS_N
        .loc[lambda frame: ~frame.token.str.match(pat)]  # drop tokens matching the filter pattern
        .set_index('token')
        .text
        .groupby('token')
        .value_counts()
        .loc[lambda counts: counts > 1]                  # ignore pairs seen only once
    )
    generate_and_save_map(df, dataset)

In [None]:
# Generate map for multi file datasets
datasets = ['dewac', 'dewiki']
df = None  # release whatever frame `df` referenced before
for dataset in datasets[:1]:  # only the first entry is processed
    series = []
    for part in multiload(dataset, 'nlp'):
        gc.collect()
        # Rebind `part` to the filtered text Series so the full frame can be freed.
        part = (
            part
            .loc[lambda frame: frame.POS.isin(POS_N)]
            .loc[lambda frame: ~frame.token.str.match(pat)]
            .set_index('token')
            .text
        )
        series.append(part)
        gc.collect()
    # Count (token, text) pairs over all parts and ignore pairs seen only once.
    df = (
        pd.concat(series)
        .groupby('token')
        .value_counts()
        .loc[lambda counts: counts > 1]
    )
    generate_and_save_map(df, dataset)

In [None]:
# Generate map for Wikipedia title phrases
df = (
    load('phrases')
    .set_index('token')
    .text
    .groupby('token')
    .value_counts()
)
generate_and_save_map(df, 'dewiki_phrases')

In [None]:
# Generate map from Wiktionary data (parsed by IWNLP wiktionary-parser)
iwnlp_file = join(DATA_BASE, 'IWNLP.Lemmatizer_20170501.json')
# Decode explicitly as UTF-8: without an encoding, open() uses the platform's
# locale encoding, which can garble or reject non-ASCII word forms.
with open(iwnlp_file, 'r', encoding='utf-8') as fp:
    iwnlp = json.load(fp)

df = pd.DataFrame.from_records(iwnlp)
# Spread every 'Lemmas' value over separate columns so that stack() below
# yields one entry per element of 'Lemmas'.
x = df.Lemmas.apply(pd.Series)
df = (
    df
    .join(x)
    .set_index('Form', append=True)
    .drop('Lemmas', axis=1)
    .stack()
)

# Turn each stacked entry into one row; the fields POS, Form and Lemma are used below.
forms = pd.DataFrame.from_records(df.values).astype({'POS':'category'})
forms.index = df.index
# Keep nouns only. The entry's Lemma becomes the map's token, its Form the text.
df = forms.reset_index(drop=True).query('POS == "Noun"')
df = df.rename(columns={'Lemma':'token', 'Form':'text'}).set_index('token')
df = df.text.groupby('token').value_counts()
generate_and_save_map(df, 'wiktionary')

-----

### Create disambiguation and redirect mappings

In [None]:
# Load the phrases, meta, links and categories tables (in this order).
phrases, meta, links, cat = (
    load(name) for name in ('phrases', 'meta', 'links', 'categories')
)

In [None]:
# This cell corrects a bug that occurred when generating the links table (the original code has
# already been fixed). Rows whose link consists of a single character are treated as broken.
linklenmask = links.link.str.len().eq(1)
links_fix = links.loc[linklenmask]

def re_join(column):
    """Aggregate one column of a group of broken link rows.

    The columns 'norm', 'category' and 'hash_nlp' are represented by their
    first value; the values of any other column are concatenated into one string.
    """
    keep_first = column.name in {'norm', 'category', 'hash_nlp'}
    return column.values[0] if keep_first else ''.join(column)

# Merge the broken rows of each hash_nlp back into a single row (see re_join).
links_fix = links_fix.groupby('hash_nlp', sort=False, as_index=False).agg(re_join)
# Replace the broken rows by their repaired counterparts.
links = links[~linklenmask]
# pd.concat is used instead of DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0; the resulting frame is the same.
links = pd.concat([links, links_fix], sort=False)
links.to_pickle(join(ETL_PATH, 'dewiki_links.pickle'))
links

In [None]:
def restore_orig_title(row):
    """Rebuild the original title from a row's ``title`` and ``description``.

    Returns the bare title when ``row.description`` is None, otherwise the
    title followed by the description in parentheses.
    """
    description = row.description
    return row.title if description is None else f'{row.title} ({description})'

# Original title for every row of `meta`, keeping meta's index.
meta_orig_title = meta.loc[:, ['title', 'description']].apply(restore_orig_title, axis=1)
# Inverted lookup: original title -> index value of the corresponding `meta` row.
meta_orig_title_swap = pd.Series(data=meta_orig_title.index.values, index=meta_orig_title)

In [None]:
import spacy
from spacy.tokens import Token
from lemmatizer_plus import LemmatizerPlus

# NOTE(review): same file name as the IWNLP JSON built from DATA_BASE in the Wiktionary cell,
# but hard-coded relative to the working directory here — confirm both point to the same file.
lemmatizer_path='../data/IWNLP.Lemmatizer_20170501.json'
# Load the 'de' spaCy model with the parser component disabled.
nlp = spacy.load('de', disable=['parser'])
# Add the project's LemmatizerPlus component (initialised with the IWNLP file) to the pipeline;
# presumably it provides the `iwnlp_lemmas` token extension read by lemmatize() — TODO confirm.
lemmatizer = LemmatizerPlus(lemmatizer_path, nlp)
nlp.add_pipe(lemmatizer)

def lemmatize(title):
    """Return a ``(text, token)`` pair for a title.

    A title without a space is returned unchanged for both parts. Any other
    title is run through the ``nlp`` pipeline: the texts of its tokens and
    their lemmas are each joined with '_'. For the lemma, ``iwnlp_lemmas``
    takes precedence over ``lemma_``; the token text is the last resort.
    """
    if ' ' not in title:
        return title, title

    def pick_lemma(tok):
        # Preference order: IWNLP lemma, then spaCy lemma, then the raw text.
        if tok._.iwnlp_lemmas is not None:
            return tok._.iwnlp_lemmas
        if tok.lemma_ is not None:
            return tok.lemma_
        return tok.text

    doc = nlp(title)
    texts = [str(tok.text) for tok in doc]
    lemmas = [pick_lemma(tok) for tok in doc]
    return '_'.join(texts), '_'.join(lemmas)

In [None]:
# Building a disambiguation map. Can be used for synonyms.
# Restrict the meta table to rows whose description is 'Begriffsklärung' (disambiguation).
disamb = meta[meta.description == 'Begriffsklärung']
# Keep only the links whose hash_nlp matches the index of those meta rows (inner join)
# and drop the columns that are not needed below.
disamb = links.join(disamb, on='hash_nlp', how='inner').drop(['doc_id', 'description', 'length', 'doc_subid'], axis=1)
disamb = disamb.rename(columns={'hash_nlp': 'hash_disamb_page'})
disamb = disamb[['title', 'hash_disamb_page', 'link', 'norm', 'category']]
# Resolve each link to the meta index value of the page with that original title; 0 if unknown.
disamb['hash_link'] = disamb.link.map(lambda x: meta_orig_title_swap[x] if x in meta_orig_title_swap.index else 0)
# Move the page title and hash into the index, freeing the column name 'title' ...
disamb = disamb.set_index(['title', 'hash_disamb_page'])
# ... for the first part returned by split_title(link); the second part replaces 'category'.
tmp = disamb.link.apply(lambda x: pd.Series(split_title(x)))
disamb['title'], disamb['category'] = tmp[0], tmp[1]
disamb = disamb[['title', 'category', 'hash_link']]
# The split link title is the target of the disambiguation entry.
disamb = disamb.rename(columns={'title': 'link_to'})
# Bring the page title back as a column and replace it by the (text, token) pair from lemmatize().
disamb = disamb.reset_index(drop=False)
tmp = disamb.title.apply(lambda x: pd.Series(lemmatize(x)))
disamb['title'], disamb['token'] = tmp[0], tmp[1]
# Index by page and number the entries of each (hash_disamb_page, title, token) group from 0.
disamb = disamb.set_index(['hash_disamb_page', 'title', 'token'])
disamb['idx'] = disamb.groupby(['hash_disamb_page', 'title', 'token']).cumcount()
disamb = disamb.set_index('idx', append=True)
disamb.to_pickle(join(ETL_PATH, 'dewiki_disambiguation.pickle'))
disamb

In [None]:
# Drop rows without a category, index the rest by hash_nlp and persist the result.
cat = cat[cat.category.notnull()].set_index('hash_nlp')
cat.to_pickle(join(ETL_PATH, 'dewiki_categories.pickle'))
# Display the table as returned by load().
load('categories')

-----

In [None]:
def collect_redirects(x):
    """Extract the redirect rows of one frame and resolve their targets.

    :param x: DataFrame with at least the columns 'subset', 'doc_id', 'title',
        'description' and 'tags'. For rows with subset 'REDIRECT', 'tags' is
        read as a pair ``(target title, target description or None)``.
    :return: new DataFrame with the columns 'doc_id', 'title', 'description',
        'directs_to' (restored target title) and 'hash_directs_to' (the value
        ``meta_orig_title_swap`` holds for that title, or 0 if it is unknown).
        The input frame is not modified.
    """
    # Select rows and columns in one step and copy the result: the columns
    # added below are then written to an independent frame instead of a slice
    # of `x` (which triggers pandas' SettingWithCopyWarning).
    redir = x.loc[x.subset == 'REDIRECT', ['doc_id', 'title', 'description', 'tags']].copy()

    def restore_title(tpl):
        # Rebuild 'title (description)' from the pair; a None description means bare title.
        if tpl[1] is None:
            return tpl[0]
        else:
            return f'{tpl[0]} ({tpl[1]})'

    redir['directs_to'] = redir.tags.map(restore_title)
    redir['hash_directs_to'] = redir.directs_to.map(
        lambda target: meta_orig_title_swap[target] if target in meta_orig_title_swap.index else 0
    )
    redir = redir.drop('tags', axis=1)
    return redir

# Collect the redirects from every 'dewiki_<digit>...' file in the 'deprecated' folder.
dep_path = join(ETL_PATH, 'deprecated')
pattern = re.compile(r'dewiki_\d')
files = sorted(
    name for name in listdir(dep_path)
    if isfile(join(dep_path, name)) and pattern.match(name)
)
redirects = []
for file in files:
    gc.collect()
    f = join(dep_path, file)
    print('Reading', f)
    df = pd.read_pickle(f)
    df = collect_redirects(df)
    redirects.append(df)
    gc.collect()
redirects = pd.concat(redirects)
# Replace each title by the (text, token) pair returned by lemmatize().
rtmp = redirects.title.progress_apply(lambda x: pd.Series(lemmatize(x)))
redirects['title'] = rtmp[0]
redirects['token'] = rtmp[1]
column_order = ['doc_id', 'title', 'token', 'description', 'directs_to', 'hash_directs_to']
redirects = redirects[column_order]
redirects.to_pickle(join(ETL_PATH, 'dewiki_redirects.pickle'))