In [93]:
from constants import DATA_BASE, ETL_PATH, DATASETS
from utils import load
import pandas as pd
from os import listdir
from os.path import isfile, join
import json
import re
import gc
from etl_wikipedia import split_title
from tqdm._tqdm_notebook import tqdm_notebook
# Register tqdm with pandas so that `progress_apply` is available (used on the redirect titles further down).
tqdm_notebook.pandas()

-----

### Create a lemmatization map from Wiktionary data

(parsed by IWNLP wiktionary-parser)

In [5]:
# Load the IWNLP lemmatizer dump: a JSON list of {'Form': ..., 'Lemmas': [...]} records.
iwnlp_file = join(DATA_BASE, 'IWNLP.Lemmatizer_20170501.json')
# Decode explicitly as UTF-8: the entries are German word forms, and relying on the
# platform default encoding makes the result depend on the machine's locale.
with open(iwnlp_file, 'r', encoding='utf-8') as fp:
    iwnlp = json.load(fp)

# Preview only the first records instead of dumping the whole list into the output.
iwnlp[:10]

[{'Form': 'hallo',
  'Lemmas': [{'POS': 'Noun', 'Form': 'Hallo', 'Lemma': 'Hallo'},
   {'POS': 'X', 'Form': 'hallo', 'Lemma': 'hallo'}]},
 {'Form': 'hallos',
  'Lemmas': [{'POS': 'Noun', 'Form': 'Hallos', 'Lemma': 'Hallo'}]},
 {'Form': 'subfamilia',
  'Lemmas': [{'POS': 'Noun', 'Form': 'Subfamilia', 'Lemma': 'Subfamilia'}]},
 {'Form': 'subfamiliae',
  'Lemmas': [{'POS': 'Noun', 'Form': 'Subfamiliae', 'Lemma': 'Subfamilia'}]},
 {'Form': 'subregnum',
  'Lemmas': [{'POS': 'Noun', 'Form': 'Subregnum', 'Lemma': 'Subregnum'}]},
 {'Form': 'subregna',
  'Lemmas': [{'POS': 'Noun', 'Form': 'Subregna', 'Lemma': 'Subregnum'}]},
 {'Form': 'subregnums',
  'Lemmas': [{'POS': 'Noun', 'Form': 'Subregnums', 'Lemma': 'Subregnum'}]},
 {'Form': 'subdivisio',
  'Lemmas': [{'POS': 'Noun', 'Form': 'Subdivisio', 'Lemma': 'Subdivisio'}]},
 {'Form': 'subdivisiones',
  'Lemmas': [{'POS': 'Noun', 'Form': 'Subdivisiones', 'Lemma': 'Subdivisio'}]},
 {'Form': 'phylum',
  'Lemmas': [{'POS': 'Noun', 'Form': 'Phylum', '

In [None]:
# One row per inflected form; the 'Lemmas' column holds that form's list of lemma dicts.
df = pd.DataFrame.from_records(iwnlp)
# Spread every 'Lemmas' list over integer-named columns, one lemma dict per column.
x = df.Lemmas.apply(pd.Series)
df = (
    df
    .join(x)
    # index becomes (row number, Form)
    .set_index('Form', append=True)
    .drop('Lemmas', axis=1)
    # long format: one entry per (row number, Form, lemma position)
    .stack()
)

# Expand the stacked lemma dicts into columns and reuse the stacked three-level index.
forms = pd.DataFrame.from_records(df.values).astype({'POS':'category'})
forms.index = df.index
# NOTE(review): the bare `forms` below has no effect - only the last expression of a cell is displayed.
forms

# Persist the map; the next cell reads it back via load('wikt', 'lemmap').
forms.to_pickle(join(ETL_PATH, 'wiktionary_lemmatization_map.pickle'))

In [3]:
# Read the pickled Wiktionary lemmatization map back in and display it.
load('wikt', 'lemmap')

Reading ../data/preprocessed/wiktionary_lemmatization_map.pickle


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Form,Lemma,POS
Unnamed: 0_level_1,Form,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,hallo,0,Hallo,Hallo,Noun
0,hallo,1,hallo,hallo,X
1,hallos,0,Hallos,Hallo,Noun
2,subfamilia,0,Subfamilia,Subfamilia,Noun
3,subfamiliae,0,Subfamiliae,Subfamilia,Noun
4,subregnum,0,Subregnum,Subregnum,Noun
5,subregna,0,Subregna,Subregnum,Noun
6,subregnums,0,Subregnums,Subregnum,Noun
7,subdivisio,0,Subdivisio,Subdivisio,Noun
8,subdivisiones,0,Subdivisiones,Subdivisio,Noun


-----

### Create lemmatization maps for all datasets

They will be used to greedily undo lemmatization.

In [4]:
# Accept only strings that start with an ASCII letter or a German umlaut;
# applied with `str.match` to the token column when the dataset maps are built.
pat = re.compile(r'^[A-Za-zÄÖÜäöü].*')

In [None]:
# NOTE(review): 'dewa1' differs from the 'dewac1' key that is loaded in the following cell -
# confirm that it is a valid key for load()/DATASETS and not a typo.
datasets = ['dewa1', 'E', 'FA', 'FO', 'O', 'P']

for dataset in datasets:
    df = load(dataset, 'nlp')
    # keep only rows whose token starts with a letter (see `pat`)
    df = df[df.token.str.match(pat)]
    # for every token (presumably the lemma - TODO confirm) count how often each `text` value occurs
    df = df.groupby('token').apply(lambda x: x.text.value_counts())
    # one pickle per dataset, named after its DATASETS entry
    df.to_pickle(join(ETL_PATH, f'{DATASETS[dataset]}_lemmatization_map.pickle'))

In [2]:
# Inspect one of the per-dataset maps: (token, text) pairs with their occurrence counts.
load('dewac1', 'lemmap')

Reading ../data/preprocessed/dewac_nbfiles01_lemmatization_map.pickle


token                                           
A                        a                          12868
                         A                           5891
A"-                      A"-                            1
A"-Land                  A"-Land                        1
                         A"-Landes                      1
A$                       A$                            15
A&A                      A&A                           45
A&D                      A&D                            5
A&E                      A&E                            3
A&E-Abteilung            A&E-Abteilung                  1
A&F                      A&F                            1
A&M                      A&M                            5
A&O                      A&O                            6
A&P                      A&P                            1
A&R                      A&R                           56
A&R-                     A&R-                           5
A&R-(Artist            

-----

### Create disambiguation and redirect mappings

In [None]:
# Phrase table; judging by the recorded output this reads dewiki_phrases_lemmatized.pickle.
phrases = load('phrases')

In [2]:
# Article metadata (reads dewiki_metadata.pickle); supplies the title/description columns used below.
meta = load('meta')

Reading ../data/preprocessed/dewiki_metadata.pickle


In [None]:
# Link table; it is repaired and written back to dewiki_links.pickle in a later cell.
links = load('links')

In [2]:
# Category table; null categories are dropped and it is re-indexed by hash_nlp further below.
cat = load('categories')

Reading ../data/preprocessed/dewiki_phrases_lemmatized.pickle
Reading ../data/preprocessed/dewiki_metadata.pickle
Reading ../data/preprocessed/dewiki_links.pickle
Reading ../data/preprocessed/dewiki_categories.pickle


In [3]:
# This cell corrects a bug that occurred when generating the links table
# (the original code has already been fixed).
# Rows whose link is a single character are selected here; later in this cell they are
# grouped by hash_nlp and their characters are joined back together.
linklenmask = (links.link.str.len() == 1)
links_fix = links[linklenmask]

def re_join(column):
    """Aggregate one column of a group of single-character link rows.

    The 'norm', 'category' and 'hash_nlp' columns keep their first value;
    every other column is concatenated into a single string.
    """
    keep_first = column.name in {'norm', 'category', 'hash_nlp'}
    return column.values[0] if keep_first else ''.join(column)

# Collapse every hash_nlp group of single-character rows back into one row.
links_fix = links_fix.groupby('hash_nlp', sort=False, as_index=False).agg(re_join)
# Drop the broken rows and add their repaired replacements.
links = links[~linklenmask]
# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat is the equivalent call.
links = pd.concat([links, links_fix], sort=False)
# NOTE: this overwrites the links pickle in ETL_PATH with the repaired table.
links.to_pickle(join(ETL_PATH, 'dewiki_links.pickle'))
links

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,category,hash_nlp,link,norm
0,,8952056961092092653,Pseudonym,
1,,8952056961092092653,Regisseur,
2,,8952056961092092653,Directors Guild of America,
3,,8952056961092092653,Internet Movie Database,
4,,8952056961092092653,Frank Patch – Deine Stunden sind gezählt,
5,,8952056961092092653,Robert Totten,
6,,8952056961092092653,Richard Widmark,
7,,8952056961092092653,Don Siegel,
8,Stil,8952056961092092653,Handschrift,Manier
9,,8952056961092092653,Anagramm,


In [3]:
def restore_orig_title(row):
    """Rebuild the original page title from a row's `title` and `description`.

    Returns the bare title when `description` is None, otherwise
    'title (description)'.
    """
    return row.title if row.description is None else f'{row.title} ({row.description})'

# Original title for every metadata row, carrying the index of `meta`.
meta_orig_title = meta[['title', 'description']].apply(restore_orig_title, axis=1)
# Reverse lookup: original title -> `meta` index value (used below to resolve link and redirect targets).
meta_orig_title_swap = pd.Series(meta_orig_title.index.values, index=meta_orig_title)

In [56]:
import spacy
from spacy.tokens import Token
from lemmatizer_plus import LemmatizerPlus

# Build the spaCy pipeline used by `lemmatize`.
# Resolve the IWNLP dump through DATA_BASE (as the Wiktionary cell at the top does)
# instead of a hard-coded relative path, so both cells always read the same file.
lemmatizer_path = join(DATA_BASE, 'IWNLP.Lemmatizer_20170501.json')
# The dependency parser is disabled when loading the German model.
nlp = spacy.load('de', disable=['parser'])
# LemmatizerPlus is appended as a pipeline component; presumably it provides the
# `iwnlp_lemmas` token extension that `lemmatize` reads - confirm in lemmatizer_plus.
lemmatizer = LemmatizerPlus(lemmatizer_path, nlp)
nlp.add_pipe(lemmatizer)

def lemmatize(title):
    """Return `(text, token)` for a title: its surface form and its lemmatized form.

    Titles without a space are returned unchanged in both positions. Any other
    title is run through `nlp`; the token texts and the chosen lemmas are each
    joined with '_'.
    """
    if ' ' not in title:
        return title, title

    def pick_lemma(tok):
        # preference order: IWNLP lemma, then the spaCy lemma, then the raw text
        if tok._.iwnlp_lemmas is not None:
            return tok._.iwnlp_lemmas
        if tok.lemma_ is not None:
            return tok.lemma_
        return tok.text

    doc = nlp(title)
    surface = [str(tok.text) for tok in doc]
    lemmas = [pick_lemma(tok) for tok in doc]
    return '_'.join(surface), '_'.join(lemmas)

In [None]:
# Build a disambiguation map (can be used for synonyms): one row per link on a disambiguation page.
# Disambiguation pages are the metadata rows whose description is 'Begriffsklärung'.
disamb = meta[meta.description == 'Begriffsklärung']
# Inner join: keep only links whose hash_nlp matches the index of a disambiguation page.
disamb = links.join(disamb, on='hash_nlp', how='inner').drop(['doc_id', 'description', 'length', 'doc_subid'], axis=1)
disamb = disamb.rename(columns={'hash_nlp': 'hash_disamb_page'})
disamb = disamb[['title', 'hash_disamb_page', 'link', 'norm', 'category']]
# Resolve every link target to its `meta` index value; 0 marks targets without a matching original title.
disamb['hash_link'] = disamb.link.map(lambda x: meta_orig_title_swap[x] if x in meta_orig_title_swap.index else 0)
# Move the page title and page hash into the index ...
disamb = disamb.set_index(['title', 'hash_disamb_page'])
# ... so that the first two results of split_title(link) can be stored as a new 'title' column
# and overwrite 'category' (presumably base title and qualifier - TODO confirm in etl_wikipedia).
tmp = disamb.link.apply(lambda x: pd.Series(split_title(x)))
disamb['title'], disamb['category'] = tmp[0], tmp[1]
disamb = disamb[['title', 'category', 'hash_link']]
# The 'title' column holds the link target here; rename it before the page title comes back from the index.
disamb = disamb.rename(columns={'title': 'link_to'})
disamb = disamb.reset_index(drop=False)
# Lemmatize the page title: 'title' receives the joined token texts, 'token' the joined lemmas.
tmp = disamb.title.apply(lambda x: pd.Series(lemmatize(x)))
disamb['title'], disamb['token'] = tmp[0], tmp[1]
disamb = disamb.set_index(['hash_disamb_page', 'title', 'token'])
# Number the rows within each (page, title, token) group and append that counter to the index.
disamb['idx'] = disamb.groupby(['hash_disamb_page', 'title', 'token']).cumcount()
disamb = disamb.set_index('idx', append=True)
disamb.to_pickle(join(ETL_PATH, 'dewiki_disambiguation.pickle'))
disamb

In [None]:
# keep only the rows that carry a category, index them by hash_nlp,
# write the result back and reload it for display
cat = (
    cat[cat.category.notnull()]
    .set_index('hash_nlp')
)
cat.to_pickle(join(ETL_PATH, 'dewiki_categories.pickle'))
load('categories')

-----

In [4]:
def collect_redirects(x):
    """Extract the redirect rows from one chunk of parsed pages.

    Args:
        x: DataFrame with at least the columns 'subset', 'doc_id', 'title',
            'description' and 'tags'. For rows whose subset is 'REDIRECT',
            'tags' is indexed as a (title, description) pair naming the target.

    Returns:
        A new DataFrame with the columns 'doc_id', 'title', 'description',
        'directs_to' (restored original title of the target) and
        'hash_directs_to' (the target's entry in the module-level
        `meta_orig_title_swap`, or 0 when the target title is not in it).
        The input frame is left untouched.
    """
    # .copy() detaches the selection from `x`, so the column assignments below
    # neither raise SettingWithCopyWarning nor depend on view/copy semantics.
    redir = x.loc[x.subset == 'REDIRECT', ['doc_id', 'title', 'description', 'tags']].copy()

    def restore_title(tpl):
        # same format as the original page titles: 'title' or 'title (description)'
        if tpl[1] is None:
            return tpl[0]
        return f'{tpl[0]} ({tpl[1]})'

    known_titles = meta_orig_title_swap.index
    redir['directs_to'] = redir.tags.map(restore_title)
    redir['hash_directs_to'] = redir.directs_to.map(
        lambda target: meta_orig_title_swap[target] if target in known_titles else 0
    )
    return redir.drop('tags', axis=1)

# Collect the redirects from every matching pickle in the 'deprecated' folder.
dep_path = join(ETL_PATH, 'deprecated')
# Only files whose name starts with 'dewiki_' followed by a digit are read.
pattern = re.compile(r'dewiki_\d')
files = sorted([f for f in listdir(dep_path) if (isfile(join(dep_path, f)) and pattern.match(f))])
redirects = []
for file in files:
    # run the garbage collector before and after each file to keep memory usage down
    gc.collect()
    f = join(dep_path, file)
    print('Reading', f)
    # NOTE: unpickling can execute arbitrary code - only read files produced by this project
    df = pd.read_pickle(f)
    df = collect_redirects(df)
    redirects.append(df)
    gc.collect()
redirects = pd.concat(redirects)
# Lemmatize the redirect titles with a progress bar: 'title' receives the joined token texts,
# 'token' the joined lemmas.
rtmp = redirects.title.progress_apply(lambda x: pd.Series(lemmatize(x)))
redirects.title, redirects['token'] = rtmp[0], rtmp[1]
redirects = redirects[['doc_id', 'title', 'token', 'description', 'directs_to', 'hash_directs_to']]
redirects.to_pickle(join(ETL_PATH, 'dewiki_redirects.pickle'))