# Dewiki dataset overview

Get an overview of all the different Wikipedia files without having to load them yourself.

Note that the problem with differing hash ids has already been solved.

In [1]:
%matplotlib inline
from collections import defaultdict
from constants import *
from os import listdir
from os.path import isfile, join
import gc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from tqdm._tqdm_notebook import tqdm_notebook
# Register tqdm's progress-bar hooks on pandas objects.
tqdm_notebook.pandas()
# Let pandas render up to 500 rows before it truncates the displayed output.
pd.set_option('display.max_rows', 500)

In [2]:
def _read_etl_pickle(file_name):
    """Return the pickled pandas object stored as `file_name` below ETL_PATH."""
    # NOTE(review): unpickling can execute arbitrary code - only load files produced by our own pipeline.
    return pd.read_pickle(join(ETL_PATH, file_name))


# --- pickle file names of the ETL artefacts -------------------------------------
dewiki_categories = 'dewiki_categories.pickle'
dewiki_good_ids = 'dewiki_good_ids.pickle'
dewiki_hashmap = 'dewiki_hashmap.pickle'
dewiki_links = 'dewiki_links.pickle'
dewiki_metadata = 'dewiki_metadata.pickle'
dewiki_new = 'dewiki_new.pickle'
dewiki_phrases = 'dewiki_phrases.pickle'
dewiki_phrases_joined = 'dewiki_phrases_joined.pickle'
dewiki_phrases_lemmatized = 'dewiki_phrases_lemmatized.pickle'

# --- load every artefact, in the same order as the names above ------------------
df_dewiki_categories = _read_etl_pickle(dewiki_categories)
df_dewiki_good_ids = _read_etl_pickle(dewiki_good_ids)
df_dewiki_hashmap = _read_etl_pickle(dewiki_hashmap)
df_dewiki_links = _read_etl_pickle(dewiki_links)
df_dewiki_metadata = _read_etl_pickle(dewiki_metadata)
df_dewiki_new = _read_etl_pickle(dewiki_new)
df_dewiki_phrases = _read_etl_pickle(dewiki_phrases)
df_dewiki_phrases_joined = _read_etl_pickle(dewiki_phrases_joined)
df_dewiki_phrases_lemmatized = _read_etl_pickle(dewiki_phrases_lemmatized)

In [3]:
# Categories extracted from the Wikipedia articles. Find the corresponding document via hash_nlp.
df_dewiki_categories

Unnamed: 0,category,hash_nlp
0,Fiktive Person,8952056961092092653
1,Pseudonym,8952056961092092653
2,Sammelpseudonym,8952056961092092653
3,Werk von Alan Smithee,8952056961092092653
4,Ang Lee,8442369265370766621
5,Drehbuchautor,8442369265370766621
6,Filmregisseur,8442369265370766621
7,Oscarpreisträger,8442369265370766621
8,Namensgeber für einen Asteroiden,8442369265370766621
9,Mitglied der American Academy of Arts and Scie...,8442369265370766621


In [4]:
# Some articles have been discarded since they were disambiguation pages, redirects or lists,
# or because they had an uncommon word distribution (e.g. a Japanese title).
# This Series contains the hash ids of the articles that we deem valid.
# Note that we pose further constraints on label candidates (title length < 5, document length > 40)
# which are not included in this list.
# The Series' values are the original, non-normalized Wikipedia titles.
df_dewiki_good_ids

hash_nlp
 8952056961092092653                                         Alan Smithee
 598046625986755870                                              Actinium
 8442369265370766621                                              Ang Lee
-5325279570187525080                                            Anschluss
 5107548614255273253                                        Aussagenlogik
-6810310479569543740                                    Anthony Minghella
-291419119128528545                                US-amerikanischer Film
-686601136003585762                             Vorsätze für Maßeinheiten
-928617659304474122                         Abkürzungen/Gesetze und Recht
-4698193686953049209    Liste von Unternehmen mit Namensherkunftserklä...
-1053298580323217908                                             ISO 4217
-2937308267488324601                                         Achsensprung
 1251422564758405786                                     Alfred Hitchcock
-5850162359302794440         

In [5]:
# Since Python's native hash function is not deterministic, we were confronted with differing hash ids in
# subsequent versions of the corpus. Therefore you will find some datasets with a column titled 'hash_nlp'
# instead of just 'hash', to make clear which hash generation the set belongs to.
# This Series provides a mapping between these two generations. In the meantime this problem has been fixed
# for all datasets and only the second column (the Series' values) is valid.
df_dewiki_hashmap

hash
 8071923686821298358    8952056961092092653
 5678176573632095564     598046625986755870
 7033279504381600369    8442369265370766621
-4350259794544444242   -5325279570187525080
-4564559512166538170    6198219068206011602
-2303248065612086942    5107548614255273253
 5531540912683666157   -2101470387989585062
-6716922945142671595   -2301747401718633066
-7770327414723038136    1590387679453058251
 3971016231991622804   -8061163938924218197
 2697634987118638512   -7105424327842951596
-20905791464122541     -3622238202242836338
 6143270201102907714   -8615994372994386619
 4206864570771268130    3227898248457871805
-5056945752789577607    8663527490855367507
 1427027150113426887   -8910893347690792076
 6156002186407445550    1212817603137507389
-5000009069028732226    5478328227396590687
 5090378819219657719    6899587403190626614
 8550880302074932780   -2104839605768432909
-879822152255749465       77918429177252947
 7824017624818634074   -6414269038722096799
 4809512197660073610   -906

In [6]:
# Links extracted from the Wikipedia articles. Find the corresponding document via hash_nlp.
# column link:     original text
# column norm:     normalized version of the link which was taken directly from the article
# column category: if the link (original or normalized) was followed by parentheses, this indicates a category,
#                  which has been extracted into this column and removed from the link
df_dewiki_links

Unnamed: 0,link,norm,category,hash_nlp
0,Pseudonym,,,8952056961092092653
1,Regisseur,,,8952056961092092653
2,Directors Guild of America,,,8952056961092092653
3,Internet Movie Database,,,8952056961092092653
4,Frank Patch – Deine Stunden sind gezählt,,,8952056961092092653
5,Robert Totten,,,8952056961092092653
6,Richard Widmark,,,8952056961092092653
7,Don Siegel,,,8952056961092092653
8,Handschrift,Manier,Stil,8952056961092092653
9,Anagramm,,,8952056961092092653


In [7]:
# If you need just the metadata without loading the full corpus, this table comes in handy.
# It also provides additional information such as the number of characters (column length).
# For concatenated, lemmatized versions of the title, or for the title length / number of tokens,
# refer to dewiki_phrases_lemmatized.
df_dewiki_metadata

Unnamed: 0_level_0,doc_id,title,description,length,doc_subid
hash_nlp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8952056961092092653,1,Alan Smithee,,4840,1.0
598046625986755870,3,Actinium,,5554,1.0
8442369265370766621,5,Ang Lee,,12869,1.0
-5325279570187525080,7,Anschluss,Soziologie,2600,1.0
5107548614255273253,10,Aussagenlogik,,51077,1.0
1590387679453058251,13,Liste von Autoren/A,,15240,1.0
-8061163938924218197,14,Liste von Autoren/H,,11228,1.0
-7105424327842951596,15,Liste von Autoren/C,,9861,1.0
-3622238202242836338,16,Liste von Autoren/I,,4250,1.0
-8615994372994386619,17,Liste von Autoren/K,,7332,1.0


In [8]:
# The full Wikipedia dataset prior to the NLP pipeline and without any additional preprocessing.
# The corpus was extracted using Wikiextractor, which delivered better results (fewer artifacts) than the
# original extraction (hence the 'new' suffix). The original extraction, however, was not in vain, since it
# additionally provided the extracted links and categories.
df_dewiki_new

Unnamed: 0_level_0,doc_id,text,title,description,doc_subid
hash_nlp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8952056961092092653,1,Alan Smithee steht als Pseudonym für einen fik...,Alan Smithee,,1.0
598046625986755870,3,Actinium ist ein radioaktives chemisches Eleme...,Actinium,,1.0
8442369265370766621,5,"Ang Lee (; * 23. Oktober 1954 in Chaozhou, Tai...",Ang Lee,,1.0
-5325279570187525080,7,Anschluss ist in der Soziologie ein Fachbegrif...,Anschluss,Soziologie,1.0
5107548614255273253,10,Die Aussagenlogik ist ein Teilgebiet der Logik...,Aussagenlogik,,1.0
1590387679453058251,13,,Liste von Autoren/A,,1.0
-8061163938924218197,14,,Liste von Autoren/H,,1.0
-7105424327842951596,15,,Liste von Autoren/C,,1.0
-3622238202242836338,16,,Liste von Autoren/I,,1.0
-8615994372994386619,17,,Liste von Autoren/K,,1.0


In [9]:
# Phrases from Wikipedia titles with up to 5 tokens, not lemmatized.
# Note that the mapping to the document hash is lost here.
#
# deprecated
df_dewiki_phrases

Unnamed: 0,0,1,2,3,4
25087,0.,Dynastie,,,
25088,0.,Klavierkonzert,,,
25089,00,Schneider,,,
25092,007,James,Bond,greift,ein
25100,007,–,Alles,oder,Nichts
25101,007:,Nightfire,,,
25102,00Sex,am,Wolfgangsee,,
25103,01,Strings,,,
25107,030,Magazin,Berlin,,
25109,07,Vestur,,,


In [11]:
# Like df_dewiki_phrases, but the phrase tokens are now joined.
#
# deprecated
df_dewiki_phrases_joined

25087                                            0._Dynastie
25088                                      0._Klavierkonzert
25089                                           00_Schneider
25092                              007_James_Bond_greift_ein
25100                                007_–_Alles_oder_Nichts
25101                                         007:_Nightfire
25102                                   00Sex_am_Wolfgangsee
25103                                             01_Strings
25107                                     030_Magazin_Berlin
25109                                              07_Vestur
25111                                         0711_/_Cycling
25112                                     0711_Entertainment
25113                                         07th_Expansion
25115                           08/15_(Redewendung)#Herkunft
25116                                     08/15_Zweiter_Teil
25117                                    08/15_in_der_Heimat
25120                   

In [10]:
# A more sophisticated way of creating title phrases. It is also useful as a mapping from lemmatized phrases
# (which are likely to be less readable) to the original titles.
# doc_len (number of words) and title_len are needed for filtering out irrelevant labels when generating
# label candidates.
df_dewiki_phrases_lemmatized

Unnamed: 0_level_0,sent_idx,text,token,doc_len,title_len
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8952056961092092653,1,Alan_Smithee,Alan_Smithee,747,2
598046625986755870,44,Actinium,Actinium,866,1
8442369265370766621,98,Ang_Lee,ANG_Lee,1632,2
-5325279570187525080,181,Anschluss,Anschluss,413,1
5107548614255273253,199,Aussagenlogik,Aussagenlogik,7211,1
-6810310479569543740,599,Anthony_Minghella,Anthony_Minghella,637,2
-291419119128528545,636,US-amerikanischer_Film,us-amerikanisch_Film,2992,2
-686601136003585762,767,Vorsätze_für_Maßeinheiten,Vorsatz_für_Maßeinheit,992,3
-928617659304474122,829,Abkürzungen/Gesetze_und_Recht,Abkürzungen/Gesetz_und_Recht,167,3
-4698193686953049209,842,Liste_von_Unternehmen_mit_Namensherkunftserklä...,Liste_von_Unternehmen_mit_Namensherkunftserklä...,42,5


In [3]:
# Raw output of the NLP pipeline for the first chunk of documents (stored below NLP_PATH).
dewiki_new_0_49999_nlp = 'dewiki_new_0_49999_nlp.pickle'
df_dewiki_new_0_49999_nlp = pd.read_pickle(
    join(NLP_PATH, dewiki_new_0_49999_nlp)
)

# 'simple' version of that chunk (stored below SMPL_PATH).
dewiki_new_0_49999__10000_simple = 'dewiki/dewiki_new_0_49999__10000_simple.pickle'
df_dewiki_new_0_49999__10000_simple = pd.read_pickle(
    join(SMPL_PATH, dewiki_new_0_49999__10000_simple)
)

# Cached variant of the 'simple' file (also stored below SMPL_PATH).
dewiki_new_0_49999__10000_simple_cache = 'dewiki/cache/dewiki_new_0_49999__10000_simple_cache.pickle'
df_dewiki_new_0_49999__10000_simple_cache = pd.read_pickle(
    join(SMPL_PATH, dewiki_new_0_49999__10000_simple_cache)
)

In [4]:
# This is the tokenized representation of the corpus after the NLP pipeline.
# hash:        document hash id
# tok_idx:     index of a token (document scope)
# sent_idx:    index of a sentence (file scope)
# text:        original token
# token:       lemmatized version (or the original if lemmatization failed)
# POS:         universal tagset
# ent_iob:     indicates membership in a named entity: B->beginning, I->inside, O->outside (no NE)
# ent_idx:     NOTE(review): this column is present in the frame but was never documented;
#              presumably the index of the named entity, with 0 meaning "no entity" - confirm
# ent_type:    so far unused
# noun_phrase: index of the spaCy noun_chunk (file scope)
#
# The files have been split since the whole corpus would be problematic to load into memory or to process at once.
# The filename suffix indicates at which number of documents the split has been performed.
# Note, however, that documents not in 'dewiki_good_ids' are not included, since they were discarded prior to
# the NLP pipeline.
#
# It is noteworthy that the result of the spaCy/IWNLP pipeline is not perfect with respect to tokenization,
# lemmatization, POS-tagging and noun_phrase detection.
#
df_dewiki_new_0_49999_nlp

Unnamed: 0,hash,tok_idx,sent_idx,text,token,POS,ent_iob,ent_idx,ent_type,noun_phrase
0,8952056961092092653,0,1,Alan,Alan,PROPN,B,1,PER,1
1,8952056961092092653,1,1,Smithee,Smithee,PROPN,I,1,PER,1
2,8952056961092092653,2,1,\n,,SPACE,I,1,PER,0
3,8952056961092092653,3,2,Alan,Alan,PROPN,B,2,PER,2
4,8952056961092092653,4,2,Smithee,Smithee,PROPN,I,2,PER,2
5,8952056961092092653,5,2,steht,stehen,VERB,O,0,,0
6,8952056961092092653,6,2,als,als,ADP,O,0,,0
7,8952056961092092653,7,2,Pseudonym,Pseudonym,NOUN,O,0,,3
8,8952056961092092653,8,2,für,für,ADP,O,0,,0
9,8952056961092092653,9,2,einen,ein,DET,O,0,,4


In [5]:
# The suffix 'simple' refers to a simplified and further processed version of the raw NLP corpus.
# Besides the removal of unnecessary columns, it most importantly replaces single tokens with phrases
# if adjacent tokens form an entity, a noun phrase, a street or a Wikipedia title.
#
# For memory efficiency in later doc2vec and word2vec trainings, the 'simple' files have been split even
# further compared to their 'nlp' source files.
df_dewiki_new_0_49999__10000_simple

Unnamed: 0,hash,POS,sent_idx,tok_idx,token
0,8952056961092092653,NPHRASE,1,0,Alan_Smithee
1,8952056961092092653,NPHRASE,2,3,Alan_Smithee
2,8952056961092092653,VERB,2,5,stehen
3,8952056961092092653,ADP,2,6,als
4,8952056961092092653,NOUN,2,7,Pseudonym
5,8952056961092092653,ADP,2,8,für
6,8952056961092092653,DET,2,9,ein
7,8952056961092092653,ADJ,2,10,fiktiv
8,8952056961092092653,NOUN,2,11,Regisseur
9,8952056961092092653,PUNCT,2,12,","


In [6]:
# A cached version of the simple files for word2vec training (list of tuples).
# Each tuple represents a lemmatized sentence.
df_dewiki_new_0_49999__10000_simple_cache

hash                  sent_idx
 8952056961092092653  1                                             (Alan_Smithee,)
                      2           (Alan_Smithee, stehen, als, Pseudonym, für, ei...
                      3           (von, 1968, bis, 2000, werden, es, von, der, D...
                      4           (Alan_Smithee, sein, jedoch, weiterhin, in, Ge...
                      5           (Alternative, Schreibweise, sein, unter, ander...
                      6                         (Alan, Smythee, und, Adam, Smithee)
                      7           (auch, zwei, teilweise, asiatisch, anmutend, S...
                      8           (Alan_Smi_Thee, und, Sumishii, Aran, gehören, ...
                      9           (Das, Pseudonym, entstehen, 1968, infolge, der...
                      10                              (Deine, Stunde, sein, zählen)
                      11          (Regisseur, Robert, Totten, und, Hauptdarstell...
                      12          (Der_Film, 