In [1]:
%matplotlib inline
from constants import *
from os import listdir
from os.path import isfile, join
import gc
import matplotlib.pyplot as plt
import pandas as pd
import re
from itertools import islice
from tqdm._tqdm_notebook import tqdm_notebook
# Hook tqdm's notebook progress bar into pandas (tqdm's pandas integration).
tqdm_notebook.pandas()
# Raise the row limit pandas uses when rendering frames in this notebook.
pd.set_option('display.max_rows', 2001)

-----

#### original data

In [None]:
# Build one metadata frame (ids, title, description, text length) from the
# ARTICLE rows of every `dewiki_<n>` pickle stored directly in ETL_PATH.
pattern = re.compile(r'dewiki_\d')

# Reduce the directory listing to unique corpus stems. The loop below always
# reads `<stem>.pickle`, so only pickle files are relevant, and the set makes
# sure a stem is loaded once even if several files share it.
corpora = sorted({
    f.split('.')[0] for f in listdir(ETL_PATH)
    if isfile(join(ETL_PATH, f)) and pattern.match(f) and f.endswith('.pickle')
})

metadata = []
for corpus in corpora:
    # Everything after the first underscore becomes the numeric sub-id;
    # further underscores are turned into the decimal point ('03_2' -> 3.2).
    subid = float(corpus.split('_', 1)[1].replace('_', '.'))
    print(corpus, subid, end=', ')
    raw = pd.read_pickle(join(ETL_PATH, corpus + '.pickle'))
    # Filter first so the text length is only computed for rows that are kept.
    articles = raw.loc[raw.subset == 'ARTICLE']
    metadata.append(
        articles[[ID, TITLE, DESCR]]
        .assign(**{'length': articles.text.apply(len), ID2: subid})
    )
    # Release the full-text frames before the next pickle is loaded.
    del raw, articles
    gc.collect()

df = pd.concat(metadata)

In [None]:
# Persist the concatenated metadata frame next to the source pickles.
metadata_file = join(ETL_PATH, 'dewiki_metadata.pickle')
df.to_pickle(metadata_file)

In [2]:
# Reload the stored metadata and re-index it by the ID column; the previous
# index is kept as a regular column by reset_index().
metadata_file = join(ETL_PATH, 'dewiki_metadata.pickle')
df = pd.read_pickle(metadata_file)
df = df.reset_index()
df = df.set_index(ID)
print(len(df), df.index.dtype, '\n', df.dtypes)
df.head()

2215487 int64 
 hash_nlp         int64
title           object
description     object
length           int64
doc_subid      float64
dtype: object


Unnamed: 0_level_0,hash_nlp,title,description,length,doc_subid
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,8952056961092092653,Alan Smithee,,4840,1.0
3,598046625986755870,Actinium,,5554,1.0
5,8442369265370766621,Ang Lee,,12869,1.0
7,-5325279570187525080,Anschluss,Soziologie,2600,1.0
10,5107548614255273253,Aussagenlogik,,51077,1.0


-----

#### additional metadata

In [None]:
# Collect the remaining columns of the deprecated dewiki pickles, restrict them
# to the documents listed in `meta`, and store the result.
deprecated_dir = join(ETL_PATH, 'deprecated')
name_filter = re.compile(r'^dewiki_[0-3]')
deprecated_files = sorted(
    name for name in listdir(deprecated_dir) if name_filter.match(name)
)

# Columns removed from every frame right after loading.
redundant = ['text', 'title', 'description', 'doc_subid']
frames = []
for fname in deprecated_files:
    gc.collect()
    print(fname)
    # Load, drop and copy in one expression so the full frame is not kept alive.
    frame = pd.read_pickle(join(deprecated_dir, fname)).drop(redundant, axis=1).copy()
    frames.append(frame)
    gc.collect()

df = pd.concat(frames)
del frames
gc.collect()
gc.collect()

meta = load('meta')
# Both frames must have one row per doc_id before they are matched.
assert df.doc_id.count() == df.doc_id.nunique()
assert meta.doc_id.count() == meta.doc_id.nunique()
df = df.loc[df.doc_id.isin(meta.doc_id)]
del meta
gc.collect()

additional_file = join(ETL_PATH, 'dewiki_metadata_additional.pickle')
df.to_pickle(additional_file)

-----

#### new data

In [3]:
# Parse the extracted wiki dump files in `input_dir` into a list of records.
# Each document is delimited by a `<doc ...>` header line and a `</doc` line;
# the non-empty lines in between are collected as the document's text lines.
input_dir = join(DATA_BASE, 'processed_documents/docs/AA')
files = sorted(listdir(input_dir))
re_id = re.compile('id="(.*?)"')
re_url = re.compile('url="(.*?)"')  # not used below: the url attribute is not stored
re_title = re.compile('title="(.*?)"')

docs = []
for file in files:
    fpath = join(input_dir, file)
    with open(fpath, "r", encoding="utf-8") as fp:
        print('open', fpath)
        # `doc` is None while outside a <doc> element, otherwise the open record.
        doc = None
        # Stream the file line by line instead of loading it fully via readlines().
        for line in fp:
            line = line.strip()

            if doc is None:
                # Outside a document only a header line is of interest.
                if line.startswith('<doc'):
                    doc = {
                        'id': int(re_id.search(line).group(1)),
                        'title': re_title.search(line).group(1),
                        'text': [],
                    }

            elif line.startswith('</doc'):
                docs.append(doc)
                doc = None

            elif line:
                # Blank lines inside a document are skipped.
                doc['text'].append(line)

open ../data/processed_documents/docs/AA/wiki_00
open ../data/processed_documents/docs/AA/wiki_01
open ../data/processed_documents/docs/AA/wiki_02
open ../data/processed_documents/docs/AA/wiki_03
open ../data/processed_documents/docs/AA/wiki_04
open ../data/processed_documents/docs/AA/wiki_05
open ../data/processed_documents/docs/AA/wiki_06
open ../data/processed_documents/docs/AA/wiki_07
open ../data/processed_documents/docs/AA/wiki_08
open ../data/processed_documents/docs/AA/wiki_09
open ../data/processed_documents/docs/AA/wiki_10


In [5]:
# Turn the parsed records into a frame indexed by document id.
dfn = pd.DataFrame.from_records(docs).set_index('id')
# Join the collected lines into one string, leaving out the first line of
# every document.
dfn[TEXT] = ['\n'.join(lines[1:]) for lines in dfn.text]
print(len(dfn), dfn.index.dtype, '\n', dfn.dtypes)
dfn.head()

2215487 int64 
 text     object
title    object
dtype: object


Unnamed: 0_level_0,text,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Alan Smithee steht als Pseudonym für einen fik...,Alan Smithee
3,Actinium ist ein radioaktives chemisches Eleme...,Actinium
5,"Ang Lee (; * 23. Oktober 1954 in Chaozhou, Tai...",Ang Lee
7,Anschluss ist in der Soziologie ein Fachbegrif...,Anschluss (Soziologie)
10,Die Aussagenlogik ist ein Teilgebiet der Logik...,Aussagenlogik


-----

#### join data

In [6]:
# Attach the stored metadata to the freshly parsed texts (left join on the
# index of dfn). Columns of `df` that clash with dfn get the suffix '_old'.
dfx = (
    dfn.join(df, rsuffix='_old')
    # Drop the stored length and the title coming from dfn ...
    .drop(['length', 'title'], axis=1)
    # ... switch the index from the document id to 'hash_nlp' ...
    .reset_index()
    .set_index('hash_nlp')
    # ... and give the surviving columns their final names.
    .rename(columns={'title_old': 'title', 'id': 'doc_id'})
    .copy()
)

print(len(dfx), dfx.index.dtype, '\n', dfx.dtypes)
dfx.head()

2215487 int64 
 doc_id           int64
text            object
title           object
description     object
doc_subid      float64
dtype: object


Unnamed: 0_level_0,doc_id,text,title,description,doc_subid
hash_nlp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8952056961092092653,1,Alan Smithee steht als Pseudonym für einen fik...,Alan Smithee,,1.0
598046625986755870,3,Actinium ist ein radioaktives chemisches Eleme...,Actinium,,1.0
8442369265370766621,5,"Ang Lee (; * 23. Oktober 1954 in Chaozhou, Tai...",Ang Lee,,1.0
-5325279570187525080,7,Anschluss ist in der Soziologie ein Fachbegrif...,Anschluss,Soziologie,1.0
5107548614255273253,10,Die Aussagenlogik ist ein Teilgebiet der Logik...,Aussagenlogik,,1.0


In [7]:
# Free the intermediate objects before the joined frame is written to disk.
del docs
del df
del dfn
gc.collect()
output_file = join(ETL_PATH, 'dewiki.pickle')
dfx.to_pickle(output_file)

In [8]:
# Read the written file back to check that it round-trips.
df = pd.read_pickle(join(ETL_PATH, 'dewiki.pickle'))
print(len(df), df.index.dtype, '\n', df.dtypes)
# Show a bounded preview only: with display.max_rows raised to 2001, ending the
# cell with the bare frame could embed a very large table in the notebook.
df.head(10)

2215487 int64 
 doc_id           int64
text            object
title           object
description     object
doc_subid      float64
dtype: object


Unnamed: 0_level_0,doc_id,text,title,description,doc_subid
hash_nlp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8952056961092092653,1,Alan Smithee steht als Pseudonym für einen fik...,Alan Smithee,,1.0
598046625986755870,3,Actinium ist ein radioaktives chemisches Eleme...,Actinium,,1.0
8442369265370766621,5,"Ang Lee (; * 23. Oktober 1954 in Chaozhou, Tai...",Ang Lee,,1.0
-5325279570187525080,7,Anschluss ist in der Soziologie ein Fachbegrif...,Anschluss,Soziologie,1.0
5107548614255273253,10,Die Aussagenlogik ist ein Teilgebiet der Logik...,Aussagenlogik,,1.0
1590387679453058251,13,,Liste von Autoren/A,,1.0
-8061163938924218197,14,,Liste von Autoren/H,,1.0
-7105424327842951596,15,,Liste von Autoren/C,,1.0
-3622238202242836338,16,,Liste von Autoren/I,,1.0
-8615994372994386619,17,,Liste von Autoren/K,,1.0
