In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from collections import OrderedDict

from lxml import etree
from tqdm import tqdm


def read_book(fname):
    """Collect every <book> record of an XML export into an ordered mapping.

    The file is parsed incrementally with ``etree.iterparse``; each record is
    released as soon as it has been read to keep memory use down.

    Args:
        fname: source handed straight to ``etree.iterparse``.

    Returns:
        OrderedDict keyed by the text of each record's <id> child, in the
        order the parser yields the records. Every value is a dict with the
        keys 'text' (content of <nass>, carriage returns turned into
        newlines), 'page' and 'part'. A child that is absent from a record
        yields None; an empty <nass> yields ''. Records without an <id> are
        skipped, because there is no key to store them under.
    """
    result = OrderedDict()

    context = etree.iterparse(fname, events=('end', ), tag='book')
    for _event, elem in tqdm(context):
        # Reset per record: otherwise a record that lacks a child would
        # silently inherit the value of the record parsed before it.
        t_id = text = page = part = None

        for child in elem.iterchildren():
            tag = child.tag
            if tag == 'id':
                t_id = child.text
            elif tag == 'nass':   # text
                # .text is None for an empty element.
                # &#xd; is automatically replaced by \r, but we would like to have \n
                text = (child.text or '').replace('\r', '\n')
            elif tag == 'page':
                page = child.text
            elif tag == 'part':
                part = child.text

        if t_id is not None:
            result[t_id] = {'text': text, 'page': page, 'part': part}

        # make iteration over context fast and consume less memory
        #https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    return result

# Parse the local book export and report how many records were read.
book = read_book(fname='/home/jvdzwaan/Downloads/book.xml')
print(len(book))

In [None]:
def read_title(fname):
    """Collect every <title> record of an XML export into a dict.

    The file is parsed incrementally with ``etree.iterparse``; each record is
    released as soon as it has been read to keep memory use down. The
    distinct <lvl> values encountered are printed once parsing is done.

    Args:
        fname: source handed straight to ``etree.iterparse``.

    Returns:
        dict keyed by the text of each record's <id> child. Every value is a
        dict with the keys 'title' (content of <tit>, carriage returns turned
        into newlines) and 'level' (content of <lvl>). A child that is absent
        from a record yields None; an empty <tit> yields ''. Records without
        an <id> are skipped, because there is no key to store them under.
    """
    result = {}
    levels = {}   # used as an insertion-ordered set of the levels seen

    context = etree.iterparse(fname, events=('end', ), tag='title')
    for _event, elem in tqdm(context):
        # Reset per record: otherwise a record that lacks a child would
        # silently inherit the value of the record parsed before it.
        t_id = title = level = None

        for child in elem.iterchildren():
            tag = child.tag
            if tag == 'id':
                t_id = child.text
            elif tag == 'tit':
                # .text is None for an empty element.
                # &#xd; is automatically replaced by \r, but we would like to have \n
                title = (child.text or '').replace('\r', '\n')
            elif tag == 'lvl':
                level = child.text
                levels[level] = None

        if t_id is not None:
            result[t_id] = {'title': title, 'level': level}

        # make iteration over context fast and consume less memory
        #https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    print('levels found:', levels.keys())
    return result

# Parse the local title export and report how many records were read.
title = read_title(fname='/home/jvdzwaan/Downloads/title.xml')
print(len(title))

In [None]:
# Combine book and title: when a book record's id also occurs in `title`, mark
# the heading in its text with a '### |...' header line; then print the text,
# its page marker and a '---' separator.

for b_id, data in book.items():
    text = data['text']
    if b_id in title:
        ti = title[b_id]['title']
        lvl = title[b_id]['level']
        # Header line: '###', one '|' per level, then the title itself.
        header = '### {} {}\n'.format(int(lvl)*'|', ti)
        if text.strip().startswith(ti):
            # The text already opens with the title, so turn that leading
            # occurrence into the header. count=1 keeps later repetitions of
            # the same words in the body untouched.
            text = text.replace(ti, header, 1)
        else:
            text = header + text
    print(text)
    # Part zero-padded to 3 digits, page to 4, e.g. V001P0023.
    page_marker = 'V{:03}P{:04}'.format(int(data['part']), int(data['page']))
    print(page_marker)
    print('---')