In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from collections import OrderedDict

from lxml import etree
from tqdm import tqdm

import re

# Line of ten underscores followed by a newline; read_book splits the text of a
# <nass> element on this pattern to separate the running text from the footnotes.
regex = '__________\n'


def read_book(fname):
    """Parse a book XML file into an ordered mapping of text records.

    The file is streamed with ``etree.iterparse``; every ``<book>`` element is
    treated as one record whose children ``<id>``, ``<nass>`` (the text),
    ``<page>`` and ``<part>`` are read.

    Parameters
    ----------
    fname : str
        Path of the XML file to read.

    Returns
    -------
    OrderedDict
        Maps the record id to a dict with the keys ``'text'``, ``'page'`` and
        ``'part'`` (page/part are ``None`` when the record does not provide
        them). If the text contains exactly one footnote separator (see the
        module-level ``regex``), the part after it is stored under
        ``'footnotes'`` and removed from ``'text'``.

    Notes
    -----
    Records without text or without an id are reported on stdout and skipped.
    All per-record values are reset for every ``<book>`` element, so an
    incomplete record never inherits values from the record before it.
    """
    result = OrderedDict()

    # Stream over the <book> elements instead of building the whole tree
    context = etree.iterparse(fname, events=('end', ), tag='book')
    for event, elem in tqdm(context):
        # Start every record from a clean slate; otherwise a missing child tag
        # would silently reuse the value of the previous record.
        t_id = raw_text = page = part = None
        has_nass = False

        for e in elem.iterchildren():
            if e.tag == 'id':
                t_id = e.text
            elif e.tag == 'nass':   # text
                has_nass = True
                raw_text = e.text
            elif e.tag == 'page':
                page = e.text
            elif e.tag == 'part':
                part = e.text

        # The text is processed only after all children were seen, so the id
        # used in the diagnostics below always belongs to this record.
        text, fn = raw_text, None
        if text is None:
            # sometimes the text is None (e.g., file 16, id 214)
            if has_nass:
                print('None text found in {} (id: {})'.format(fname, t_id))
        else:
            # &#xd; is automatically replaced by \r, but we would like to have \n
            text = text.replace('\r', '\n')

            # get the footnotes if they are there
            parts = re.split(regex, text)

            # we shouldn't have more than 2 parts; report it and keep the text unsplit
            if len(parts) > 2:
                print(t_id, len(parts))
                print(parts)

            if len(parts) == 2:
                text, fn = parts

        # ignore entries that don't have text
        if text is not None:
            if t_id is None:
                # Without an id the record cannot be keyed (nor matched to a title)
                print('Text without id found in {}; record skipped'.format(fname))
            else:
                entry = {'text': text, 'page': page, 'part': part}
                if fn is not None:
                    entry['footnotes'] = fn
                result[t_id] = entry

        # make iteration over context fast and consume less memory
        #https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    return result

# Try the parser on a single file and show how many text records it returned.
# NOTE(review): absolute, user-specific path; adjust before running elsewhere.
book = read_book('/home/jvdzwaan/Downloads/book.xml')
print(len(book))

In [None]:
def read_title(fname):
    """Parse a title XML file into a mapping of record id to title data.

    The file is streamed with ``etree.iterparse``; every ``<title>`` element is
    treated as one record whose children ``<id>``, ``<tit>`` (the title text)
    and ``<lvl>`` (the heading level) are read.

    Parameters
    ----------
    fname : str
        Path of the XML file to read.

    Returns
    -------
    tuple
        ``(result, levels)`` where ``result`` maps the record id to
        ``{'title': ..., 'level': ...}`` and ``levels`` is a dict keys view of
        the distinct ``<lvl>`` values encountered in the file.

    Notes
    -----
    Records that lack an id, a title or a level are reported on stdout and
    skipped. All per-record values are reset for every ``<title>`` element, so
    an incomplete record never inherits values from the record before it.
    """
    result = {}
    levels = {}   # used as a set of the level values that occur in the file

    # Stream over the <title> elements instead of building the whole tree
    context = etree.iterparse(fname, events=('end', ), tag='title')
    for event, elem in tqdm(context):
        # Start every record from a clean slate; otherwise a missing child tag
        # would silently reuse the value of the previous record.
        t_id = title = level = None

        for e in elem.iterchildren():
            if e.tag == 'id':
                t_id = e.text
            elif e.tag == 'tit':
                title = e.text
                # an empty <tit> element has no text at all
                if title is not None:
                    # &#xd; is automatically replaced by \r, but we would like to have \n
                    title = title.replace('\r', '\n')
            elif e.tag == 'lvl':
                level = e.text
                levels[level] = None

        if t_id is None or title is None or level is None:
            print('Incomplete title record skipped in {} (id: {})'.format(fname, t_id))
        else:
            result[t_id] = {'title': title, 'level': level}

        # make iteration over context fast and consume less memory
        #https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    print('levels found:', levels.keys())
    return result, levels.keys()

# Try the title parser on a single file and show how many titles it returned.
# NOTE(review): absolute, user-specific path; adjust before running elsewhere.
title, levels = read_title('/home/jvdzwaan/Downloads/title.xml')
print(len(title))

In [None]:
# combine book and title
# and write to file
import codecs

def _marker_number(raw, kind, out_file):
    """Return the first whitespace-separated token of ``raw`` as an int.

    Returns ``None`` when ``raw`` contains no token at all. When the token is
    not an integer, a notice naming ``kind`` and ``out_file`` is printed and 0
    is returned.
    """
    tokens = raw.split()
    if not tokens:
        return None
    try:
        return int(tokens[0])
    except ValueError:
        print('Invalid {} {} ({})'.format(kind, tokens[0], out_file))
        return 0


def combine_data(book, title, out_file):
    """Merge text records with their titles and write them to a UTF-8 text file.

    For every record in ``book`` (in iteration order) the stripped text is
    written, followed by a newline. When ``title`` has an entry for the record
    id, a heading line ``### <level times '|'> <title>`` is added: if the text
    already starts with the title, that leading title is turned into the
    heading, otherwise the heading is put in front of the text. After the text,
    a page marker ``V<part, 3 digits>P<page, 4 digits>`` is written on its own
    line when both part and page are available.

    Parameters
    ----------
    book : dict
        Maps record id to a dict with the keys ``'text'``, ``'page'`` and
        ``'part'`` (as returned by ``read_book``).
    title : dict
        Maps record id to a dict with the keys ``'title'`` and ``'level'``
        (as returned by ``read_title``).
    out_file : str
        Path of the file to write; it is overwritten if it exists.
    """
    with codecs.open(out_file, 'w', encoding='utf-8') as f:
        for b_id, data in book.items():
            text = data['text']
            if b_id in title:
                ti = title[b_id]['title']
                lvl = title[b_id]['level']
                heading = '### {} {}\n'.format(int(lvl)*'|', ti)
                if text.strip().startswith(ti):
                    # Only the leading occurrence is the heading; later
                    # mentions of the same words in the text stay untouched.
                    text = text.replace(ti, heading, 1)
                else:
                    text = heading + text
            f.write(text.strip())
            f.write('\n')

            # sometimes the page marker information is incomplete (e.g. for book 12)
            # in that case, we don't write the marker information
            if data['part'] is not None and data['page'] is not None:
                # out_file (not a notebook global) identifies the book in the notices
                part = _marker_number(data['part'], 'part', out_file)
                page = _marker_number(data['page'], 'page', out_file)

                # whitespace-only values count as missing information, too
                if part is not None and page is not None:
                    f.write('V{:03}P{:04}'.format(part, page))
                    f.write('\n')
# Try the writer on the single book/title pair loaded above; the result goes to
# test.txt (relative path).
combine_data(book, title, 'test.txt')

In [None]:
import os
import pandas as pd

# Convert the numbered book directories: each directory <in_dir>/<i> is expected
# to contain book.xml and title.xml; the combined text is written to <out_dir>/<i>.txt.
# NOTE(review): both directories are absolute, user-specific paths, and out_dir
# is not created here.
in_dir = '/home/jvdzwaan/Downloads/dawa_xml_2/New_xml_corpus/'
out_dir = '/home/jvdzwaan/data/adh-corpora/dawa/txt/'

# One dict per book with the title levels that occur in it
level_data = []

# Directory numbers 1 up to and including 216
for i in range(1, 217):
    # NOTE: out_name is also read as a global by the messages in combine_data
    out_name = '{}.txt'.format(i)
    
    d = os.path.join(in_dir, str(i))
    book_file = os.path.join(d, 'book.xml')
    title_file = os.path.join(d, 'title.xml')
    print(book_file)
    print(title_file)
    # These assignments overwrite the notebook-level book/title/levels of earlier cells
    book = read_book(book_file)
    title, levels = read_title(title_file)
    
    level_data.append({'book': i, 'levels': list(levels)})
    
    out_file = os.path.join(out_dir, out_name)
    print(out_file)
    combine_data(book, title, out_file)
    print('---')
    
# Table with one row per book, indexed by book number; note that the name
# `levels` is reused here for the DataFrame.
levels = pd.DataFrame(level_data)
levels = levels.set_index('book')
levels.head()

In [None]:
# Save the per-book level overview as UTF-8 CSV (relative path)
levels.to_csv('dawa_levels.csv', encoding='utf-8')

In [None]:
# get the footnotes

import re

# Exploratory check of the footnote separator on the texts in `book`.
# NOTE(review): when run after the batch cell, `book` is the last book loaded
# there, and read_book has already split off the footnotes of texts that
# contain exactly one separator.
regex = '__________\n'

for b_id, data in book.items():
    text = data['text']
    
    # m is only used by the commented-out print below
    m = re.search(regex, text)
    #if m:
    #    print(b_id)
        
    # Report texts that contain more than one separator
    parts = re.split(regex, text)
    if len(parts) > 2:
        print(b_id, len(parts))
    
    # t and fn are not used any further in this cell
    if len(parts) == 2:
        t, fn = parts

