In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed, so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from lxml import etree
from tqdm import tqdm


def _release_element(elem):
    """Free an element that iterparse is done with, plus its earlier siblings.

    Keeps iteration over a large file fast and memory use low, see
    https://www.ibm.com/developerworks/xml/library/x-hiperfparse
    """
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]


def analyzer_xml2words_and_headers(fname):
    """Extract words, headers and metadata from an analyzer XML file.

    The file is streamed three times with ``etree.iterparse`` (once per
    element type), so the complete tree is never held in memory.

    Args:
        fname: path of the XML file to read.

    Returns:
        A tuple ``(words, headers, metadata)``:

        * ``words``: dict mapping the integer ``w_id`` attribute of each
          ``word`` element to that element serialized as UTF-8 bytes.
        * ``headers``: dict mapping the integer ``level`` attribute of each
          ``header`` element to a dict ``{int(ref id): header text}``, with
          one entry for every ``ref`` child of the header.
        * ``metadata``: the ``metadata`` element serialized as UTF-8 bytes
          (the last one if there are several), or an empty
          ``<metadata></metadata>`` element if the file has none.

    Raises:
        KeyError: if a ``word`` lacks ``w_id``, a ``header`` lacks ``level``
            or ``text``, or a ``ref`` lacks ``id``.
    """
    words = {}
    headers = {}
    metadata = b'<metadata></metadata>'
    nr_headers = 0

    # Extract the words
    for _, elem in tqdm(etree.iterparse(fname, events=('end', ), tag='word')):
        # Setting method to html (instead of xml) fixes problems
        # with writing Arabic characters in the value attribute of
        # the word element.
        words[int(elem.attrib['w_id'])] = etree.tostring(
            elem, encoding='utf-8', method='html')
        _release_element(elem)

    # Extract the headers
    for _, elem in tqdm(etree.iterparse(fname, events=('end', ), tag='header')):
        nr_headers += 1
        level_headers = headers.setdefault(int(elem.attrib['level']), {})
        header_title = elem.attrib['text']
        # Iterate over the element directly; getchildren() is deprecated.
        for ref in elem:
            if ref.tag == 'ref':
                level_headers[int(ref.attrib['id'])] = header_title
        _release_element(elem)

    # Extract the metadata
    for _, elem in tqdm(etree.iterparse(fname, events=('end', ), tag='metadata')):
        metadata = etree.tostring(elem, encoding='utf-8')
        _release_element(elem)

    print('nr of headers: {}'.format(nr_headers))
    return words, headers, metadata

In [None]:
# Book to split into chapters. The paths are machine specific; the
# commented-out line is an alternative input file.
#xml_file = '/home/jvdzwaan/data/tmp/adh/chapters/0381IbnBabawayh.Hidaya.xml'
xml_file = '/home/dafne/bridging-the-gap/data/20181009/books-stemmed/0897IbnYusufCabdariGharnati.TajWaIklilLiMukhtasarKhalil.xml'
# Stream the file and collect its words, headers (per level) and metadata.
words, headers, metadata = analyzer_xml2words_and_headers(xml_file)

In [None]:
# Number of words and number of distinct header levels found in the book.
print(len(words), len(headers))

In [None]:
# Header levels that occur in this book.
headers.keys()

In [None]:
# Number of ref ids that belong to a level 2 header. Note that this counts
# ref ids, not headers: a header with several <ref> children has several
# entries. Raises KeyError if the book has no level 2 headers.
len(headers[2])

In [None]:
# Title of the level 2 header that contains ref id 426.
# NOTE(review): 426 is presumably specific to the book loaded above - confirm.
headers[2][426]

In [None]:
# Combine the level 1 and level 2 headers into a NEW dict (level 2 wins when
# both levels contain the same id). Calling update() on headers[1] instead
# would silently add the level 2 ids to the level 1 headers, and every cell
# that uses `headers` afterwards would then see level 2 headers as level 1.
merged_headers = {**headers.get(1, {}), **headers.get(2, {})}
len(merged_headers)

In [None]:
# Scratch cell: `h` is an alias for (not a copy of) the level 2 header dict.
# Raises KeyError if the book has no level 2 headers.
h = headers[2]
# update() without arguments leaves the dict unchanged.
h.update()

In [None]:
# Experiment: parse the metadata, attach an extra <meta> child that carries
# a title, and show the serialized result.
metadata_elem = etree.fromstring(metadata)
lev1_el = etree.SubElement(metadata_elem, 'meta', attrib={'name': 'VolumeTitle'})
lev1_el.text = 'abc'
etree.tostring(metadata_elem, pretty_print=True, encoding='utf-8')

In [None]:
# To do: add chapter name / number to metadata
import os
import codecs
import shutil

from nlppln.utils import out_file_name

def write_xml(xml_out, metadata, words, analysis_tag = 'morphology_analysis', lev1_title='', lev2_title=''):
    """Write a list of serialized words to ``xml_out`` as one XML document.

    The document consists of the metadata element, extended with a
    ``VolumeTitle`` and a ``ChapterTitle`` ``<meta>`` entry, followed by an
    ``analysis_tag`` element that contains the words.

    Args:
        xml_out: path of the file to write (overwritten if it exists).
        metadata: serialized ``<metadata>`` element (bytes or str).
        words: list of UTF-8 encoded bytes, one serialized element per word;
            written in the given order without modification.
        analysis_tag: name of the element that wraps the words.
        lev1_title: text of the ``VolumeTitle`` entry; ``'-'`` if empty.
        lev2_title: text of the ``ChapterTitle`` entry; ``'-'`` if empty.
    """
    # Build the metadata before opening the output file, so that invalid
    # metadata does not leave a truncated file behind.
    metadata_elem = etree.fromstring(metadata)
    for meta_name, title in (('VolumeTitle', lev1_title),
                             ('ChapterTitle', lev2_title)):
        # Set the title through .text so that characters such as '&' and '<'
        # are escaped on output. Pasting the title into an XML string and
        # parsing that fails on (or is altered by) such characters.
        meta_elem = etree.SubElement(metadata_elem, 'meta', name=meta_name)
        meta_elem.text = title or '-'

    with open(xml_out, 'wb') as f:
        f.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
        f.write(b'<document>\n')

        f.write(etree.tostring(metadata_elem, encoding='utf-8', pretty_print=True))
        f.write(b'\n')

        opening_tag = '<{} total_words="{}">\n'.format(analysis_tag, len(words))
        f.write(opening_tag.encode('utf-8'))
        f.writelines(words)
        f.write('</{}>\n'.format(analysis_tag).encode('utf-8'))

        f.write(b'</document>\n')

In [None]:
def get_out_file_name(doc_name, out_dir, i):
    """Return the output file name for part number ``i`` of a document.

    The name ``<doc_name>-<i>.xml`` (``i`` zero-padded to 5 digits) is
    handed to ``out_file_name`` together with ``out_dir``; whatever that
    helper returns is the result.
    """
    part_name = '{}-{:05}.xml'.format(doc_name, i)
    return out_file_name(out_dir, part_name)

In [None]:
#out_dir = '/home/jvdzwaan/data/tmp/adh/chapter-files/'
out_dir = '/home/dafne/bridging-the-gap/data/20181009/chapters/'
doc_name = os.path.splitext(os.path.basename(xml_file))[0]

# Only level 1 and level 2 headers start a new output file; deeper levels
# are ignored. Both dicts map the ids of the words inside a header to the
# title of that header.
level1_headers = headers.get(1, {})
level2_headers = headers.get(2, {})
header_ids = list(level1_headers.keys()) + list(level2_headers.keys())

if header_ids:
    print('Available headers: {}'.format(list(headers.keys())))
    text = []            # words collected for the file that is being built
    header1 = False      # was the previous word part of a level 1 header?
    header2 = False      # was the previous word part of a level 2 header?
    i = 0                # sequence number of the next output file
    header1_name = ''
    header2_name = ''
    for wid, word in words.items():
        in_header1 = wid in level1_headers
        in_header2 = wid in level2_headers
        # A header starts at the first word that belongs to it.
        starts_header1 = in_header1 and not header1
        starts_header2 = in_header2 and not header2

        # A starting header closes the current file. The words collected so
        # far belong to the titles that were active until now, so write them
        # before the titles are updated.
        if (starts_header1 or starts_header2) and text:
            fname = get_out_file_name(doc_name, out_dir, i)
            write_xml(fname, metadata, text, lev1_title=header1_name, lev2_title=header2_name)
            text = []
            i += 1

        # Update the titles even when nothing had to be written; otherwise a
        # book that opens with a header (or a level 1 and a level 2 header
        # starting on the same word) would lose that title.
        if starts_header1:
            header1_name = level1_headers[wid]
            # The previous level 2 title does not apply to the new level 1
            # section.
            header2_name = ''
        if starts_header2:
            header2_name = level2_headers[wid]

        header1 = in_header1
        header2 = in_header2
        text.append(word)

    # Also write away the last chapter
    if text:
        fname = get_out_file_name(doc_name, out_dir, i)
        write_xml(fname, metadata, text, lev1_title=header1_name, lev2_title=header2_name)
else:
    # no header information, just copy the input file
    print('No headers in', doc_name)
    fo = out_file_name(out_dir, xml_file)
    # Compare absolute paths on both sides so that a file is never copied
    # onto itself.
    if os.path.abspath(xml_file) != os.path.abspath(fo):
        shutil.copy2(xml_file, fo)

* Geen headers: gewoon hele file outputten (0182AbuYusufYacqub.Kharaj.xml)
* Alleen header 1: file voor elke header (0373AbuLaythSamarqandi.CuyunMasail.xml)
* Alleen header 2: file voor elke header (0381IbnBabawayh.Hidaya.xml)
  * Er zou tekst vóór de eerste header kunnen staan; wordt die goed meegenomen?
* Header 1 en header 2: (0897IbnYusufCabdariGharnati.TajWaIklilLiMukhtasarKhalil.xml)
  * Als er tekst is tussen het einde van header 1 en het begin van header 2, moet die in een aparte file worden opgeslagen, anders komt de header 1 tekst bij het volgende chapter
  * Als er tekst is voordat header 1 begint: aparte file
* Level 3 of dieper: negeren we