In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from lxml import etree
from tqdm import tqdm


def _iter_elements(fname, tag):
    """Yield each completed ``tag`` element of ``fname`` and free it afterwards.

    The file is parsed incrementally with ``etree.iterparse``. The yielded
    element is only valid until the next element is requested: after the
    consumer is done with it, the element is cleared and its already
    processed preceding siblings are removed from the tree.

    Args:
        fname: XML input passed straight to ``etree.iterparse``.
        tag: Tag name (or tuple of tag names) used as the iterparse filter.

    Yields:
        The lxml element for every ``end`` event matching ``tag``.
    """
    for _event, elem in etree.iterparse(fname, events=('end', ), tag=tag):
        yield elem

        # make iteration over context fast and consume less memory
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            parent = elem.getparent()
            if parent is None:
                # A root element can have preceding siblings (top-level
                # comments / processing instructions) but has no parent to
                # delete them from.
                break
            del parent[0]


def analyzer_xml2words_and_headers(fname):
    """Extract words, header references and metadata from an analyzer XML file.

    The file is read three times (once per element type) so that each pass
    can discard elements as soon as they have been handled.

    Args:
        fname: XML input passed to ``etree.iterparse``.

    Returns:
        A tuple ``(words, headers, metadata)`` where

        * ``words`` maps ``int(w_id)`` to the serialized ``<word>`` element
          (UTF-8 bytes), in document order;
        * ``headers`` maps the integer ``level`` attribute of each
          ``<header>`` element to the list of integer ``id`` attributes of
          its ``<ref>`` children;
        * ``metadata`` is the serialized ``<metadata>`` element (UTF-8
          bytes). If there are several, the last one wins; if there is none,
          an empty ``<metadata></metadata>`` element is returned.

    Raises:
        KeyError: if a ``<word>`` lacks ``w_id``, a ``<header>`` lacks
            ``level`` or a ``<ref>`` lacks ``id``.
        ValueError: if one of those attributes is not an integer.
    """
    words = {}
    headers = {}
    metadata = b'<metadata></metadata>'

    # Extract the words
    for elem in tqdm(_iter_elements(fname, 'word')):
        # Setting method to html (instead of xml) fixes problems
        # with writing Arabic characters in the value attribute of
        # the word element.
        words[int(elem.attrib['w_id'])] = etree.tostring(
            elem, encoding='utf-8', method='html')

    # Extract the headers
    for elem in tqdm(_iter_elements(fname, 'header')):
        # A level is registered even when the header has no <ref> children.
        ref_ids = headers.setdefault(int(elem.attrib['level']), [])
        ref_ids.extend(
            int(child.attrib['id']) for child in elem if child.tag == 'ref')

    # Extract the metadata
    for elem in tqdm(_iter_elements(fname, 'metadata')):
        metadata = etree.tostring(elem, encoding='utf-8')

    return words, headers, metadata

In [None]:
# Path of the XML file to process; later cells also derive the output
# document name from it.
xml_file = '/home/jvdzwaan/data/tmp/adh/chapters/0381IbnBabawayh.Hidaya.xml'

# Parse the file into serialized words keyed by word id, header reference ids
# keyed by header level, and the serialized metadata element.
words, headers, metadata = analyzer_xml2words_and_headers(xml_file)

In [None]:
# Number of extracted words and number of distinct header levels.
print(len(words), len(headers))

In [None]:
# Header levels that occur in the document.
headers.keys()

In [None]:
# Word ids, in the order in which the words were read.
words.keys()

In [None]:
# Spot check: is id 4750 referenced by a level-2 header?
4750 in headers[2]

In [None]:
import os
import codecs
import shutil

from nlppln.utils import out_file_name

def write_xml(xml_out, metadata, words, analysis_tag='morphology_analysis'):
    """Write a ``<document>`` XML file holding metadata and a run of words.

    The output consists of the XML declaration, an opening ``<document>``
    tag, ``metadata`` followed by a newline, an ``analysis_tag`` element
    whose ``total_words`` attribute is ``len(words)`` and whose content is
    the concatenation of ``words``, and the closing ``</document>`` tag.

    Args:
        xml_out: Path of the file to create (overwritten if it exists).
        metadata: Serialized metadata element, as bytes; written verbatim.
        words: Sized collection of serialized word elements, each as bytes;
            written verbatim, without separators.
        analysis_tag: Name of the element that wraps the words.
    """
    total_words = len(words)
    opening_tag = '<{} total_words="{}">\n'.format(analysis_tag, total_words)
    closing_tag = '</{}>\n'.format(analysis_tag)

    # All content is bytes already, so a plain binary file is sufficient.
    with open(xml_out, 'wb') as f:
        f.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
        f.write(b'<document>\n')

        f.write(metadata)
        f.write(b'\n')

        f.write(opening_tag.encode('utf-8'))
        f.writelines(words)
        f.write(closing_tag.encode('utf-8'))

        f.write(b'</document>\n')

# Split the parsed document into one output file per header-delimited chunk.
# Relies on `xml_file`, `words`, `headers` and `metadata` from the parse cell.
text = []        # serialized words of the chunk currently being collected
header = False   # True while the previous word belonged to a header
i = 0            # sequence number of the next output file

out_dir = '/home/jvdzwaan/data/tmp/adh/chapter-files/'
doc_name = os.path.splitext(os.path.basename(xml_file))[0]
header_ids = None

if 1 not in headers and 2 not in headers:
    # no header information, just copy the input file
    print('No headers in', doc_name)
    fo = out_file_name(out_dir, xml_file)
    print(fo)
    if os.path.abspath(xml_file) != fo:
        shutil.copy2(xml_file, fo)
elif 2 not in headers:
    # only header 1
    print('Only header 1 in', doc_name)
    header_ids = headers[1]
elif 1 not in headers:
    # only header 2
    print('Only header 2 in', doc_name)
    header_ids = headers[2]
else:
    # header 1 and header 2
    print('Header 1 and 2 in', doc_name)
    header_ids = headers[1] + headers[2]

if header_ids is not None:
    # Use the ids selected above (not headers[2], which does not exist when
    # only level-1 headers are present); a set keeps the per-word membership
    # test cheap.
    header_id_set = set(header_ids)

    for wid, word in words.items():
        in_header = wid in header_id_set
        if in_header and not header and text:
            # start of new header
            # write text to file
            fname = '{}-{:05}.xml'.format(doc_name, i)
            fname = out_file_name(out_dir, fname)
            write_xml(fname, metadata, text)
            text = []
            i += 1
        header = in_header
        text.append(word)

    if text:
        # The chunk that follows the last header has no subsequent header to
        # trigger a write, so flush it explicitly.
        fname = '{}-{:05}.xml'.format(doc_name, i)
        fname = out_file_name(out_dir, fname)
        write_xml(fname, metadata, text)
        text = []
        i += 1
    
#xml_out = out_file_name(out_dir, in_file)


* Geen headers: gewoon hele file outputten (0182AbuYusufYacqub.Kharaj.xml)
* Alleen header 1: file voor elke header (0373AbuLaythSamarqandi.CuyunMasail.xml)
* Alleen header 2: file voor elke header (0381IbnBabawayh.Hidaya.xml)
  * Er zou tekst vóór de eerste header kunnen staan; wordt die goed meegenomen?
* Header 1 en header 2: (0897IbnYusufCabdariGharnati.TajWaIklilLiMukhtasarKhalil.xml)
  * Als er tekst is tussen het einde van header 1 en het begin van header 2, moet die in een aparte file worden opgeslagen, anders komt de header 1 tekst bij het volgende chapter
  * Als er tekst staat voordat header 1 begint: aparte file
  