In [42]:
import lxml.etree as ET
import hashlib
from templates import templates

In [47]:
class TeiReader():

    """Read and process a TEI/XML document.

    The document is parsed once at construction time; the other methods
    operate on the in-memory tree stored in ``self.tree``.

    Attributes:
        ns_tei: prefix map for the TEI namespace, used in XPath queries.
        ns_xml: prefix map for the XML namespace.
        file: the ``file`` argument as passed to the constructor.
        tree: the parsed tree, or ``None`` if parsing failed.
        parsed_file: the serialized tree (UTF-8 bytes), or the string
            ``"parsing didn't work"`` if parsing/serializing failed.
    """

    def __init__(self, file):
        """Parse ``file`` (anything ``ET.parse`` accepts).

        Parsing is deliberately best-effort: a failure does not raise here.
        Instead ``self.tree`` is set to ``None`` and ``self.parsed_file`` to
        ``"parsing didn't work"``, so callers can detect the failure.
        """
        self.ns_tei = {'tei': "http://www.tei-c.org/ns/1.0"}
        self.ns_xml = {'xml': "http://www.w3.org/XML/1998/namespace"}
        self.file = file
        try:
            self.tree = ET.parse(file)
        except Exception:
            # Keep the constructor non-raising, but always define the
            # attribute so later access does not fail on a missing name.
            self.tree = None
        self.parsed_file = "parsing didn't work"
        if self.tree is not None:
            try:
                self.parsed_file = ET.tostring(self.tree, encoding="utf-8")
            except Exception:
                # Leave the failure marker assigned above in place.
                pass

    @staticmethod
    def _write_xml(node, export_file):
        """Write ``node`` as pretty-printed UTF-8 XML to ``<export_file>.xml``."""
        out_path = "{}.xml".format(export_file)
        with open(out_path, 'wb') as f:
            f.write(ET.tostring(node, pretty_print=True, encoding="UTF-8"))

    def create_place(self, xml_id="something", text="someplace"):
        """Create a ``place`` element with an ``@xml:id`` and a ``placeName`` child.

        Args:
            xml_id: value for the ``xml:id`` attribute.
            text: text content of the ``placeName`` child.

        Returns:
            The new ``place`` element.

        NOTE(review): both elements are created without a namespace; confirm
        that this is what consumers of the generated index expect.
        """
        place = ET.Element("place")
        place.attrib['{http://www.w3.org/XML/1998/namespace}id'] = xml_id
        placeName = ET.Element("placeName")
        placeName.text = text
        place.append(placeName)
        return place

    def get_places_elements(self, ids):
        """Turn ``add_ids``-style entries into a list of unique ``place`` elements.

        Args:
            ids: iterable of mappings with at least the keys ``'text'`` and
                ``'ref'``.

        Returns:
            A list with one ``place`` element per distinct ``(text, ref)``
            pair, in order of first occurrence (so repeated runs over the
            same input produce the same output order).
        """
        seen = set()
        place_elements = []
        for entry in ids:
            ref = entry['ref']
            # Drop the leading '#' of a local pointer; leave any other
            # reference untouched instead of chopping its first character.
            if ref.startswith('#'):
                ref = ref[1:]
            key = (entry['text'], ref)
            if key in seen:
                continue
            seen.add(key)
            place_elements.append(self.create_place(ref, entry['text']))
        return place_elements

    def find_elements(self, tei_element='placeName'):
        """Find all ``tei:<tei_element>`` nodes below ``tei:text``.

        Args:
            tei_element: local name of the TEI element to search for.

        Returns:
            A dict with the keys
            ``'tei_element'`` (the searched element name),
            ``'hits'`` (list of matching element objects) and
            ``'nr_of_hits'`` (length of that list).

        Raises:
            AttributeError: if the document could not be parsed
                (``self.tree`` is ``None``).
        """
        result = {'tei_element': tei_element}
        result['hits'] = self.tree.xpath('//tei:text//tei:{}'.format(tei_element), namespaces=self.ns_tei)
        result['nr_of_hits'] = len(result['hits'])
        return result

    def add_ids(self, tei_element='placeName', id_prefix='some', export=True, export_file="updated"):
        """Ensure every matching element carries a ``@ref`` attribute.

        Elements that already have a ``@ref`` keep it. Elements without one
        get ``"#<id_prefix>_<md5 of the element text>"``. Elements without a
        text node are skipped. The tree in ``self.tree`` is modified in place.

        Args:
            tei_element: local name of the TEI element to process.
            id_prefix: prefix used for generated references.
            export: if true, write the updated tree to ``<export_file>.xml``.
            export_file: output path without the ``.xml`` extension.

        Returns:
            A tuple ``(ids, tree)`` where ``ids`` is a list of dicts with the
            keys ``'text'``, ``'ref'`` and ``'node'``, and ``tree`` is
            ``self.tree``.
        """
        hits = self.find_elements(tei_element)['hits']
        ids = []
        for node in hits:
            if node.text is None:
                # Nothing to hash or index; skip this hit but keep going.
                continue
            # .get() because most elements have no @ref yet; indexing the
            # attribute mapping directly would raise KeyError for them.
            if node.attrib.get('ref') is None:
                digest = hashlib.md5(node.text.encode('utf-8')).hexdigest()
                node.attrib['ref'] = "#{}_{}".format(id_prefix, digest)
            ids.append({'text': node.text, 'ref': node.attrib['ref'], 'node': node})
        if export:
            self._write_xml(self.tree, export_file)
        return ids, self.tree

    def create_place_index(self, tei_element='placeName', id_prefix='place', export=True, export_file='index'):
        """Build a TEI index document listing the places found in the tree.

        Calls ``add_ids`` (without exporting), so ``self.tree`` gains
        ``@ref`` attributes as a side effect. The resulting ``place``
        elements are wrapped in a ``listPlace`` and appended to the first
        ``tei:body`` of ``templates.tei_document``.

        Args:
            tei_element: local name of the TEI element to index.
            id_prefix: prefix used for generated references.
            export: if true, write the index to ``<export_file>.xml``.
            export_file: output path without the ``.xml`` extension.

        Returns:
            The root element of the new index document.
        """
        nodes = self.add_ids(tei_element, id_prefix, export=False)[0]
        list_place = ET.Element("listPlace")
        for place in self.get_places_elements(nodes):
            list_place.append(place)
        new_doc = ET.fromstring(templates.tei_document)
        body = new_doc.xpath('//tei:body', namespaces=self.ns_tei)[0]
        body.append(list_place)
        if export:
            self._write_xml(new_doc, export_file)

        return new_doc

In [48]:
# Load the TEI document to be indexed. TeiReader.__init__ swallows parse
# errors, so a wrong path does not raise here; the failure only shows up
# when the tree is used by a later call.
test = TeiReader('data/output/new.xml')

In [52]:
# Build the listPlace index from the placeName elements (defaults) and write
# it to 'data/output/new1listplace.xml' (export defaults to True).
# Side effect: create_place_index calls add_ids, which adds @ref attributes
# to the elements in test.tree.
hits = test.create_place_index(export_file='data/output/new1listplace')

In [51]:
# NOTE(review): the recorded output below is a tuple whose first item is a
# list of {'node', 'ref', 'text'} dicts with '#some_' refs, which matches the
# return shape of add_ids() rather than create_place_index(). It appears to
# come from an earlier, out-of-order run; restart the kernel and re-run all
# cells to refresh it.
hits

([{'node': <Element {http://www.tei-c.org/ns/1.0}placeName at 0x1f1f9a2cbc8>,
   'ref': '#some_36cb84b200c47d2c7a7e8dbbe3c1a595',
   'text': 'Wiednerth.'},
  {'node': <Element {http://www.tei-c.org/ns/1.0}placeName at 0x1f1f9a05cc8>,
   'ref': '#some_e9a0bb1cfe770648982dfc20ff12ab11',
   'text': 'Matzleinsdorferth.'},
  {'node': <Element {http://www.tei-c.org/ns/1.0}placeName at 0x1f1f9a05908>,
   'ref': '#some_273c6fe7cd0300c46e98e05cc3414e33',
   'text': 'Quaipark'},
  {'node': <Element {http://www.tei-c.org/ns/1.0}placeName at 0x1f1f9a05588>,
   'ref': '#some_d0a6343a081a7335baa965ce4fd0845c',
   'text': 'München'},
  {'node': <Element {http://www.tei-c.org/ns/1.0}placeName at 0x1f1f9a05ac8>,
   'ref': '#some_b69137a4d7f1bc5b9fa27e655151af34',
   'text': 'P.'},
  {'node': <Element {http://www.tei-c.org/ns/1.0}placeName at 0x1f1f9a069c8>,
   'ref': '#some_08222c6cb47514f7f5957bb681b5469d',
   'text': 'R.park'},
  {'node': <Element {http://www.tei-c.org/ns/1.0}placeName at 0x1f1f9a062