# Simple access and processing of EpiDoc TEI

http://papyri.info/ddbdp/bgu;1;133

XML of Edition: http://papyri.info/ddbdp/bgu;1;133/source

In [1]:
"""This code is meant as a simple demo of grabbing TEI-encoded XML and doing trivial things with it."""

import sys
from lxml import etree
import urllib.request

def xml_nodes_as_text_list(elements):
    """Convert a sequence of XPath results into a list of plain-text values.

    Handles both kinds of result this file produces: element nodes and
    attribute values (which arrive as strings).

    Args:
        elements: Iterable of XPath results (elements and/or strings).

    Returns:
        A list with one entry per input item, in input order:

        * strings are passed through unchanged;
        * elements are serialised with ``method="text"``, i.e. their text
          content with markup removed. The serialiser's default also emits
          the text that follows the element's closing tag (its "tail").
          Elements that serialise to an empty string are skipped;
        * anything else the serialiser rejects with ``TypeError`` is
          appended unchanged.
    """
    els_list = []
    for e in elements:
        if isinstance(e, str):
            # Attribute values are already text; no serialisation needed.
            els_list.append(e)
            continue
        try:
            # tostring(..., encoding="unicode") is the supported replacement
            # for the deprecated etree.tounicode().
            e_as_text = etree.tostring(e, encoding="unicode", method="text")
        except TypeError:
            # Not serialisable (e.g. a non-string XPath result): keep as is.
            els_list.append(e)
        else:
            if e_as_text:
                els_list.append(e_as_text)

    return els_list

def get_tei(url):
    """Download the XML document at *url* and return its parsed root element.

    The response body is handed to lxml as raw bytes. lxml refuses to parse a
    ``str`` that carries an XML encoding declaration, but it accepts bytes and
    applies the declaration itself. That removes the need to decode the
    payload and strip the literal ``encoding="UTF-8"`` by hand, which only
    worked for that exact spelling of the declaration.

    Args:
        url: Address of the XML source to fetch.

    Returns:
        The root element produced by ``etree.XML``.

    Errors raised by ``urlopen`` or by the XML parser are not caught here and
    propagate to the caller.
    """
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url) as response:
        tei_as_bytes = response.read()
    return etree.XML(tei_as_bytes)

# Prefix map shared by every XPath query in this cell.
namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
tei = get_tei('http://papyri.info/ddbdp/bgu;1;133/source')

# Part 1: the @value attributes of the <num> elements, and their sum.
print("1. The value atrributes of the num elements:")
elements = tei.xpath("//tei:num/@value", namespaces=namespaces)
nums = xml_nodes_as_text_list(elements)
print(nums)

total = sum(int(n) for n in nums)
print(f"Total: {total}")

# Part 1 (again) and 2: the text of the <num> elements, raw then stripped.
print("\n1. Pretty raw list of the original text of num elements:")
elements = tei.xpath("//tei:num", namespaces=namespaces)
text_list = xml_nodes_as_text_list(elements)
print(text_list)

print("\n2. Original text of num elements with whitespace cleaned up:")
for text in text_list:
    print(text.strip())

# Part 3: the text of every <ab> element, with markup removed.
print("\n3. The document with no xml markup:")
elements = tei.xpath("//tei:ab", namespaces=namespaces)
text_list = xml_nodes_as_text_list(elements)
for text in text_list:
    print(text.strip())


1. The value atrributes of the num elements:
['4', '7', '104', '6', '10', '14', '100', '8']
Total: 253

1. Pretty raw list of the original text of num elements:
['δ \n\n    ', 'ζ ', 'ρδ ', 'ϛ ἄρνας ', 'ι, ἐξ ὧν ', ', \n\n    ', ' \n\n    ', 'η ']

2. Original text of num elements with whitespace cleaned up:
δ
ζ
ρδ
ϛ ἄρνας
ι, ἐξ ὧν
,

η

3. The document with no xml markup:
δ 

     Αἰλίωι Σαραπίωνι στρατηγῷ 

    Ἀρσινοΐτου Ἡρακλείδου μερίδος 

    παρὰ Σουχᾶ τοῦ Σουχᾶ 

    τοῦ Διοδώρου ἀπὸ ἀμφόδου 

    Ἑλληνίου. ἃ ἀπεγραψάμην 

    τῷ διεληλυθότι ζ ἔτει πρόβατα 

    ρδ αἶγας ϛ ἄρνας ι, ἐξ ὧν διε

    φθάρηδιεφάαθη πρόβατα δέκα τέσσερα , 

    τὰ δὲ λοιπὰ πρόβατα ἑκατὸν  

    ἀπογράφομαι καὶ εἰς τὸ ἐνεστὸς 

    η ἔτος Ἀντωνίνου Καίσαρος τοῦ κυρίου
