In [1]:
import glob
import os

import lxml.etree as ET
from django.core.exceptions import ObjectDoesNotExist

from apis_core.apis_entities.models import Person as APerson
from apis_core.apis_entities.models import Place as APlace
from apis_core.apis_entities.models import Work as AWork
from apis_core.apis_metainfo.models import Collection as ACollection
from teimporter.teimodule.tei import TeiPersonList

In [2]:
# Glob pattern matching the TEI edition files to import.
# Set the STB_DATA_GLOB environment variable to point at another checkout;
# the original local path is kept as the default so existing runs are unaffected.
data_dir = os.environ.get(
    "STB_DATA_GLOB",
    r"C:\Users\pandorfer\Documents\Redmine\konde\schnitzler-tagebuch\data\editions\*.xml",
)

In [3]:
# glob.glob returns matches in arbitrary order; sorting makes the processing
# order of the import reproducible between runs.
files = sorted(glob.glob(data_dir))
len(files)

16405

In [17]:
class StbParser(TeiPersonList):
    """Parser for the TEI files of the Schnitzler diary edition.

    Relies on ``self.tree``, ``self.nsmap`` and ``process_listperson()``,
    none of which are defined here (presumably provided by ``TeiPersonList``
    -- confirm against the base class).
    """

    def get_titles(self):
        """Return the text nodes of all tei:title elements below tei:titleStmt."""
        expr = "//tei:titleStmt//tei:title/text()"
        nodes = self.tree.xpath(expr, namespaces=self.nsmap)
        return nodes

    def get_title_str(self):
        """Return the title string "<second title>, <first title>".

        Raises:
            IndexError: if the document has fewer than two title text nodes.
        """
        titles = self.get_titles()
        return "{}, {}".format(titles[1], titles[0])

    def mentioned_pers(self, uri_base="https://dse.hephaistos.arz.oeaw.ac.at/exist/apps/schnitzler-process/"):
        """Return one URI string per entry of ``process_listperson()``.

        Each URI is ``uri_base`` followed by the entry's ``'xml_id'`` value.
        """
        persons = self.process_listperson()
        return ["{}{}".format(uri_base, x['xml_id']) for x in persons]

    def get_idno(self):
        """Return the text of the first tei:idno[@type='URI'] in tei:publicationStmt.

        Raises:
            IndexError: if the document has no such idno.
        """
        expr = "//tei:publicationStmt/tei:idno[@type='URI']/text()"
        idno = self.tree.xpath(expr, namespaces=self.nsmap)
        return "{}".format(idno[0])

    def get_date(self):
        """Return the @when value of the first tei:date[@when] in tei:body, or None."""
        expr = "//tei:body//tei:date[@when]/@when"
        matches = self.tree.xpath(expr, namespaces=self.nsmap)
        if not matches:
            return None
        return "{}".format(matches[0])

    def get_written_date(self):
        """Return the first text node of a tei:date[@when] in tei:body, or None.

        A date element may carry @when but no text content; in that case
        None is returned (mirroring ``get_date``) instead of raising IndexError.
        """
        expr = "//tei:body//tei:date[@when]/text()"
        matches = self.tree.xpath(expr, namespaces=self.nsmap)
        if not matches:
            return None
        return "{}".format(matches[0])
    

In [18]:
# Collection and vocabulary records shared by every imported diary entry.
# get_or_create returns an (object, created) pair; only the object is kept.
col = ACollection.objects.get_or_create(name='STB')[0]
text_type = TextType.objects.get_or_create(name="xml/tei transcription")[0]
# NOTE(review): "mentiones" looks like a typo for "mentions"; it is left
# unchanged because get_or_create matches on this exact string.
rel_type = PersonWorkRelation.objects.get_or_create(
    name="mentioned in",
    name_reverse="mentiones",
)[0]
work_kind = WorkType.objects.get_or_create(name='diary')[0]

In [19]:
# for x in files[:50]:
#     doc = StbParser(x)
#     print(doc.get_written_date())

In [20]:
# Import every diary file as a Work and link it to the persons it mentions.
for xml_path in files:
    doc = StbParser(xml_path)
    title = doc.get_title_str()
    start_date = doc.get_date()
    # The written form is only looked up when a machine-readable date exists.
    start_date_written = doc.get_written_date() if start_date else None

    work, _ = AWork.objects.get_or_create(name=title)
    # Default dates: the last ten characters of the title
    # (presumably an ISO date -- confirm); overridden below when
    # the body carries a dated tei:date element.
    work.start_date = title[-10:]
    work.end_date = title[-10:]
    Uri.objects.get_or_create(uri=doc.get_idno(), entity=work)
    text, _ = Text.objects.get_or_create(
        text=doc.xml_to_str(), kind=text_type
    )
    work.kind = work_kind
    if start_date:
        work.start_date_written = start_date_written
        work.end_date_written = start_date_written
        work.start_date = start_date
        work.end_date = start_date
    # NOTE(review): parse_dates=False is handed to the model's save();
    # presumably it keeps the explicitly assigned dates -- confirm.
    work.save(parse_dates=False)
    work.text.add(text)
    work.collection.add(col)

    for person_uri in doc.mentioned_pers():
        try:
            pers_uri = Uri.objects.get(uri=person_uri)
        except (ObjectDoesNotExist, Uri.MultipleObjectsReturned):
            # Unknown or ambiguous URIs are skipped, as before; any other
            # error (e.g. a database failure) is no longer silently swallowed.
            pers_uri = None
        if not pers_uri:
            continue
        pers = APerson.objects.get(id=pers_uri.entity.id)
        pw, _ = PersonWork.objects.get_or_create(
            related_person=pers,
            related_work=work,
            relation_type=rel_type,
        )
        if start_date:
            pw.start_date_written = start_date_written
            pw.end_date_written = start_date_written
            pw.start_date = start_date
            pw.end_date = start_date
        pw.save(parse_dates=False)

In [None]:
# Debugging leftover: prints the `pers_uri` value left behind by the import
# loop's most recent person lookup.
# NOTE(review): raises NameError if the import cell has not assigned
# `pers_uri` in the current kernel session.
print(pers_uri)

In [None]:
# Import run: started at 10:32, finished at about 14:30 (roughly four hours).