In [1]:
import glob
import os
import lxml.etree as ET
from django.core.exceptions import ObjectDoesNotExist
from teimporter.teimodule.tei import TeiPersonList
from apis_core.apis_metainfo.models import Collection as ACollection
from apis_core.apis_entities.models import Person as APerson
from apis_core.apis_entities.models import Place as APlace
from apis_core.apis_entities.models import Work as AWork
from apis_core.helper_functions.RDFparsers import GenericRDFParser
import pandas as pd

In [2]:
# Glob pattern for the letter TEI files: every *.xml one folder level below "letters".
# The pattern can be overridden with the HBAS_LETTERS_GLOB environment variable so the
# notebook is not tied to one machine; the original local path remains the default.
data_dir = os.environ.get(
    "HBAS_LETTERS_GLOB",
    r"C:\Users\pandorfer\Documents\Redmine\konde\Hermann-Bahr_Arthur-Schnitzler\app\data\letters\*\*.xml",
)

In [3]:
# Collect the TEI files matching the pattern. glob yields paths in arbitrary
# order, so sort them to make the import order reproducible between runs.
files = sorted(glob.glob(data_dir))
len(files)

697

In [4]:
class ABWSParser(TeiPersonList):

    """ a class to process the letter TEIs imported by this notebook

    All lookups read `self.tree` and `self.nsmap`
    (presumably provided by TeiPersonList -- confirm).
    """

    def _abws_xpath(self, expr):
        """evaluates `expr` on the document tree, using self.nsmap as namespace map """
        return self.tree.xpath(expr, namespaces=self.nsmap)

    def get_titles(self):
        """returns the text nodes of all tei:title elements below tei:titleStmt """
        return self._abws_xpath("//tei:titleStmt//tei:title/text()")

    def get_title_str(self):
        """ returns the second title text as a string

        raises IndexError when fewer than two title texts are found
        """
        all_titles = self.get_titles()
        second_title = all_titles[1]
        return str(second_title)

    def mentioned_pers(self, uri_base="https://bahrschnitzler.acdh.oeaw.ac.at/"):
        """ returns one URI per tei:persName/@key value, built as uri_base + key

        repeated keys are kept, so the list may contain duplicates
        """
        person_uris = []
        for key in self._abws_xpath("//tei:persName/@key"):
            person_uris.append("{}{}".format(uri_base, key))
        return person_uris

    def get_idno(self):
        """ returns the first tei:idno of type 'URI' in tei:publicationStmt as a string

        raises IndexError when no such idno text exists
        """
        matches = self._abws_xpath(
            "//tei:publicationStmt/tei:idno[@type='URI']/text()"
        )
        return str(matches[0])

    def get_date(self):
        """ returns the first @when value found on a tei:date element

        raises IndexError when no tei:date carries @when
        """
        when_values = self._abws_xpath("//tei:date[@when]/@when")
        return when_values[0]

    def get_written_date(self):
        """ returns the first text node found inside a tei:date that has @when

        raises IndexError when no such text node exists
        """
        date_texts = self._abws_xpath("//tei:date[@when]/text()")
        return date_texts[0]
    
    

In [5]:
# Optional sanity check (disabled): print each file name together with its parsed date.
# for x in files:
#     print(os.path.split(x)[1])
#     doc = ABWSParser(x)
#     print(doc.get_date())

In [6]:
# Shared objects used by the import loop; each one is fetched via get_or_create,
# so re-running this cell reuses the existing rows.
# NOTE(review): TextType, PersonWorkRelation and WorkType are not imported in the
# visible cells -- presumably provided by the kernel environment; confirm.
col = ACollection.objects.get_or_create(name='HBAS')[0]
text_type = TextType.objects.get_or_create(name="xml/tei transcription")[0]
rel_type = PersonWorkRelation.objects.get_or_create(
    name="mentioned in",
    # spelling of the reverse label is kept unchanged: it is part of the lookup key
    name_reverse="mentiones",
)[0]
work_kind = WorkType.objects.get_or_create(name='letter')[0]
# Base of the legacy URI assigned to each imported letter.
# NOTE(review): differs from the default uri_base of ABWSParser.mentioned_pers -- confirm intended.
default_domain = "https://schnitzler-briefe.acdh.oeaw.ac.at"

In [7]:
# Import every letter: one Work per TEI file (with legacy URI, dates, collection
# and full-text), plus a "mentioned in" relation for each referenced person whose
# URI can be resolved.
# NOTE(review): Work, Uri, Text and PersonWork are not imported in the visible
# cells -- presumably provided by the kernel environment; confirm.
for tei_path in files:
    doc = ABWSParser(tei_path)
    written_date = doc.get_written_date()
    try:
        date = pd.to_datetime(doc.get_date()).strftime("%Y-%m-%d")
    except ValueError:
        # @when could not be turned into a date: keep only the written form
        date = None
    legacy_id = "{}/{}".format(default_domain, os.path.split(tei_path)[1])
    work, _ = Work.objects.get_or_create(name=doc.get_title_str())
    work.kind = work_kind
    Uri.objects.get_or_create(uri=legacy_id, entity=work)
    work.start_date_written = written_date
    work.end_date_written = written_date
    if date:
        work.start_date = date
        work.end_date = date
    # parse_dates=False: presumably keeps save() from re-deriving the date
    # fields from the written form -- confirm against the model's save()
    work.save(parse_dates=False)
    work.collection.add(col)
    text, _ = Text.objects.get_or_create(
        text=doc.xml_to_str(), kind=text_type
    )
    work.text.add(text)
    # dict.fromkeys drops repeated person keys while keeping their order, so
    # each person is looked up and saved once per letter
    for person_uri in dict.fromkeys(doc.mentioned_pers()):
        try:
            pers_uri = Uri.objects.get(uri=person_uri)
        except (ObjectDoesNotExist, Uri.MultipleObjectsReturned):
            # unknown or ambiguous URI: skip this mention; any other error
            # (e.g. a database failure) is no longer silently swallowed
            continue
        pers = APerson.objects.get(id=pers_uri.entity.id)
        pw, _ = PersonWork.objects.get_or_create(
            related_person=pers,
            related_work=work,
            relation_type=rel_type,
        )
        if date:
            pw.start_date = date
            pw.end_date = date
        pw.start_date_written = written_date
        pw.end_date_written = written_date
        pw.save(parse_dates=False)