In [None]:
from acdh_tei_pyutils.tei import TeiReader
from acdh_tei_pyutils.utils import get_xmlid, normalize_string
from tqdm import tqdm
from icecream import ic
from normdata.utils import import_from_normdata

In [None]:
# Remote TEI listPerson file that the cells below parse and import.
source_file = (
    "https://github.com/hermann-bahr/bahr-index/raw/main/"
    "tsn/listPerson_TSN_1.xml"
)

In [None]:
# Wrap the document referenced by source_file in a TeiReader; the following cells
# query it through doc.any_xpath(...) and reuse doc.nsmap for element-level XPath.
doc = TeiReader(source_file)

In [None]:
# Domain / base URL under which every imported person gets its Uri record.
# NOTE(review): "herman" is spelled with a single "n" here, unlike "hermann-bahr"
# in source_file. Left untouched because both values are written to Uri rows;
# confirm before changing.
bahr_domain = "hermanbahrtextverzeichnis"
bahr_url = "https://hermanbahrtextverzeichnis/"

# Namespace prefix map of the parsed document, passed to every x.xpath(...) call.
nsmap = doc.nsmap

# Collection that all imported persons are added to (fetched, or created if absent).
bahr_col, _ = Collection.objects.get_or_create(name="Bahr Textverzeichnis")

In [None]:
# Import every person that carries a GND or Wikidata identifier through
# import_from_normdata and register its TSN-based URL as a Uri of that entity.
#
# Bookkeeping sets (all of them hold TSN identifiers):
#   no_gnd       -- persons without a GND idno
#   no_wikidata  -- persons with neither a GND nor a Wikidata idno
#   not_created  -- persons for which this cell obtained no entity, either
#                   because no identifier exists or because the import
#                   returned a falsy value
no_gnd = set()
no_wikidata = set()
not_created = set()


def _first_idno(person, idno_type):
    """Return the text of the first ``tei:idno`` of ``idno_type``, or ``None``."""
    hits = person.xpath(
        f".//tei:idno[@type='{idno_type}']/text()", namespaces=nsmap
    )
    return hits[0] if hits else None


for x in tqdm(doc.any_xpath(".//tei:person")):
    hbtv_uri = x.xpath(".//tei:idno[@type='TSN']/text()", namespaces=nsmap)[0]
    hbtv_url = f"{bahr_url}{hbtv_uri}"
    entity = None

    # The identifier lookup is kept separate from the import call so that an
    # IndexError raised inside import_from_normdata is no longer mistaken for
    # "this person has no GND/Wikidata idno".
    gnd = _first_idno(x, "gnd")
    if gnd is not None:
        entity = import_from_normdata(gnd, "person")
    else:
        no_gnd.add(hbtv_uri)
        wikidata = _first_idno(x, "wikidata")
        if wikidata is not None:
            wikidata_url = f"http://www.wikidata.org/entity/{wikidata}"
            entity = import_from_normdata(wikidata_url, "person")
        else:
            no_wikidata.add(hbtv_uri)

    if not entity:
        # Also covers imports that ran but produced nothing, so these persons
        # no longer disappear silently from the bookkeeping.
        not_created.add(hbtv_uri)
        continue

    entity.collection.add(bahr_col)
    uri, _ = Uri.objects.get_or_create(uri=hbtv_url, domain=bahr_domain)
    uri.entity = entity
    uri.save()
   

In [None]:
# Progress marker: the next cell creates persons that carry no normdata identifier.
print("now import entities without normdata records")

In [None]:
def _first_or_blank(node, path):
    """Return the first result of XPath ``path`` on ``node``, or ``""`` if none."""
    hits = node.xpath(path, namespaces=nsmap)
    return hits[0] if hits else ""


# Create Person records by hand for entries with exactly one tei:idno (expected
# to be the TSN), i.e. entries that offer no GND or Wikidata identifier.
for x in tqdm(doc.any_xpath(".//tei:person")):
    if len(x.xpath(".//tei:idno", namespaces=nsmap)) != 1:
        continue

    hbtv_uri = x.xpath(".//tei:idno[@type='TSN']/text()", namespaces=nsmap)[0]
    hbtv_url = f"{bahr_url}{hbtv_uri}"
    uri, _ = Uri.objects.get_or_create(uri=hbtv_url, domain=bahr_domain)
    if uri.entity:
        # Already linked to an entity, so nothing to create for this entry.
        continue

    # Element.get() is used instead of attrib[...] so that a <sex> element
    # without @value yields "" rather than an uncaught KeyError.
    sex_nodes = x.xpath("./tei:sex", namespaces=nsmap)
    gender = sex_nodes[0].get("value", "") if sex_nodes else ""

    entity = Person.objects.create(
        name=_first_or_blank(x, "./tei:persName[1]/tei:surname[1]/text()"),
        first_name=_first_or_blank(x, "./tei:persName[1]/tei:forename[1]/text()"),
        start_date_written=_first_or_blank(x, "./tei:birth/tei:date/text()"),
        end_date_written=_first_or_blank(x, "./tei:death/tei:date/text()"),
        gender=gender,
    )
    entity.collection.add(bahr_col)
    uri.entity = entity
    uri.save()

In [None]:
# Attach each tei:occupation of a person as a ProfessionType to the entity that
# is registered under the person's TSN-based URL.
for x in tqdm(doc.any_xpath(".//tei:person[./tei:occupation]")):
    hbtv_uri = x.xpath(".//tei:idno[@type='TSN']/text()", namespaces=nsmap)[0]
    hbtv_url = f"{bahr_url}{hbtv_uri}"
    try:
        uri = Uri.objects.get(uri=hbtv_url, domain=bahr_domain)
    except Exception:
        # Skip this person. Without the `continue` the loop would carry on with
        # the `uri` left over from the previous iteration and attach the
        # professions to the wrong entity.
        ic(hbtv_url)
        continue
    try:
        entity = uri.entity.get_child_entity()
    except Exception:
        # Reached e.g. when the Uri has no entity attached.
        ic(uri)
        continue
    for o in x.xpath("./tei:occupation/text()", namespaces=nsmap):
        profession, _ = ProfessionType.objects.get_or_create(name=o)
        entity.profession.add(profession)

In [None]:
# Build "volume, S. page." quotes from the tei:bibl entries of each person and
# append them to the `references` field of the entity registered for that person.
for x in tqdm(doc.any_xpath(".//tei:person[@xml:id][./tei:listBibl]")):
    ref = []
    for y in x.xpath(".//tei:bibl[./tei:biblScope]", namespaces=nsmap):
        volumes = y.xpath("./tei:biblScope[@unit='volume']", namespaces=nsmap)
        pages = y.xpath("./tei:biblScope[@unit='page']", namespaces=nsmap)
        if not volumes or not pages:
            # A bibl carrying only one of the two units cannot form a quote.
            continue
        volume = volumes[0].text
        page = pages[0].text
        if volume is None or page is None:
            # Empty elements have .text == None; skip instead of raising.
            continue
        if volume.endswith("."):
            volume = normalize_string(volume[:-1])
        ref.append(f"{volume}, S. {normalize_string(page)}.")

    if not ref:
        # Nothing usable: do not write a bare newline into `references`.
        continue
    all_quotes = " *** ".join(ref)

    # NOTE(review): this cell derives the URL from @xml:id while the other
    # cells use idno[@type='TSN']; presumably both hold the same value -- confirm.
    hbtv_uri = get_xmlid(x)
    work_uri = f"{bahr_url}{hbtv_uri}"
    try:
        work = Uri.objects.get(uri=work_uri).entity.get_child_entity()
    except Exception:
        # Same log-and-skip handling as the profession cell: unknown Uri or a
        # Uri without an entity must not abort the whole run.
        ic(work_uri)
        continue

    orig_ref = work.references
    if orig_ref and all_quotes in orig_ref:
        # Already appended by an earlier run.
        continue
    # The leading newline for an empty/None field is kept as before, so that
    # newly written values match the format of already stored ones.
    work.references = f"{orig_ref or ''}\n{all_quotes}"
    work.save()