In [None]:
import glob
import os
import typesense
from tqdm import tqdm
from typesense.exceptions import ObjectNotFound
from acdh_tei_pyutils.tei import TeiReader
from acdh_tei_pyutils.utils import extract_fulltext, get_xmlid, make_entity_label

In [None]:
# Connection settings for typesense. Everything except the collection name is
# looked up in the environment first, because in real life we would not write
# these values into the code; the fallbacks are only meant for a local setup.
TYPESENSE_COLLECTION_NAME = "maechtekongresse-play"
TYPESENSE_API_KEY = os.getenv("TYPESENSE_API_KEY", "xyz")
TYPESENSE_TIMEOUT = os.getenv("TYPESENSE_TIMEOUT", "120")  # kept as a string here
TYPESENSE_HOST = os.getenv("TYPESENSE_HOST", "localhost")
TYPESENSE_PORT = os.getenv("TYPESENSE_PORT", "8108")
TYPESENSE_PROTOCOL = os.getenv("TYPESENSE_PROTOCOL", "http")

In [None]:
# Initialise the typesense client from the variables declared above.
# The configuration is assembled step by step: first the single node we talk
# to, then the overall client settings.
typesense_node = {
    "host": TYPESENSE_HOST,
    "port": TYPESENSE_PORT,
    "protocol": TYPESENSE_PROTOCOL,
}
client_config = {
    "nodes": [typesense_node],
    "api_key": TYPESENSE_API_KEY,
    # the timeout is held as a string above, so convert it here
    "connection_timeout_seconds": int(TYPESENSE_TIMEOUT),
}
client = typesense.Client(client_config)

In [None]:
# Define the schema, which typesense calls a "collection".
# See https://typesense.org/docs/0.25.2/api/collections.html
schema_fields = [
    # plain fields
    {"name": "id", "type": "string"},
    {"name": "title", "type": "string"},
    {"name": "full_text", "type": "string"},
    # facet fields
    {"name": "year", "type": "int32", "facet": True},
    {"name": "conference", "type": "string", "facet": True},
    # optional facet fields holding lists of objects
    {"name": "persons", "type": "object[]", "facet": True, "optional": True},
    {"name": "places", "type": "object[]", "facet": True, "optional": True},
]
current_schema = {
    "name": TYPESENSE_COLLECTION_NAME,
    "enable_nested_fields": True,
    "fields": schema_fields,
}

In [None]:
# Only needed when changing an existing schema/collection or overwriting the
# documents already stored in it. Deleting the collection also deletes all of
# its documents, which is acceptable here because reindexing is fast enough.
try:
    stale_collection = client.collections[TYPESENSE_COLLECTION_NAME]
    stale_collection.delete()
except ObjectNotFound:
    # the collection does not exist (yet), so there is nothing to remove
    pass

In [None]:
# Create the collection from the schema defined above.
# The call stays the last bare expression so the cell displays its result.
collections_api = client.collections
collections_api.create(current_schema)
# afterwards, have a look at the typesense dashboard http://localhost/#/

In [None]:
# Create the "documents" we would like to ingest; in Python terms this is a
# list of dicts. We iterate over the TEI/XML files and extract the information
# matching the fields of our typesense collection.
files = sorted(glob.glob("typesense_data/*xml"))
records = []
for xml_path in tqdm(files, total=len(files)):
    doc = TeiReader(xml_path)  # parse the XML file
    doc_id = os.path.basename(xml_path)  # the file name doubles as the document id

    document = {}  # the dict we are going to index for this file
    document["id"] = doc_id
    # getting the title, quick and dirty method
    document["title"] = doc.any_xpath(".//tei:title[@type='main']")[0].text
    # even quicker and dirtier
    # NOTE(review): the `[:5]` slice keeps only the first five items of whatever
    # extract_fulltext returns (five characters if it is a string) -- confirm
    # that this truncation is intended before relying on full-text search.
    document["full_text"] = extract_fulltext(doc.any_xpath(".//tei:body")[0])[:5]
    document["conference"] = doc_id.split("_")[0]  # the place of the conference

    try:
        year = doc.any_xpath(".//tei:origin/tei:date/@when")[0][:4]
    except IndexError:
        # We deal with incomplete data: Verona_I_8.xml has no date, so a
        # placeholder year is used as one way to work around this.
        year = "1000"
    document["year"] = int(year)  # remember our schema: "type": "int32"

    # Optional facets. The keys must match the schema field names, hence
    # "places" (the schema declares no "place" field).
    document["persons"] = []
    document["places"] = []

    for person_node in doc.any_xpath(".//tei:back//tei:person"):
        document["persons"].append(
            {
                "id": get_xmlid(person_node),  # helper from acdh_tei_pyutils
                "label": make_entity_label(person_node)[0],  # helper from acdh_tei_pyutils
            }
        )
    # TODO: something similar for places
    records.append(document)


In [None]:
# Populate the collection with the documents built above.
# The call stays the last bare expression so the cell displays its result.
target_collection = client.collections[TYPESENSE_COLLECTION_NAME]
target_collection.documents.import_(records)  # this should fail the first time