In [1]:
import glob
import json
import os
from datetime import datetime, date

from tqdm import tqdm

from apis_core.apis_metainfo.models import Collection, Uri
from apis_core.apis_entities.models import Event, Place, Institution
from apis_core.apis_vocabularies.models import EventType, InstitutionEventRelation, PlaceEventRelation
from apis_core.apis_relations.models import InstitutionEvent, PlaceEvent

In [2]:
# Value written to the `domain` field of every Uri created by this notebook.
domain = "burgtheater"

# Prefix of the source event pages; a production's `_id` is appended to it.
base_url = "https://kulturerbe.burgtheater.at/event/"

# Label found in a production's "premiere" field -> name of the EventType
# that the resulting Event is classified with.
mapping = dict(
    [
        ("Neueinstudierung", "Theaterneueinstudierung"),
        ("Premiere", "Theaterpremiere"),
        ("Erstaufführung", "Theatererstaufführung"),
        ("Neuinszenierung", "Theaterneuinszenierung"),
        ("Uraufführung", "Theateruraufführung"),
        ("Theateraufführung", "Theateraufführung"),
    ]
)

In [3]:
# Collection that every imported event is added to. `get_or_create` returns an
# (object, created) pair; only the object is needed here.
col = Collection.objects.get_or_create(name="Burgtheater")[0]

In [4]:
# Directory holding the crawled event JSON files. The BURGTHEATER_EVENTS_DIR
# environment variable is an optional override so the notebook can run on a
# machine other than the original author's; when it is unset the original
# location is used.
events_dir = os.environ.get(
    "BURGTHEATER_EVENTS_DIR",
    "/home/csae8092/repos/burgtherater-event-crawler/data/events",
)
# glob yields matches in a filesystem-dependent order; sorting makes the
# processing order (and therefore the order of `items`) reproducible.
files = sorted(glob.glob(os.path.join(events_dir, "*.json")))

In [5]:
# Inclusive date window: only productions dated within it are imported.
start, end = date(1850, 1, 1), date(1938, 12, 31)

# --- Entities the imported events get linked to (must already exist) ---
# Venue used for "Burgtheater" performances dated after 1888-10-12.
burgtheater = Place.objects.get(id=14)
# Institution recorded as the organiser of every imported event.
burgtheater_inst = Institution.objects.get(id=36247)
# Venue used for "Burgtheater" performances dated on or before 1888-10-12
# (presumably the old house -- TODO confirm against the record with this id).
bt_alt = Place.objects.get(id=185621)

# --- Relation types for the two kinds of links ---
veranstaltet = InstitutionEventRelation.objects.get(name="veranstaltet")
ort_event = PlaceEventRelation.objects.get(name="Veranstaltungsort von")

# Venue used for every production whose location is not "Burgtheater".
akademietheater = Place.objects.get(name="Akademietheater")


In [None]:
# Import every production dated within [start, end] as an Event:
#   * get or create the Event and add it to the Burgtheater collection,
#   * register the source page as a Uri of that Event,
#   * link the Event to the organising institution and to its venue.
# All writes use get_or_create, so re-running the cell does not duplicate rows.
items = []  # keyword arguments of every processed Event, in processing order
event_types = {}  # EventType name -> instance; each type is fetched/created once
for path in tqdm(files, total=len(files)):
    # Keep the file open only while parsing; the database work follows.
    with open(path, "r", encoding="utf-8") as fp:
        data = json.load(fp)

    for production in data["productions"]:
        # Dates arrive as ISO timestamps; only the calendar-day part is used.
        date_str = production["date"].split("T")[0]
        parsed_date = datetime.strptime(date_str, "%Y-%m-%d").date()
        if not (start <= parsed_date <= end):
            continue

        title = production["title"]
        # Day and month without zero padding, e.g. "5.1.1883".
        date_formatted = f"{parsed_date.day}.{parsed_date.month}.{parsed_date.year}"

        # A falsy "premiere" value marks an ordinary performance.
        kind = production["premiere"] or "Theateraufführung"
        # Raises KeyError for a label that is missing from `mapping`.
        pmb_kind = mapping[kind]
        event_type = event_types.get(pmb_kind)
        if event_type is None:
            event_type, _ = EventType.objects.get_or_create(name=pmb_kind)
            event_types[pmb_kind] = event_type

        item = {
            "name": f"{kind} von {title}, {date_formatted}",
            "start_date_written": date_str,
            "end_date_written": date_str,
            "kind": event_type,
        }
        entity, _ = Event.objects.get_or_create(**item)
        entity.collection.add(col)
        Uri.objects.get_or_create(
            uri=f"{base_url}{production['_id']}",
            domain=domain,
            entity=entity,
        )
        items.append(item)

        # link to institution
        InstitutionEvent.objects.get_or_create(
            related_institution=burgtheater_inst,
            related_event=entity,
            start_date_written=date_str,
            end_date_written=date_str,
            relation_type=veranstaltet,
        )

        # link to place
        # Any location other than "Burgtheater" is treated as the
        # Akademietheater. NOTE(review): confirm the data has no other venues.
        if production["location"] != "Burgtheater":
            place = akademietheater
        elif parsed_date > date(1888, 10, 12):
            place = burgtheater
        else:
            # Performances on or before 1888-10-12 are linked to `bt_alt`.
            place = bt_alt
        PlaceEvent.objects.get_or_create(
            related_place=place,
            related_event=entity,
            start_date_written=date_str,
            end_date_written=date_str,
            relation_type=ort_event,
        )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 898/898 [00:15<00:00, 57.08it/s]


In [7]:
import pandas as pd

In [8]:
# One row per processed event; the columns are the keys of the `items` dicts.
df = pd.DataFrame(data=items)

In [9]:
# Bare expression: Jupyter renders the frame as this cell's output.
df

Unnamed: 0,name,start_date_written,end_date_written,kind,place
0,"Premiere von Das Mädel im Schatten, 19.12.1936",1936-12-19,1936-12-19,Theaterpremiere,Akademietheater
1,"Theateraufführung von Das Mädel im Schatten, 2...",1936-12-20,1936-12-20,Theateraufführung,Akademietheater
2,"Theateraufführung von Das Mädel im Schatten, 2...",1936-12-22,1936-12-22,Theateraufführung,Akademietheater
3,"Theateraufführung von Das Mädel im Schatten, 2...",1936-12-23,1936-12-23,Theateraufführung,Akademietheater
4,"Theateraufführung von Das Mädel im Schatten, 2...",1936-12-25,1936-12-25,Theateraufführung,Akademietheater
...,...,...,...,...,...
27723,"Theateraufführung von Der Kuss, 13.9.1882",1882-09-13,1882-09-13,Theateraufführung,Burgtheater
27724,"Theateraufführung von Der Kuss, 5.1.1883",1883-01-05,1883-01-05,Theateraufführung,Burgtheater
27725,"Theateraufführung von Der Kuss, 26.3.1887",1887-03-26,1887-03-26,Theateraufführung,Burgtheater
27726,"Theateraufführung von Der Kuss, 29.3.1887",1887-03-29,1887-03-29,Theateraufführung,Burgtheater


In [10]:
# Order the rows by the written start date (ISO "YYYY-MM-DD" strings, so the
# lexicographic order is also the chronological one).
df = df.sort_values(by="start_date_written")

In [11]:
# Write the frame to the working directory, omitting the index column.
df.to_csv(path_or_buf="foo.csv", index=False)