# **1. Libaries and files**

In [33]:
#Importing libraries
import os
import glob
import re
from bs4 import BeautifulSoup, Tag, NavigableString
import xml.etree.ElementTree as ET
from xml.dom import minidom
from xml.etree.ElementTree import Element
from xml.etree.ElementTree import SubElement
import pandas as pd
from lxml import etree

In [34]:
#Change accordingly
file_path_index="C:/Users/annab/OneDrive/Desktop/Kralik/mathilde_kralik/tei/Index/index_person_place_org.xml"

# **2. Creating csv with index**

In [35]:
#Accessing index
tree = etree.parse(file_path_index)
root = tree.getroot()

In [36]:
# Define the TEI namespace
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

## **2.1. Index person**

In [37]:
#Created with the help of u:ai
# Extract <person> elements
persons = root.xpath('//tei:listPerson/tei:person', namespaces=ns)
print(f"Number of <person> elements found: {len(persons)}")

# Extract data
data = []
for person in persons:
    person_data = {}
    # Extract xml:id
    person_data['ID'] = person.get('{http://www.w3.org/XML/1998/namespace}id')
    
    # Extract forename
    forename = person.find('tei:persName/tei:forename', namespaces=ns)
    person_data['forename'] = forename.text if forename is not None else None
    
    # Extract surname
    surname = person.find('tei:persName/tei:surname', namespaces=ns)
    person_data['surname'] = surname.text if surname is not None else None

    # Extract surname maiden
    surname_maiden = person.find('tei:persName/tei:surname[@type="maiden"]', namespaces=ns)
    person_data['surname maiden']= surname_maiden.text if surname_maiden is not None else None
    
    # Extract occupations (can be multiple)
    occupations = person.findall('tei:occupation', namespaces=ns)
    person_data['occupations'] = "; ".join([occ.text for occ in occupations if occ.text])

    # Extract affiliations (can be multiple)
    affiliations = person.findall('tei:affiliation', namespaces=ns)
    affiliation_data = []
    for affiliation in affiliations:
        # Extract type and subtype attributes
        aff_type = affiliation.get('type')
        aff_subtype = affiliation.get('subtype')
        
        # Extract orgName content and ref attribute
        org_name = affiliation.find('tei:orgName', namespaces=ns)
        org_name_text = org_name.text if org_name is not None else None
        org_name_ref = org_name.get('ref') if org_name is not None else None
        
        # Combine affiliation information into a string
        affiliation_info = f"type: {aff_type}, subtype: {aff_subtype}, orgName: {org_name_text}, ref: {org_name_ref}"
        affiliation_data.append(affiliation_info)
    
    # Join all affiliations into a single string (or store as a list if preferred)
    person_data['affiliations'] = "; ".join(affiliation_data)
    
    # Extract death date and description
    death = person.find('tei:death', namespaces=ns)
    person_data['death_date'] = death.get('when') if death is not None else None
    
    # Extract idno (can be multiple)
    idnos = person.findall('tei:idno', namespaces=ns)
    person_data['idnos'] = "; ".join([f"{idno.get('type')} ({idno.get('subtype')}): {idno.text}" for idno in idnos if idno.text])
    
    # Append the person's data to the list
    data.append(person_data)

# Convert to pandas DataFrame
df_person = pd.DataFrame(data)

# Save to a CSV file (optional)
df_person.to_csv("C:/Users/annab/OneDrive/Desktop/Kralik/mathilde_kralik/data/csv/person_index.csv", index=False)

# Display the DataFrame
print(df_person)

Number of <person> elements found: 74
               ID  forename                 surname surname maiden  \
0   LudwigLobmeyr    Ludwig                 Lobmeyr           None   
1   JosephLobmeyr    Joseph                 Lobmeyr           None   
2    FranzLobmeyr     Franz                 Lobmeyr           None   
3    LouiseKralik    Louise  Kralik von Meyrswalden        Lobmayr   
4     HannsKralik     Hanns  Kralik von Meyrswalden           None   
..            ...       ...                     ...            ...   
69   ErnstDohnany     Ernst            von Dohnányi           None   
70      Beethoven    Ludwig           van Beethoven           None   
71         Mozart  Wolfgang                  Mozart           None   
72          Haydn    Joseph                   Haydn           None   
73        LeoXIII       Leo                    XIII           None   

                       occupations  \
0             k. k. Hofglashändler   
1             k. k. Hofglashändler   
2   k. 

## **2.2. Index place**

In [38]:
# Extract <place> elements
places = root.xpath('//tei:listPlace/tei:place', namespaces=ns)
print(f"Number of <place> elements found: {len(places)}")

# Extract data
data_place = []
for place in places:
    place_data = {}
    # Extract xml:id
    place_data['ID'] = place.get('{http://www.w3.org/XML/1998/namespace}id')
    
    # Extract placeName
    placename = place.find('tei:placeName', namespaces=ns)
    place_data['name'] = placename.text if placename is not None else None

    #Extract type placeName
    placename_ref=placename.get('type') if placename is not None else None
    place_data['type'] = f"{placename_ref}" if placename is not None else None
    
    # Extract geo
    geo = place.find('tei:location/tei:geo', namespaces=ns)
    place_data['geo'] = geo.text if geo is not None else None
    
    # Extract idno (can be multiple)
    idnos = place.findall('tei:idno', namespaces=ns)
    place_data['idnos'] = "; ".join([f"{idno.get('type')} ({idno.get('subtype')}): {idno.text}" for idno in idnos if idno.text])
    
    # Append the data to the list
    data_place.append(place_data)

# Convert to pandas DataFrame
df_place = pd.DataFrame(data_place)

# Save to a CSV file
df_place.to_csv("C:/Users/annab/OneDrive/Desktop/Kralik/mathilde_kralik/data/csv/place_index.csv", index=False)

# Display the DataFrame
print(df_place)

Number of <place> elements found: 28
                         ID  \
0                      Wien   
1           Weimarerplatz_3   
2         DöblingerFriedhof   
3           WähringerKirche   
4        Rosensteingasse_32   
5   HernalserHauptstraße_81   
6           EhrengrabKralik   
7               Stephansdom   
8          StMarxerFriedhof   
9                    Weimar   
10          Zentralfriedhof   
11          GoldeggGasse_19   
12       Favoritenstraße_27   
13              Vorderbrühl   
14        Elisabethstraße_1   
15        Augustinerkircher   
16        KärntnerStraße_21   
17               Rennweg_91   
18          ErdbergerKirche   
19            KircheMödling   
20            Eleonorenhain   
21                 Annathal   
22   Zentralfriedhof_Halle1   
23        WeimarerStraße_89   
24            Bendlgasse_24   
25                   Berlin   
26               Seccession   
27                     Linz   

                                                 name      type

## **2.3. Index organisation**

In [39]:
# Extract <org> elements
orgs = root.xpath('//tei:listOrg/tei:org', namespaces=ns)
print(f"Number of <org> elements found: {len(org)}")

# Extract data
data_org = []
for org in orgs:
    org_data = {}
    # Extract xml:id
    org_data['ID'] = org.get('{http://www.w3.org/XML/1998/namespace}id')
    
    # Extract orgName
    orgname = org.find('tei:orgName', namespaces=ns)
    org_data['name'] = orgname.text if orgname is not None else None
    
    # Extract idno (can be multiple)
    idnos = org.findall('tei:idno', namespaces=ns)
    org_data['idnos'] = "; ".join([f"{idno.get('type')} ({idno.get('subtype')}): {idno.text}" for idno in idnos if idno.text])
    
    # Append the data to the list
    data_org.append(org_data)

# Convert to pandas DataFrame
df_org = pd.DataFrame(data_org)

# Save to a CSV file
df_org.to_csv("C:/Users/annab/OneDrive/Desktop/Kralik/mathilde_kralik/data/csv/org_index.csv", index=False)

# Display the DataFrame
print(df_org)

Number of <org> elements found: 2
                             ID  \
0                     JLLobmeyr   
1                        Gerold   
2                    Herrenhaus   
3                        AkBild   
4        GenossenschaftKünstler   
5              Schillerstiftung   
6                 TheyerHartmut   
7   StädtischeLeichenbestattung   
8                    DruckBileg   
9    GeneralinspektionEisenbahn   
10                Statthalterei   
11           Enterprisefunebres   
12                   Bärenhöhle   
13                     Nordbahn   
14         EisenbahnMinisterium   
15              BestattungPayer   
16                     Lischkar   
17                    Vaterland   
18                  FreiePresse   
19                   Extrablatt   
20                   MeyrsNeffe   
21         WienerKonservatorium   

                                                 name  \
0                                    J. & L. Lobmeyr    
1                                         Gero

## **2.4. Index relationships**

In [40]:
#Created with the help of u:ai
# Extract <relation> elements
relations = root.xpath('//tei:listRelation/tei:relation', namespaces=ns)

# Extract data
relation_data = []
for relation in relations:
    relation_entry = {}
    # Extract relation name
    relation_entry['name'] = relation.get('name')
    
    # Extract active attribute
    relation_entry['active'] = relation.get('active')
    
    # Extract passive attribute
    relation_entry['passive'] = relation.get('passive')
    
    # Extract mutual attribute
    relation_entry['mutual'] = relation.get('mutual')
    
    # Append the relation entry to the list
    relation_data.append(relation_entry)

# Convert to pandas DataFrame
df_relations = pd.DataFrame(relation_data)

# Save to a CSV file (optional)
df_relations.to_csv("C:/Users/annab/OneDrive/Desktop/Kralik/mathilde_kralik/data/csv/relation_index.csv", index=False)


# Display the DataFrame
print(df_relations)

           name           active  \
0        parent      #AugustRath   
1        parent  #AloisiaLobmeyr   
2        parent    #LouiseKralik   
3        parent      #MaiaKralik   
4        parent   #RichardKralik   
5        parent  #AloisiaPichler   
6        parent   #EduardPichler   
7        parent   #RichardKralik   
8        parent      #MaiaKralik   
9   grandparent    #LouiseKralik   
10  grandparent      #MaiaKralik   
11  grandparent   #RichardKralik   
12      sibling             None   
13      sibling             None   
14      sibling             None   
15      sibling             None   
16      sibling             None   
17      sibling             None   
18      sibling             None   
19        uncle   #LudwigLobmeyr   
20        uncle    #LudwigKralik   
21       spouse             None   
22       spouse             None   
23       spouse             None   
24       spouse             None   
25      partner             None   

                           

# **3. Converting csv to json**

In [43]:
#df_person to json
person_json=df_person.to_json(orient='records', indent=4)
with open('C:/Users/annab/OneDrive/Desktop/Kralik/mathilde_kralik/data/json/person_index.json', 'w') as json_file:
    json_file.write(person_json)

#df_place to json
place_json=df_place.to_json(orient='records', indent=4)
with open('C:/Users/annab/OneDrive/Desktop/Kralik/mathilde_kralik/data/json/place_index.json', 'w') as json_file:
    json_file.write(place_json)

#df_org to json
org_json=df_org.to_json(orient='records', indent=4)
with open('C:/Users/annab/OneDrive/Desktop/Kralik/mathilde_kralik/data/json/org_index.json', 'w') as json_file:
    json_file.write(org_json)

#df_relation
relation_json=df_relations.to_json(orient='records', indent=4)
with open('C:/Users/annab/OneDrive/Desktop/Kralik/mathilde_kralik/data/json/relation_index.json', 'w') as json_file:
    json_file.write(relation_json)