# **1. Libraries and files**

In [38]:
#Importing libraries
import os
import glob
import re
from bs4 import BeautifulSoup, Tag, NavigableString
import xml.etree.ElementTree as ET
from xml.dom import minidom
from xml.etree.ElementTree import Element
from xml.etree.ElementTree import SubElement
import pandas as pd
from lxml import etree

In [55]:
# Path to the TEI index file (persons, places and organisations, judging by the file name).
# The default is the original author's local checkout; set the KRALIK_INDEX_PATH
# environment variable to run the notebook on another machine without editing this cell.
file_path_index = os.environ.get(
    "KRALIK_INDEX_PATH",
    "C:/Users/annab/OneDrive/Desktop/Kralik/mathilde_kralik/tei/Index/index_person_place_org.xml",
)

# **2. Creating csv with index**

In [56]:
# Accessing index: parse the TEI index file with lxml.
# `tree` is the parsed document, `root` its top-level element; the XPath
# queries in the following cells are evaluated against `root`.
tree = etree.parse(file_path_index)
root = tree.getroot()

In [57]:
# Prefix-to-URI mapping handed to every namespaced XPath / find call below.
# The "tei" prefix is only a local alias for the TEI namespace URI.
TEI_NAMESPACE_URI = 'http://www.tei-c.org/ns/1.0'
ns = {'tei': TEI_NAMESPACE_URI}

## **2.1. Index person**

In [62]:
#Created with the help of u:ai
# Build a flat table with one row per <person> of the TEI index and save it as CSV.
# Relies on `root` (parsed index) and `ns` (TEI namespace map) from the cells above.

# Attribute name of xml:id in the "{namespace}local" form that Element.get() expects.
XML_ID_ATTR = '{http://www.w3.org/XML/1998/namespace}id'
PERSON_CSV_PATH = "C:/Users/annab/OneDrive/Desktop/Kralik/mathilde_kralik/data/csv/person_index.csv"


def _text_or_none(element):
    """Return the element's text, or None when the element itself is missing."""
    return element.text if element is not None else None


def _main_surname(person):
    """Return the <surname> element to report in the 'surname' column, or None.

    A surname that is not marked type="maiden" is preferred, so the column does
    not depend on the order of the <surname> elements inside <persName>.
    If every surname is typed "maiden" (or there is none at all), fall back to
    the first <surname> found, which is what a plain lookup would return.
    """
    not_maiden = person.xpath('tei:persName/tei:surname[not(@type="maiden")]', namespaces=ns)
    if not_maiden:
        return not_maiden[0]
    return person.find('tei:persName/tei:surname', namespaces=ns)


def _format_affiliation(affiliation):
    """Render one <affiliation> as 'type: ..., subtype: ..., orgName: ..., ref: ...'.

    Missing attributes or a missing <orgName> child show up as the text 'None'.
    """
    org_name = affiliation.find('tei:orgName', namespaces=ns)
    org_name_text = org_name.text if org_name is not None else None
    org_name_ref = org_name.get('ref') if org_name is not None else None
    return (
        f"type: {affiliation.get('type')}, subtype: {affiliation.get('subtype')}, "
        f"orgName: {org_name_text}, ref: {org_name_ref}"
    )


def _person_record(person):
    """Collect the CSV columns for a single <person> element as a dict.

    Repeatable children (occupation, affiliation, idno) are joined with '; '
    into one string; single-valued fields are None when absent.
    """
    death = person.find('tei:death', namespaces=ns)
    return {
        'ID': person.get(XML_ID_ATTR),
        'forename': _text_or_none(person.find('tei:persName/tei:forename', namespaces=ns)),
        'surname': _text_or_none(_main_surname(person)),
        'surname maiden': _text_or_none(
            person.find('tei:persName/tei:surname[@type="maiden"]', namespaces=ns)
        ),
        # Occupations without text are skipped.
        'occupations': "; ".join(
            occ.text for occ in person.findall('tei:occupation', namespaces=ns) if occ.text
        ),
        'affiliations': "; ".join(
            _format_affiliation(aff) for aff in person.findall('tei:affiliation', namespaces=ns)
        ),
        # Only the @when attribute of <death> is exported, not its text content.
        'death_date': death.get('when') if death is not None else None,
        # idno elements without text are skipped.
        'idnos': "; ".join(
            f"{idno.get('type')} ({idno.get('subtype')}): {idno.text}"
            for idno in person.findall('tei:idno', namespaces=ns)
            if idno.text
        ),
    }


# Extract <person> elements
persons = root.xpath('//tei:listPerson/tei:person', namespaces=ns)
print(f"Number of <person> elements found: {len(persons)}")

# Extract data: one dict per person, turned into a DataFrame in a single step
data = [_person_record(person) for person in persons]

# Convert to pandas DataFrame
df_person = pd.DataFrame(data)

# Save to a CSV file (optional)
df_person.to_csv(PERSON_CSV_PATH, index=False)

# Display the DataFrame
print(df_person)

Number of <person> elements found: 54
                        ID   forename                        surname  \
0            LudwigLobmeyr     Ludwig                        Lobmeyr   
1            JosephLobmeyr     Joseph                        Lobmeyr   
2             FranzLobmeyr      Franz                        Lobmeyr   
3             LouiseKralik     Louise         Kralik von Meyrswalden   
4              HannsKralik      Hanns         Kralik von Meyrswalden   
5        HeinrichKralikSen   Heinrich         Kralik von Meyrswalden   
6             MathildeRath   Mathilde                           Rath   
7            WilhelmKralik    Wilhelm         Kralik von Meyrswalden   
8             LudwigKralik     Ludwig  Ritter Kralik von Meyrswalden   
9         HildegardLobmeyr  Hildegard                        Lobmeyr   
10          AloisiaLobmeyr    Aloisia                        Lobmeyr   
11              CarlGerold       Carl                         Gerold   
12           RudolfLobmeyr