In [None]:
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd

def extract_xml_metadata(xml_path: str) -> dict:
    """Read the document-level fields from one Transkribus XML file.

    The ``docId``, ``title`` and ``nrOfPages`` elements are looked up as
    direct children of the root element and their text is returned. The
    text of ``title`` is reported under the key ``'sesamid'``.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        A dict with the keys ``'docId'``, ``'sesamid'`` and ``'nrOfPages'``.
        A value is ``None`` when its element is missing or has no text. If
        any exception occurs while reading the file, the error is printed
        and all three values are ``None``.
    """
    # Output key -> tag name searched for under the root element.
    tag_for_key = {
        'docId': 'docId',
        'sesamid': 'title',
        'nrOfPages': 'nrOfPages',
    }

    try:
        root = ET.parse(xml_path).getroot()

        result = {}
        for key, tag in tag_for_key.items():
            node = root.find(tag)
            if node is None:
                result[key] = None
            else:
                result[key] = node.text
        return result
    except Exception as e:
        # Best effort: report the problem and hand back an all-None record
        # so a batch run can continue with the remaining files.
        print(f"Error processing {xml_path}: {str(e)}")
        return dict.fromkeys(tag_for_key)

def process_all_xml_files(directory: str) -> pd.DataFrame:
    """Collect metadata from every ``*.xml`` file directly inside *directory*.

    Each matching file is passed to ``extract_xml_metadata`` and the file
    name without its extension is added under ``'filename'``. Files are
    handled in sorted path order so the row order is reproducible.

    Args:
        directory: Folder to scan; only its immediate ``*.xml`` entries are
            considered (the glob pattern is not recursive).

    Returns:
        A DataFrame with one row per XML file. When no XML file is found the
        frame is empty but still has the ``docId``, ``sesamid``,
        ``nrOfPages`` and ``filename`` columns, so column access on the
        result does not fail.
    """
    # Path.glob yields entries in filesystem order, which is not stable
    # across platforms; sorting makes the output deterministic.
    xml_files = sorted(Path(directory).glob('*.xml'))

    records = [
        {**extract_xml_metadata(str(xml_file)), 'filename': xml_file.stem}
        for xml_file in xml_files
    ]

    if not records:
        # pd.DataFrame([]) would have no columns at all; keep the schema.
        return pd.DataFrame(columns=['docId', 'sesamid', 'nrOfPages', 'filename'])

    return pd.DataFrame(records)

In [None]:
# Example usage: point xml_dir at the folder holding the Transkribus XML
# files, build the metadata table, then show a preview and the row count.
xml_dir = "./path/to/xml/files"  # Update with your XML directory path
df_xml = process_all_xml_files(xml_dir)

# Heading and preview are separated by a newline, as two prints would be.
print("XML Metadata summary:", df_xml.head(), sep="\n")
print("\nTotal files processed:", len(df_xml))