In [1]:
import re
import xml.etree.ElementTree as ET
from xml.dom import minidom

In [5]:
# One cell of a table delimiter row: dashes with optional alignment colons
# ("---", ":--", "--:", ":-:").
_SEPARATOR_CELL = re.compile(r':?-+:?')


def _split_table_row(line):
    """Split a pipe-delimited table line into stripped cells, keeping empty ones."""
    body = line.strip()
    # Drop only the optional outer pipes so interior empty cells keep their position.
    if body.startswith('|'):
        body = body[1:]
    if body.endswith('|'):
        body = body[:-1]
    return [cell.strip() for cell in body.split('|')]


def _is_separator_row(line):
    """Return True if the line is a table delimiter row such as "| --- | :-: |"."""
    if '|' not in line:
        return False
    return all(_SEPARATOR_CELL.fullmatch(cell) for cell in _split_table_row(line))


def _xml_tag_name(header, position):
    """Map a column header to an XML tag name; headers that are already valid are kept."""
    # Let the XML parser decide validity: the header is usable as-is only if
    # "<header/>" parses into a single element whose tag is exactly the header.
    try:
        already_valid = ET.fromstring(f"<{header}/>").tag == header
    except (ET.ParseError, ValueError):
        already_valid = False
    if already_valid:
        return header

    # Fallback: collapse every run of characters outside a conservative ASCII
    # name alphabet into a single underscore.
    tag = re.sub(r'[^A-Za-z0-9_.-]+', '_', header)
    if not tag:
        return f"column{position}"
    if not (tag[0].isalpha() or tag[0] == '_'):
        # Names may not start with a digit, '-' or '.'.
        tag = '_' + tag
    return tag


def markdown_to_xml(markdown_file, xml_file):
    """Convert a Markdown file into a pretty-printed XML file.

    The output has a ``<document>`` root. Each ``#`` heading becomes a
    ``<sectionN title="...">`` element (N is the number of ``#`` characters)
    nested under the closest preceding heading of a lower level. Pipe tables
    become ``<table>`` elements with one ``<row>`` per data line and one child
    per column, named after the column header. Every other non-blank line
    becomes a ``<paragraph>``.

    Table handling:
      * a table starts at a line containing ``|`` that is directly followed by
        a delimiter row (e.g. ``|---|---|`` or ``| --- | :-: |``);
      * the delimiter row itself is not emitted;
      * empty cells are kept; short rows are padded with empty cells and
        surplus cells are ignored;
      * headers that are not valid XML names are sanitised (for example
        ``Volume (uL)`` becomes ``Volume_uL_``);
      * a table ends at a heading or at the first line without ``|``.

    Args:
        markdown_file: Path of the UTF-8 Markdown file to read.
        xml_file: Path of the XML file to write (overwritten if it exists).

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    with open(markdown_file, 'r', encoding='utf-8') as f:
        content = f.read()

    root = ET.Element("document")
    # Stack of (heading level, element); the root sits at level 0 and is never popped.
    section_stack = [(0, root)]
    current_element = root

    table = None
    column_tags = []
    skip_separator = False

    lines = content.split('\n')
    for index, line in enumerate(lines):
        # The line right after a table header is its delimiter row: not data.
        if skip_separator:
            skip_separator = False
            continue

        heading_match = re.match(r'^(#+)\s+(.+)$', line)
        if heading_match:
            level = len(heading_match.group(1))
            text = heading_match.group(2)

            # Close every open section at this level or deeper. Comparing
            # recorded levels (not stack depth) keeps sibling headings siblings
            # even when the document skips levels or does not start at "#".
            while section_stack[-1][0] >= level:
                section_stack.pop()

            new_element = ET.SubElement(section_stack[-1][1], f"section{level}")
            new_element.set("title", text)
            section_stack.append((level, new_element))
            current_element = new_element

            # A heading always terminates the table in progress.
            table = None
            column_tags = []
            continue

        if '|' in line:
            # Look ahead by position; searching by value would find the first
            # duplicate of this line instead of the line that follows it.
            next_line = lines[index + 1] if index + 1 < len(lines) else ''

            if _is_separator_row(next_line):
                column_tags = [
                    _xml_tag_name(header, position)
                    for position, header in enumerate(_split_table_row(line), start=1)
                ]
                table = ET.SubElement(current_element, "table")
                skip_separator = True
                continue

            if table is not None:
                cells = _split_table_row(line)
                # Pad short rows; zip() below ignores any surplus cells.
                cells += [''] * (len(column_tags) - len(cells))
                row = ET.SubElement(table, "row")
                for tag, value in zip(column_tags, cells):
                    cell = ET.SubElement(row, tag)
                    cell.text = value
                continue
        else:
            # Any line without a pipe ends the current table.
            table = None
            column_tags = []

        # Regular text content (including pipe lines that belong to no table).
        if line.strip():
            para = ET.SubElement(current_element, "paragraph")
            para.text = line.strip()

    # Re-parse with minidom only to obtain indented output.
    rough_string = ET.tostring(root, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    pretty_xml = reparsed.toprettyxml(indent="  ")

    with open(xml_file, 'w', encoding='utf-8') as f:
        f.write(pretty_xml)

In [6]:
# Usage: convert one Markdown protocol file to XML, written to the working directory.
# NOTE: the input is an absolute path on the author's machine; adjust it for your checkout.
markdown_to_xml(
    markdown_file='/Users/luke.thompson/git/protocols/protocol_PCR_16S_V4V5.md',
    xml_file='test.xml',
)
