### CACO Amharic Corpus

In [6]:
import xml.etree.ElementTree as ET
from lxml import etree


def fix_invalid_xml(file_path):
    """
    Repairs a malformed XML file in place using lxml's recovering parser.

    The file is parsed with ``recover=True``, re-serialized with pretty
    printing, and written back over the original file as UTF-8. Problems are
    reported on stdout instead of being raised.

    Args:
    file_path (str): The path to the XML file to repair. The file at this
        path is overwritten with the repaired document.

    Returns:
    None
    """
    try:
        parser = etree.XMLParser(recover=True)  # recover=True tolerates invalid XML
        tree = etree.parse(file_path, parser)

        # If nothing could be salvaged there is no root element; leave the
        # original file untouched rather than overwriting it.
        if tree.getroot() is None:
            print(f"Error: no XML content could be recovered from '{file_path}'.")
            return

        fixed_xml = etree.tostring(tree, encoding='unicode', pretty_print=True)

        # Serializing to a str emits no XML declaration, so readers will assume
        # UTF-8. Write UTF-8 explicitly instead of relying on the platform
        # default codec, which may be unable to encode the corpus text.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(fixed_xml)

    except IOError as e:
        # Covers missing files as well as permission and write failures.
        print(f"Error: could not read or write '{file_path}': {e}")
    except etree.XMLSyntaxError as e:
        print(f"Error: {e}")



def extract_content_from_xml(xml_file_path):
    """
    Collects the text of the top-level 'title' and 'body' elements of an XML file.

    The file is first passed through fix_invalid_xml and then parsed. Only the
    direct children of the root element are examined.

    Args:
    xml_file_path (str): The path to the XML file.

    Returns:
    content (dict): Maps 'title' and/or 'body' to the element's stripped text,
        or to None when the element has no text. Empty when neither element
        is present or when parsing fails.
    """
    content = {}
    wanted_tags = ('title', 'body')  # Extend this tuple to pick up more elements

    try:
        fix_invalid_xml(xml_file_path)
        root = ET.parse(xml_file_path).getroot()
    except ET.ParseError as e:
        print(f"Error parsing XML file: {e}")
        return content

    # Walk the root's immediate children and keep the ones of interest
    for child in root:
        if child.tag not in wanted_tags:
            continue
        raw_text = child.text
        content[child.tag] = raw_text.strip() if raw_text else None

    return content

In [7]:
# Relative path to the CACO text XML file to process.
xml_file_path = '../data/Contemporary_Amharic_Corpus_(CACO)-version_1.1/CACO_TEXT.xml'
# NOTE: extract_content_from_xml() calls fix_invalid_xml(), which rewrites this file in place.
extracted_content = extract_content_from_xml(xml_file_path)
print("Extracted Content:")
# Printing the extracted dictionary is disabled; uncomment the next line to inspect it.
# print(extracted_content)