### Named Entity Recognition

The Amharic named entity dataset annotated within the SAY project
at New Mexico State University’s Computing Research Laboratory was used.
The data is annotated with six classes, namely 
- `person`
- `location`
- `organization`
- `time`
- `title`
- `others`

In [35]:
from lxml import etree
import xml.etree.ElementTree as ET
import os


def fix_invalid_xml(file_path):
    """Parse an XML file in recovery mode and overwrite it with the result.

    The file is read with an lxml parser created with ``recover=True``,
    serialised back to text with pretty printing, and written over the
    original path.

    Args:
        file_path: Path of the XML file to repair in place.

    Returns:
        None. I/O errors and XML syntax errors are reported with ``print``
        and are not re-raised.
    """
    try:
        parser = etree.XMLParser(recover=True)  # Set recover=True to handle invalid XML
        tree = etree.parse(file_path, parser)
        fixed_xml = etree.tostring(tree, encoding='unicode', pretty_print=True)

        # Write UTF-8 explicitly: the serialised text carries no XML
        # declaration, so readers assume UTF-8, while the platform default
        # encoding may be unable to represent non-ASCII text at all.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(fixed_xml)

    except IOError:
        print("Error: File not found.")
    except etree.XMLSyntaxError as e:
        print(f"Error: {e}")


def extract_content_by_class(file_path, all_files_content):
    """Collect the text of every ``<font>`` element, grouped by its ``class``.

    The file is first passed through ``fix_invalid_xml`` and then parsed
    with ``xml.etree.ElementTree``. For each ``<font>`` element the stripped
    text is added to the set stored under the element's ``class`` attribute
    value (``None`` when the attribute is absent).

    Args:
        file_path: Path of the XML file to read.
        all_files_content: Dict mapping class name to a set of strings.
            It is updated in place.

    Returns:
        The same ``all_files_content`` dict, or ``None`` if the file could
        not be found.
    """
    try:
        fix_invalid_xml(file_path)
        tree = ET.parse(file_path)
    except FileNotFoundError:
        print("File not found.")
        return None

    for element in tree.getroot().iter('font'):
        raw_text = element.text
        # ``text`` is None when the element is empty or starts with a child
        # element; skip it rather than fail on ``None.strip()`` and lose the
        # remaining elements of the file.
        if raw_text is None:
            continue

        class_name = element.get('class')
        all_files_content.setdefault(class_name, set()).add(raw_text.strip())

    return all_files_content
    

def extract_content_from_xml_folder(folder_path):
    """Run ``extract_content_by_class`` on every ``.xml`` file in a folder.

    Args:
        folder_path: Directory whose direct entries are scanned; only names
            ending in ``.xml`` are processed.

    Returns:
        A dict accumulated across all processed files. Any exception raised
        while handling a single file is printed and that file is skipped.
    """
    collected = {}
    xml_names = (name for name in os.listdir(folder_path) if name.endswith(".xml"))

    for filename in xml_names:
        try:
            extract_content_by_class(os.path.join(folder_path, filename), collected)
        except Exception as e:
            # Best-effort: report the failing file and carry on with the rest.
            print(f"erro occured filename: {filename}, error: {e}")

    return collected


In [37]:
# Example usage: run the extraction over a folder of XML files and print,
# for each class key, how many distinct strings were collected.
folder_path = '../data/data/amharic/tagged/nmsu-say/'  # Path to your folder containing XML files
all_files_content = extract_content_from_xml_folder(folder_path)
# An empty result (no matching files, or nothing extracted) prints nothing.
if all_files_content:
    print("Content extracted from all XML files based on their class:")
    for class_name, content_list in all_files_content.items():
        # Each value is a set, so the count is of unique strings per class.
        print(f"\tClass: {class_name}, count: {len(content_list)}")
        


Content extracted from all XML files based on their class:
	Class: ORG, count: 850
	Class: TIME, count: 653
	Class: TTL, count: 163
	Class: PER, count: 723
	Class: LOC, count: 641
