In [187]:
from cobalt import AkomaNtosoDocument
import json
import re
import os



In [188]:
file_folder_xml = 'files_xml'
xml_files = os.listdir(file_folder_xml)
print(xml_files)


file_folder_json = 'files_json'
if not os.path.exists(file_folder_json): 
    os.mkdir(file_folder_json)


['822.113.xml', '822.112.xml', '822.111.xml', '822.115.xml', '822.114.xml']


In [189]:
def get_element_clean_text(el):
    if el is None:
        return ''

    texts = []

    if el.text:
        texts.append(el.text.replace('\xa0', ' ').strip())
        print('el text: ',el.text)

    for child_el in el.iterchildren():
        if child_el is not None and child_el.tag not in excluded_text_tags:
            child_txt = texts.append(get_element_clean_text(child_el))
            print('child_el text: ', child_el.text)
            if child_txt:
                texts.append(child_txt)
        if child_el.tail:
            texts.append(child_el.tail.replace('\xa0', ' ').strip())
    texts_joined = ' '.join(texts)
    print('texts_joined: ', texts_joined)
    return texts_joined

In [190]:
def process_paragraph_blocklist(lst, blocklist, article_lnk, section_titles, article_title,  level_eid):
    if blocklist is None:
        return

    # Within Blocklist, some articles have an ListIntroduction others don't. The Function handles both cases.
    # Process list introduction, if found
    if hasattr(blocklist, 'listIntroduction'):
        list_intro = blocklist.listIntroduction
        level_eid = list_intro.attrib["eId"]
        paragraph_txt = get_element_clean_text(list_intro)
        if paragraph_txt:
            lst.append({'text': paragraph_txt, 'metadata': article_lnk + doc_title + section_titles + article_title, '@eId': level_eid})

    # Process any list items (this allows to capture enumerated paragraphs), ex.: Art. 958 C
    if hasattr(blocklist, 'item'):
        for item in blocklist.item:
            if hasattr(item, 'p'):
                level_eid = item.attrib["eId"]
                paragraph_txt = get_element_clean_text(item.p)
                if paragraph_txt:
                    lst.append({'text': paragraph_txt, 'metadata': article_lnk + doc_title + section_titles + article_title, '@eId': level_eid})

            # Handle blocklists nested within items
            if hasattr(item, 'blockList'):
                process_paragraph_blocklist(lst, item.blockList, article_lnk, section_titles, article_title, level_eid)


def process_article(lst, article, section_titles, level_eid):
    """
    Extract paragraphs from the different sections.
    Add paragraphs, metadata, and Id to a dictionary with the format:
    {"text": "string containing each paragraph individually",
    "metadata": [Article Link + Article number + sections names in one list](list),
    "@eId": article and paragraph information (inner "eId" attribute)}
    """
    # Check article
    if article is None:
        return lst

    article_title = [str(' '.join([str(el.text).replace('\xa0', ' ').strip() for el in article.num.getchildren()]).strip())]
    article_eid = article.attrib["eId"]
    article_url = [f'{base_url}#{article_eid}']

    # this excludes articles with no paragraphs like Art. 40g
    if not hasattr(article, 'paragraph') or len(article.paragraph) == 0:
        return lst

    for paragraph in article.paragraph:
        # Extract Paragraphs from Articles. Will cover the articles where these are not nested like Art. 1
        if hasattr(paragraph.content, 'p'):
            # print('paragraph.content: ', paragraph.content.p)
            # for p in paragraph.content:
            #     print('paragraph child:', p)
            level_eid = paragraph.attrib['eId']
            paragraph_txt = get_element_clean_text(paragraph.content)


            if paragraph_txt:
                lst.append({'text': paragraph_txt, 'metadata': article_url + doc_title +  section_titles + article_title, '@eId': level_eid})

        # When an article has blocklist within content, it will call the function process_paragraph_blocklist
        # This occurs where there are enumerated items within an article. Ex. Art. 24
        if hasattr(paragraph.content, 'blockList'):
            process_paragraph_blocklist(lst, paragraph.content.blockList, article_url, section_titles, article_title, level_eid)

    return lst


def process_sections(lst, sections, section_titles, level_eid='N/A'):
    """Retrieve all the sections and call function find_article"""
    for section in sections:
        if hasattr(section, 'num') or not section.num:

            section_title = str(get_element_clean_text(section.num)).replace('\xa0', ' ').strip()
            if hasattr(section, 'heading'):
                section_title += ' ' + str(' '.join([headingtext.text.strip() for headingtext in section.heading])).replace('\xa0', ' ').strip()
            lst = find_articles(lst, section, section_titles + [section_title], level_eid)
        else:
            lst = find_articles(lst, section, section_titles, level_eid)

    return lst



In [199]:
def find_articles(lst, parent, section_titles, level_eid='N/A'):
    """Look for article tag to trigger the function process_article"""
    # Check parent
    if parent is None:
        return lst

    # Find all articles that are children of this parent
    if hasattr(parent, 'article'):
        for article in parent.article:
            lst = process_article(lst, article, section_titles, level_eid)

    # Find all sections of type part
    if hasattr(parent, 'part'):
        # print('FOUND PARTS')
        lst = process_sections(lst, parent.part, section_titles, level_eid)

    if hasattr(parent, 'section'):
        lst = process_sections(lst, parent.section, section_titles, level_eid)

    # Find all sections of type title
    if hasattr(parent, 'title'):
        lst = process_sections(lst, parent.title, section_titles, level_eid)

    # Find all sections of type chapter
    if hasattr(parent, 'chapter'):
        lst = process_sections(lst, parent.chapter, section_titles, level_eid)

    # Find all sections of type level
    if hasattr(parent, 'level'):
        level_eid = parent.level.attrib['eId']
        lst = process_sections(lst, parent.level, section_titles, level_eid)

    # Find all sections of type book
    if hasattr(parent, 'book'):
        lst = process_sections(lst, parent.book, section_titles)

    return lst

In [200]:

xml_files=['822.111.xml']

for xml_file in xml_files:
    # open xml file
    print(xml_file)
    with open(os.path.join(file_folder_xml, xml_file), "rb") as file:
        xml_data = file.read()
    
    akn_doc = AkomaNtosoDocument(xml_data)
    print(akn_doc)
    base_url = akn_doc.root.act.meta.identification.FRBRExpression.FRBRuri.attrib["value"]
    base_url = base_url.replace('fedlex.data.admin', 'fedlex.admin')
    base_url = re.sub(r'\/\d{8}(\/\w+)$', '\\1', base_url)
    print(base_url)

    excluded_text_tags = [f'{{{akn_doc.namespace}}}{tag}' for tag in ['authorialNote', 'ref', 'num']]

    #get title and num of the document
    doc_title = [get_element_clean_text(akn_doc.root.act.preface.p[1].docTitle) + ' ' + get_element_clean_text(akn_doc.root.act.preface.p[2])]
    doc_num = ['SR ' + akn_doc.root.act.preface.p[0].docNumber.text]

    body = akn_doc.root.act.body
    print(hasattr(body, 'section'))

    list_data = []
    list_data = find_articles(list_data, body, [])
    print(list_data)

    # save it to json file
    file_path_json = os.path.join(file_folder_json, xml_file.replace('.xml', '.json'))

    with open(file_path_json, 'w', encoding='utf-8') as file:
        json.dump(list_data, file, indent=2, ensure_ascii=False)


    by_article = {}
    for elem in list_data:
        level_key = elem['metadata'][-1]
        article_key = elem['metadata'][3]
        new_key = level_key + article_key
        elem['metadata'].pop()
        # elem['metadata'][-1] += ' ZGB'
        if new_key not in by_article:
            by_article[new_key] = {'text': elem['text'], 'metadata': elem['metadata'], '@eIds': [elem['@eId']]}
        else:
            by_article[new_key]['text'] += ' ' + elem['text']
            by_article[new_key]['@eIds'].append(elem['@eId'])

    # removing the article link as a key that was used to group the data so that he exported data in the json format has the same structure
    values_by_article = list(by_article.values())

    file_path_json_by_article = os.path.join(file_folder_json, xml_file.replace('.xml', '_by_article.json'))
    with open(file_path_json_by_article, 'w', encoding='utf-8') as file:
        json.dump(values_by_article, file, indent=2, ensure_ascii=False)




822.111.xml
<cobalt.akn.AkomaNtosoDocument object at 0x10b8621e0>
https://fedlex.admin.ch/eli/cc/2000/243/de
el text:  Verordnung 1
texts_joined:  
child_el text:  None
texts_joined:  Verordnung 1  zum Arbeitsgesetz
el text:  (ArGV 1)
texts_joined:  (ArGV 1)
False
el text:  1. Kapitel: Geltungsbereich
texts_joined:  1. Kapitel: Geltungsbereich
el text:  1. Abschnitt: Begriffe
texts_joined:  1. Abschnitt: Begriffe


KeyError: 'eId'