# XML Parser Test

In [2]:
import re
from collections import Counter
from lxml import etree

tree = etree.parse('../datasets/Kranjska-xml/Corpus-Kranjska.xml')
tree.xinclude()
root = tree.getroot()

xmlns='http://www.w3.org/XML/1998/namespace'

{None: 'http://www.tei-c.org/ns/1.0', 'xi': 'http://www.w3.org/2001/XInclude'}

This is a small test to see if we can traverse the DOM. We count the number of meetings in year 1895.

DOC: https://lxml.de/tutorial.html#parsing-from-strings-and-files

In [64]:
# Function to strip namespace from tag
def strip_namespace(tag):
    return tag.split('}')[-1] if '}' in tag else tag

# Travers the DOM tree recursively
def traverse_tree(element, meeting_dict, update_id):
    import re
    xmlns='http://www.w3.org/XML/1998/namespace'
    skip_meeting = False

    # Regular expression pattern to match the year (four digits)
    pattern = r'\b\d{4}\b'
    
    # TODO - save elements we want to save else continue the traverse
    if strip_namespace(element.tag) == 'meeting':
            
            if element.text is not None: # sanity check
                if element.attrib.get(f'{{{xmlns}}}lang') == 'de':
                    update_id[0] +=1
                    
                    # extract year
                    match = re.search(pattern, element.text)

                    if match:
                        year = match.group()
                        
                        # Initialize new meeting
                        meeting_dict['id'].append(update_id[0])
                        meeting_dict['year'].append(year)
                        meeting_dict['title'].append(element.text)
                        meeting_dict['speakers'].append([])

                        skip_meeting = False
                    else: 
                        skip_meeting = True


    if strip_namespace(element.tag) == 'note' and not skip_meeting:
         
         if element.text is not None: # sanity check
              if element.attrib.get(f'type') == 'speaker':
                   
                   # update list of spekers
                   meeting_dict['speakers'][-1].append(element.text)
                   
    for child in element:
        traverse_tree(child, meeting_dict, update_id)


In [65]:
update_id = [0]

meeting_dict = {
    'id': [],
    'title': [],
    'year': [],
    'speakers': []
}

traverse_tree(root, meeting_dict, update_id)  

## Sanity check and Post processing

We check that we have one row per meeting

In [66]:
print('Numb of Meetings', 
      len(meeting_dict['id']), 
      len(meeting_dict['year']),
      len(meeting_dict['title']), 
      len(meeting_dict['speakers'])
)

Numb of Meetings 667 667 667 667


In [57]:
meeting_dict['id']

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

sanity check years

In [67]:
set(meeting_dict['year'])

{'1861',
 '1863',
 '1864',
 '1865',
 '1866',
 '1867',
 '1868',
 '1869',
 '1870',
 '1871',
 '1872',
 '1873',
 '1874',
 '1875',
 '1876',
 '1877',
 '1878',
 '1880',
 '1881',
 '1882',
 '1883',
 '1884',
 '1885',
 '1886',
 '1887',
 '1888',
 '1889',
 '1890',
 '1892',
 '1893',
 '1894',
 '1895',
 '1896',
 '1897',
 '1898',
 '1899',
 '1900',
 '1901',
 '1902',
 '1903',
 '1904',
 '1905',
 '1906',
 '1908',
 '1909',
 '1910',
 '1911',
 '1912',
 '1913',
 '1918'}

Count how often a speaker speaks

In [68]:
meeting_dict['title']

['Stenographischer Bericht der  ersten Sitzung des Landtages zn Laibach  am 6. April 1861.',
 'Stenographischer Bericht der zweiten Sitzung des Landtages zu Laibach am 8. April 1861.',
 '26 Stenographischer Bericht der dritten Sitzung -es Landtages zu Laibach am 10. April 1861.',
 '36 Stenographischer Bericht  der  vierte» Sitzung des Landtages zu Laibach  am 11. April 1861.',
 '39 Stenographischer Bericht der fünften Sitzung des Landtages zu Laibach am 13. April 1861.',
 '53 Stenographischer Bericht  der  sechsten Sitzung des Landtages zu Laibach  am 15. April 1861.',
 '71 Stenographischer Bericht der siebenten Sitzung des Landtages zu Laibach am 16. April 1861.',
 '84 Stenographischer Bericht  der  achten Sitzung des Landtages zu Laibach  am 17. April 1861.',
 '97 Stenographischer Bericht  der  neunten Sitzung des Landtages zu Laibach  am 20. April 1861.',
 'der ersten Sitzung des Landtages zu Laibach am 8. Zämcr 1863.',
 'der zweiten Sitzung des Landtages zu Laibach ant 10. Miner 18

In [52]:
meeting_dict['speakers'][1]


from collections import Counter

speakers_counter = []
for meeting in meeting_dict['speakers']:
    speakers_counter.append(dict(Counter(meeting)))

meeting_dict['speakers'] = speakers_counter

display(len(meeting_dict['speakers'][1]))


30

In [69]:
import json

data = meeting_dict

# Define the filename for the JSON file
filename = "../cahe/out.json"

# Write the dictionary to a JSON file
with open(filename, "w") as json_file:
    json.dump(data, json_file)

print("Data saved to", filename)

FileNotFoundError: [Errno 2] No such file or directory: '../cahe/out.json'