In [31]:
import xml.etree.cElementTree as et
import pandas as pd
import os
import re
from tqdm import tqdm_notebook

## Notarieel archief

In [None]:
df_cols = ["uuid", "rubriek", "notaris", "inventarisNr", "akteNr", "akteType", 
           "datering", "taal", "beschrijving", "namen", 'urls']
df = pd.DataFrame(columns = df_cols)
for file in ["SAA_Index_op_notarieel_archief_20191105_001", 
             "SAA_Index_op_notarieel_archief_20191105_002",
             "SAA_Index_op_notarieel_archief_20191105_003",
             "SAA_Index_op_notarieel_archief_20191105_004",
             "SAA_Index_op_notarieel_archief_20191105_005",
             "SAA_Index_op_notarieel_archief_20191105_006",
             "SAA_Index_op_notarieel_archief_20191105_007",
             "SAA_Index_op_notarieel_archief_20191105_008",
             "SAA_Index_op_notarieel_archief_20191105_009",
             "SAA_Index_op_notarieel_archief_20191105_010"]:
    parsedXML = et.parse("notarieel/" + file + ".xml")
    root = parsedXML.getroot()
    rows = []

    for node in root: 
        if node is not None:
            s_uuid = node.find("uuid").text if node.find("uuid") is not None else None
            s_rubriek = node.find("rubriek").text if node.find("rubriek") is not None else None
            s_notaris = node.find("notaris").text if node.find("notaris") is not None else None
            s_inventarisNr = node.find("inventarisNr").text if node.find("inventarisNr") is not None else None
            s_akteNr= node.find("akteNr").text if node.find("akteNr") is not None else None
            s_akteType = node.find("akteType").text if node.find("akteType") is not None else None
            s_datering = node.find("datering").text if node.find("datering") is not None else None
            s_taal = node.find("taal").text if node.find("taal") is not None else None
            s_beschrijving = node.find("beschrijving").text if node.find("beschrijving") is not None else None
            s_namen = []
            if node.find("persoonsnamen") is not None:
                namen = []
                for x in node.find("persoonsnamen"):
                    voornaam = x.find("voornaam").text if x.find("voornaam") is not None else None
                    tussenvoegsel = x.find("tussenvoegsel").text if x.find("tussenvoegsel") is not None else None
                    achternaam = x.find("achternaam").text if x.find("achternaam") is not None else None
                    scanNaam = x.find("scanNaam").text if x.find("scanNaam") is not None else None
                    scanPositie = x.find("scanPositie").text if x.find("scanPositie") is not None else None
                    uuidNaam = x.find("uuidNaam").text if x.find("uuidNaam") is not None else None

                    namen.append({"voornaam": voornaam, "tussenvoegsel": tussenvoegsel, 
                         "achternaam": achternaam, "scanNaam": scanNaam, 
                         "scanPositie": scanPositie, "uuidNaam": uuidNaam})
            else:
                namen = None
            if node.find("urlScans") is not None:
                urls = []
                for x in node.find("urlScans"):
                    urlScan = x.text if x is not None else None
                    if urlScan != None:
                        urls.append(re.search(r'#.+', urlScan).group(0)[1:])
            else:
                urls = None
        else:
            s_uuid = None
            s_rubriek = None
            s_notaris = None
            s_inventarisNr = None
            s_akteNr= None
            s_akteType = None
            s_datering = None
            s_taal = None
            s_beschrijving = None
        rows.append({"uuid": s_uuid, "rubriek": s_rubriek, 
                     "notaris": s_notaris, "inventarisNr": s_inventarisNr, 
                     "akteNr": s_akteNr, "akteType": s_akteType,
                     "datering": s_datering, "taal": s_taal,
                     "beschrijving": s_beschrijving, "namen": namen, "urls":urls})

    out_df = pd.DataFrame(rows, columns = df_cols)
    df = df.append(out_df)
    df = df.reset_index(drop=True)


In [None]:
df

## Scanned Data

In [6]:
def namespace(element):
    m = re.match(r'\{.*\}', element.tag)
    return m.group(0) if m else ''

In [7]:
files = []
directory = 'D:\Documenten\Studie\Master\Scriptie\scanned\page'
for filename in os.listdir(directory):
    if filename.endswith(".xml"):
        files.append(filename)
    else:
        continue

In [41]:
df_cols = ["page", "textregion"]
scan_df = pd.DataFrame(columns = df_cols)
for file in tqdm_notebook(files):
    parsedXML = et.parse('D:/Documenten/Studie/Master/Scriptie/scanned/page/' + file)
    root = parsedXML.getroot()
    rows = []
    page = file[:-3] + "JPG"
    text = ""
    for node in root: 
        nspace = namespace(node)
        if node is not None:
            for child in node:
                if child.find(nspace + "TextEquiv") is not None:
                    if child.find(nspace + "TextEquiv").find(nspace + "Unicode").text is not None:
                        text += str(child.find(nspace + "TextEquiv").find(nspace + "Unicode").text)
    if text != "":           
        rows.append({"page": page, "textregion": text })
        out_df = pd.DataFrame(rows, columns = df_cols)
        scan_df = scan_df.append(out_df)
        scan_df = scan_df.reset_index(drop=True)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, max=43476), HTML(value='')))




## Cleaning data

In [42]:
def cleaner(string):
    string  = re.sub('\\n', ' ', string)
    return string

In [43]:
tqdm_notebook().pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [44]:
scan_df.textregion = scan_df.textregion.progress_apply(cleaner)

HBox(children=(IntProgress(value=0, max=43182), HTML(value='')))




In [45]:
def writer(row):
    file = open(str(row['page']) + ".txt", "w") 
    file.write(row['textregion']) 
    file.close() 

In [49]:
os.chdir('../text_files')
scan_df.progress_apply(writer, axis=1)

HBox(children=(IntProgress(value=0, max=43182), HTML(value='')))




0        None
1        None
2        None
3        None
4        None
         ... 
43177    None
43178    None
43179    None
43180    None
43181    None
Length: 43182, dtype: object