# Validating finished Shawi Corpus Files

* Install & set up dependencies
* extract schematron from RNG and transform to XSLT
* run schematron-xslt on files

In [1]:
import io
import os
import requests
import pathlib
import re
import sys
import json
import pandas as pd
import linecache as lc
import shutil

from copy import deepcopy
from pathlib import Path
from urllib.parse import urlsplit
import saxonche
from zipfile import ZipFile
from lxml import isoschematron, etree

In [2]:
tmpDir = "tmp"
libDir = "lib"
os.makedirs(tmpDir, exist_ok=True)
os.makedirs(libDir, exist_ok=True)
namespaces = {"tei":"http://www.tei-c.org/ns/1.0"}
dataHome = ".."

# rng schema
rngSchema = dataHome + "/802_tei_odd/out/shawi_corpus.rng"
# dict rng schema
dictRngSchema = dataHome + "/802_tei_odd/out/shawi_dict.rng"
# the path to the annotated TEI transcription files
manannot = dataHome + "/010_manannot"
# the path to the dictionary
dictionary = dataHome + "/vicav_dicts/dc_shawi_eng.xml"
dictionary_draft_and_released = dataHome + "/080_scripts_generic/tmp/dc_shawi_eng_draft_and_released.xml"

# include those documents in validation which have the following status in //tei:revisionDesc/@status / ignore all others
VALIDATE_DOCS_WITH_STATUS = "done"

with saxonche.PySaxonProcessor(license=False) as proc:
    print(proc.version)
    proc.set_cwd(os.path.dirname(os.path.dirname(os.path.abspath(''))))
    print(proc.cwd)

SaxonC-HE 12.4 from Saxonica
C:\Users\mrauschsupola\Shawi


In [3]:
def downloadAndStore(url, force=False):   
    #  filename of the file to be downloaded
    fn = os.path.basename(url)
    # fn w/o extension
    basename = os.path.splitext(fn)[0]
    # extension 
    ext = os.path.splitext(fn)[1]
    dlFilePath = tmpDir + "/" + fn
    if not os.path.exists(dlFilePath) and not force == True:
        payload = requests.get(url).content
        open(dlFilePath, 'wb').write(payload)
    return dlFilePath

In [4]:
def downloadAndUnzip(url):    
    #  filename of the file to be downloaded
    fn = os.path.basename(url)
    # fn w/o extension
    basename = os.path.splitext(fn)[0]
    # extension 
    ext = os.path.splitext(fn)[1]
    
    if ext != ".zip":
        return "not a zip archive"
    else:
        zipFilePath = downloadAndStore(url)
        # the path where the content should be extracted to
        targetPath = libDir + "/" + basename
        
        
        payload = requests.get(url).content
        open(zipFilePath, 'wb').write(payload)
        ZipFile(zipFilePath).extractall(path=targetPath)
    
    return targetPath

In [5]:
def transform(s, xsl, o, parameters=[]):
    # processor keeps files open on Windows and in doing so prevents moving or copying them
    try:
        with saxonche.PySaxonProcessor(license=False) as proc:
            proc.set_configuration_property("xi", "on")
            saxon = proc.new_xslt30_processor()
            for i in parameters:
                saxon.set_parameter(name=i, value=proc.make_string_value(parameters[i]))
            exec = saxon.compile_stylesheet(stylesheet_file=os.path.abspath(xsl))
            exec.apply_templates_returning_file(source_file=os.path.abspath(s), output_file=os.path.abspath(o))
            if exec.exception_occurred:
                exec.get_error_message
                #for i in range(saxon.exception_count()-1):
                print(saxon.get_error_message())
                print(os.path.abspath(s)+" - "+os.path.abspath(xsl)+" -> "+os.path.abspath(o)+" failed")
            if os.path.exists(os.path.abspath(o)):
                return o
            else: 
                print("there was an error transforming "+s+" with stylesheet "+xsl)
    except Exception as e:
        print("Python Exception during transform:", e)

### Install XSL based schematron validator

In [6]:
_schCompiler = None
def setupSchXSLT():
    global _schCompiler
    if _schCompiler is not None:
        return _schCompiler
    schDLURL = "https://codeberg.org/SchXslt/schxslt/releases/download/v1.10.1/schxslt-1.10.1-xslt-only.zip"
    schHome = downloadAndUnzip(schDLURL)
    _schCompiler = schHome + "/schxslt-1.10.1/2.0/pipeline-for-svrl.xsl"
    if os.path.exists(_schCompiler):
        return _schCompiler
    else: 
        print("error: something went wrong, cannot locate file '" + schCompiler + "'")

In [7]:
setupSchXSLT()

'lib/schxslt-1.10.1-xslt-only/schxslt-1.10.1/2.0/pipeline-for-svrl.xsl'

## Prepare rng2sch stylesheet

Returns path to the xsl that extracts schematron form the RelaxNG schema.
This should only run once as the file gets locked (by saxon) and so further attempts to pring it to the correct location will fail.

In [8]:
_rng2sch = None 
def setupRNG2Sch():
    global _rng2sch
    if _rng2sch is not None:
        return _rng2sch
    RNG2SchtrDL = "https://raw.githubusercontent.com/Schematron/schematron/master/trunk/converters/code/ToSchematron/ExtractSchFromRNG.xsl"
    dltmp = downloadAndStore(RNG2SchtrDL)
    # tweak XSLT 
    with open(dltmp, encoding='utf-8') as inputfile:
        lines = inputfile.read()
    lines = lines.replace( 'http://www.ascc.net/xml/schematron','http://purl.oclc.org/dsdl/schematron/')
    lines = lines.replace( '<sch:schema','<sch:schema queryBinding="xslt2"')
    
    with open(dltmp, 'w', encoding='utf-8') as file:
        file.writelines(lines)
    _rng2sch = libDir+"/"+os.path.basename(dltmp)
    os.replace(dltmp, _rng2sch)
    if os.path.exists(_rng2sch):
        return _rng2sch
    else:
        print("error: something went wrong, cannot locate file '" + newPath + "'")

In [9]:
setupRNG2Sch()

'lib/ExtractSchFromRNG.xsl'

## Extract schematron from RNG and transform to XSLT

In [10]:
def extractSchematron(rng):
    """extracts a schematron document embedded in an rng schema"""
    print("extracting Schematron document from "+rng)
    rng2sch = setupRNG2Sch()
    sch = tmpDir + "/" + os.path.basename(rng) + ".sch"
    if not os.path.exists(sch):
        transform(rng, rng2sch, sch)
    return sch

In [11]:
def compileSchematron(sch):
    """compiles a schematron document to an XSLT stylesheet"""
    outputPath = tmpDir + "/" + os.path.basename(sch) + ".xsl"
    schCompiler = setupSchXSLT()
    transform(sch, schCompiler, outputPath)
    
    if os.path.exists(outputPath):
        print(outputPath)
        return outputPath
    else: 
        print("error: something went wrong, cannot locate file '" + outputPath + "'")

In [12]:
sch = extractSchematron(rngSchema)
schXSL = compileSchematron(sch)
dictSch = extractSchematron(dictRngSchema)
dictSchXSL = compileSchematron(dictSch)

extracting Schematron document from ../802_tei_odd/out/shawi_corpus.rng
tmp/shawi_corpus.rng.sch.xsl
extracting Schematron document from ../802_tei_odd/out/shawi_dict.rng
tmp/shawi_dict.rng.sch.xsl


## Run schematron and relaxNG on files

In [13]:
validationErrors = []
ignoredFiles = []
dictValidationErrors = []

In [14]:
def schValidate(schXSL, path):
    print("path: ", path)
    """validates a document (at path) against an XSL compiled schematron schema (at schXSL)"""
    errs = []
    out = tmpDir + "/validationReports/" + os.path.basename(path)

    try:
        transform(path, schXSL, out)

    except saxonche.PySaxonApiError as e:
        # We swallow parsing errors here as they should have been reported by lxml already
        print("an error occured while running schValidate on "+path)
        return []
    
    report = etree.parse(out)
    successfulReport = report.findall("{http://purl.oclc.org/dsdl/svrl}successful-report")
    failedAssert = report.findall("{http://purl.oclc.org/dsdl/svrl}failed-assert")
    doc = etree.parse(path)
    for s in successfulReport + failedAssert:
        XPath = s.attrib['location'].replace('Q{http://www.tei-c.org/ns/1.0}','tei:').replace('Q{}','')
        entry_xpath = f"{XPath}/ancestor::tei:entry[1]"
        if entry_xpath is not None and doc.xpath(entry_xpath, namespaces=namespaces):
            entry = doc.xpath(entry_xpath, namespaces=namespaces)[0] 
            entry_id = entry.get("{http://www.w3.org/XML/1998/namespace}id")
            rel_path = XPath.split("tei:entry")[1].split("]", 1)[1].lstrip("/")
        else:
            entry_id = ""
            rel_path = ""
        # DEBUG
        node = doc.xpath(XPath, namespaces=namespaces)[0]
        if not(path.startswith("../010_manannot")):
            node_with_line = tree_with_lines.xpath(XPath, namespaces=namespaces)[0]
            line_number = node_with_line.attrib.get("_line")
            path = "../vicav_dicts/dc_shawi_eng.xml"
        else:
            line_number = None
        if type(node) is etree._ElementUnicodeResult:
            elt = node.getparent()
        else:
            elt = node

        msg = s.find("{http://purl.oclc.org/dsdl/svrl}text").text

        errObj = {
            "type" : "error",
            "message":  msg,
            "line" : line_number or elt.sourceline,
            "source": path,
            "entry": entry_id,
            "relativePath": rel_path,
            "location": XPath,
            #"text": elt.text, #checking text values (for @lemmaRef)
            "stage": "schematron",
            "exceptionType": str(s.tag).replace("{http://purl.oclc.org/dsdl/svrl}",""),
        }
        
        errs.append(errObj)
    return errs
    

In [15]:
def validate(path, rngSchema, schematronSchemaXSL):
    """Validate a document against the rngSchema. Returns a list of dicts of which each one represents a validation (or parsing) error."""
    validationErrors = []
    shutil.copyfile(sch, '../802_tei_odd/out/'+os.path.basename(sch)) #copy file from tmp folder into odd folder to be able to git it
    try:
        doc = etree.parse(path)
    
        # relaxng validation
        relaxng_doc = etree.parse(rngSchema)
        relaxng = etree.RelaxNG(relaxng_doc)
        relaxng.assertValid(doc)
        
        # schematron validation
        schErrs = schValidate(schematronSchemaXSL, path)
        if len(schErrs) >= 1:
            validationErrors = validationErrors + schErrs
    
    except etree.XMLSyntaxError as e:
        valErrObj = {
            "type" : "error",
            "message": str(e), 
            "line": e.lineno,
            "source": path, 
            "location": "n/a",
            "stage" : "parsing", 
            "exceptionType": type(e).__name__
        }
        return valErrObj
        
    except etree.DocumentInvalid as e:
        for error in e.error_log:
            # we ignore rng errors about @schemaLocation since 
            # that is needed for validation in the TEI-enricher
            if error.message != "Invalid attribute schemaLocation for element TEI":
                location = "n/a" if error.path is None else error.path
                if not(path.startswith("../010_manannot")):
                    curr_path = "../vicav_dicts/dc_shawi_eng.xml"
                    entry_xpath = f"{error.path}/ancestor::tei:entry[1]"
                    if entry_xpath is not None and doc.xpath(entry_xpath, namespaces=namespaces):
                        entry = doc.xpath(entry_xpath, namespaces=namespaces)[0] 
                        entry_id = entry.get("{http://www.w3.org/XML/1998/namespace}id")
                    else:
                        entry_id = ""
                else:
                    curr_path = None
                    entry_id = None

                valErrObj = {
                    "type" : "error",
                    "message": error.message, 
                    "line": error.line, 
                    "source": curr_path or path,
                    "entry": entry_id,
                    "relativePath": None,
                    "location": location,
                    "stage" : "relaxng", 
                    "exceptionType": type(e).__name__
                }
                # DEBUG
                print(valErrObj)
                validationErrors.append(valErrObj)
        
        # if the document is invalid against the RNG, we still want to run schematron against it
        schErrs = schValidate(schematronSchemaXSL, path)
        if len(schErrs) >= 1:
            validationErrors = validationErrors + schErrs
        
        
    
    return validationErrors

In [16]:
def docStatus(path):
    """returns the status of the document at path; if the document can't be parsed, it returns a dict with the error"""
    try:
        doc = etree.parse(path)
        revisionDesc = doc.xpath("/tei:TEI/tei:teiHeader/tei:revisionDesc", namespaces=namespaces)[0]
        status = revisionDesc.attrib['status']
        return status
    # report documents which are not well-formed
    except etree.XMLSyntaxError as e:
        print(e)

In [17]:
# remove all entries where the last fs element contains the status stub

In [18]:
def indent(tree):
    teiHeader = tree.find('.//tei:teiHeader', namespaces=namespaces)
    teiText = tree.find('.//tei:text', namespaces=namespaces)
    etree.indent(teiHeader, space='   ', level=0)
    etree.indent(teiText, space='   ', level=0)

In [19]:
def get_dict_with_draft_and_released():
    with open(dictionary, encoding="utf8") as file:
        tree = etree.parse(file)

    for line in tree.iter():
        line.set("_line", str(line.sourceline))
    
    entries = tree.findall(f'.//tei:entry', namespaces=namespaces)
    for entry in entries:
        fss = entry.findall(f'.//tei:fs', namespaces=namespaces)
        if not fss:
            #print(f"No fs elements in entry {entry.attrib}")
            continue
        
        symbol = fss[-1].find(f'tei:f[@name="status"]/tei:symbol', namespaces=namespaces)
        if symbol is None:
            #print(f'no f name status found for {entry.attrib}')
            continue

        status = symbol.get("value")
        
        if status != "draft" and status != "released":
            entry.getparent().remove(entry)
            
    # copy the tree for line number retrieval, causing no errors because of surplus attribute "_line"
    tree_with_lines = deepcopy(tree)
    
    for el in tree.iter():
        el.attrib.pop("_line", None)
        
    indent(tree)
    
    with open(dictionary_draft_and_released, 'wb') as file:
        file.write(etree.tostring(tree, pretty_print=True, encoding='UTF-8'))
        
    return tree_with_lines

In [20]:
def validateAndAppend(filepath, rngSchema, schXSL, is_dict_validation):
    global validationErrors, ignoredFiles, dictValidationErrors
    print("validating " + filepath)
    results = validate(filepath, rngSchema, schXSL)
    #print(results)
    len(results)
    if type(results) is list:
        res_errs = filter(lambda x: x['type'] == "error", results)
        res_ignored = filter(lambda x: x['type'] == "ignored", results)
        if is_dict_validation is not None:
            dictValidationErrors = dictValidationErrors + list(res_errs)
            print(f"{len(list(res_errs))} found / {len(dictValidationErrors)} in total")
        else:
            validationErrors = validationErrors + list(res_errs)
            print(f"{len(list(res_errs))} found / {len(validationErrors)} in total")
    elif type(results) is dict:
        if results['type'] == "ignored":
            ignoredFiles.append(results)
    else:
        print("unknown result type")
        print(results)

In [21]:
tree_with_lines = get_dict_with_draft_and_released()

In [22]:
validateAndAppend(dictionary_draft_and_released, dictRngSchema, dictSchXSL, True)

validating ../080_scripts_generic/tmp/dc_shawi_eng_draft_and_released.xml
{'type': 'error', 'message': 'Element form has extra content: usg', 'line': 203, 'source': '../vicav_dicts/dc_shawi_eng.xml', 'entry': 'DShaAr.sid_1510', 'relativePath': None, 'location': '/*/*[2]/*/*[1]/*[3]/*[1]/*[1]', 'stage': 'relaxng', 'exceptionType': 'DocumentInvalid'}
{'type': 'error', 'message': 'Element div has extra content: entry', 'line': 96, 'source': '../vicav_dicts/dc_shawi_eng.xml', 'entry': '', 'relativePath': None, 'location': '/*/*[2]/*/*[1]/*[2]', 'stage': 'relaxng', 'exceptionType': 'DocumentInvalid'}
path:  ../080_scripts_generic/tmp/dc_shawi_eng_draft_and_released.xml
0 found / 1630 in total


In [None]:
for i in os.scandir(manannot):
    if i.name.startswith('Urfa') and i.name.endswith('.xml') and i.is_file():
        filename = os.path.basename(i)
        filepath = manannot + "/" + filename
        status = docStatus(filepath)
        print(filename, status)
        
        # if the document is not finished yet (//tei:revisionDesc/@status != DOCSTATUS_DONE), just ignore it
        if type(status) is str and status != VALIDATE_DOCS_WITH_STATUS:
            ignoredFiles.append({
                "source" : filepath,
                "type" : "ignored",
                "status": status
            })
        # if the document couldn't be parsed, docStatus() returns a dict 
        # with some error information which is appended to the list of 
        # validation errors
        elif type(status) is dict and status["type"] == "error":
            validationErrors.append(status) 
        
        # â€¦ otherwise try to validate the document
        else:
            validateAndAppend(filepath, rngSchema, schXSL, None)      

Urfa-000_Namrud-Harran-2001.xml done
validating ../010_manannot/Urfa-000_Namrud-Harran-2001.xml
path:  ../010_manannot/Urfa-000_Namrud-Harran-2001.xml
0 found / 0 in total
Urfa-002a_Joke_about_a_tribe.xml done
validating ../010_manannot/Urfa-002a_Joke_about_a_tribe.xml
path:  ../010_manannot/Urfa-002a_Joke_about_a_tribe.xml
0 found / 0 in total
Urfa-002b_Blood_feud_in_the_past.xml generated
Urfa-002c_The_incomplete_meal.xml done
validating ../010_manannot/Urfa-002c_The_incomplete_meal.xml
path:  ../010_manannot/Urfa-002c_The_incomplete_meal.xml
0 found / 0 in total
Urfa-002d_The_bull_in_the_jar.xml done
validating ../010_manannot/Urfa-002d_The_bull_in_the_jar.xml
path:  ../010_manannot/Urfa-002d_The_bull_in_the_jar.xml
0 found / 0 in total
Urfa-011_Cemetry-Harran-2010.xml done
validating ../010_manannot/Urfa-011_Cemetry-Harran-2010.xml
path:  ../010_manannot/Urfa-011_Cemetry-Harran-2010.xml
0 found / 0 in total
Urfa-012_Lentils-Harran-2010.xml done
validating ../010_manannot/Urfa-012_L

In [None]:
# validationErrors

In [None]:
validation_css_begin = """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">

<style>
table {
    font-family: Arial, sans-serif;
    font-size: 13px;
    border-collapse: collapse;
    width: 100%;
}
th, td {
    border: 1px solid #ccc;
    padding: 6px;
    text-align: left;
}
thead {
    background: #eee;
}
</style>

</head>
<script>
function filterTable() {
    const input = document.getElementById("tableFilter");
    const filter = input.value.toLowerCase();
    const table = document.getElementById("myTable");
    const rows = table.getElementsByTagName("tr");

    for (let i = 1; i < rows.length; i++) {  // skip header (row 0)
        let rowText = rows[i].innerText.toLowerCase();
        rows[i].style.display = rowText.includes(filter) ? "" : "none";
    }
}
</script>
<body>
<input
    type="text"
    id="tableFilter"
    placeholder="Type to filter..."
    onkeyup="filterTable()"
    style="
        padding: 5px 10px;
        margin-bottom: 5px;
        width: 100%;
        max-width: 400px;
        font-size: 14px;
        border: 1px solid #ccc;
        border-radius: 8px;
        box-shadow: 1px 1px 4px rgba(0,0,0,0.1);
        outline: none;
        transition: border 0.2s;
    "
    onfocus="this.style.border='1px solid #007acc';"
    onblur="this.style.border='1px solid #ccc';"
/>
"""

In [None]:
validation_css_end = """
</body>
</html>
"""

In [None]:
def make_clickable(source, line=None):
    link = source.replace('../','https://github.dev/acdh-oeaw/shawi-data/blob/main/') if source.endswith("dc_shawi_eng.xml") else source.replace('../','https://github.com/acdh-oeaw/shawi-data/tree/main/')
    if line:
        return f'<a href="{link}#L{line}">{source}</a>'
    else:
        return f'<a href="{link}">{source}</a>'

In [None]:
if len(ignoredFiles) > 0:
    df_ignored = pd.DataFrame(data=ignoredFiles).T
    df_ignored = df_ignored.transpose()
    df_ignored
    ignoredReport = "tmp/ignoredFiles.html"
    df_ignored['link'] = df_ignored.apply(lambda x: make_clickable(x['source']), axis=1)

    with open(ignoredReport, 'w', encoding='utf-8') as f:
        f.write(df_ignored.to_html(render_links=True, escape=False))

In [None]:
if len(validationErrors) > 0:
    df_err = pd.DataFrame(data=validationErrors).T
    df_err = df_err.transpose()
    print(f"found {len(validationErrors)} validation errors")
    errorReport = "tmp/validationReport.html"
    df_err['link'] = df_err.apply(lambda x: make_clickable(x['source'], x['line']), axis=1)
    if "entry" in df_err.columns:
        df_err.drop(columns=["entry"], inplace=True)
    if "relativePath" in df_err.columns:
        df_err.drop(columns=["relativePath"], inplace=True)
    with open(errorReport, 'w', encoding='utf-8') as f:
        f.write(validation_css_begin)
        f.write(df_err.to_html(render_links=True, escape=False, table_id="myTable"))
        f.write(validation_css_end)

In [None]:
#df_err

In [None]:
if len(dictValidationErrors) > 0:
    df_err = pd.DataFrame(data=dictValidationErrors).T
    df_err = df_err.transpose()
    print(f"found {len(dictValidationErrors)} dictionary validation errors")
    errorReport = "tmp/dictValidationReport.html"
    df_err['link'] = df_err.apply(lambda x: make_clickable(x['source'], x['line']), axis=1)
    if "source" in df_err.columns:
        df_err.drop(columns=["source"], inplace=True)
    with open(errorReport, 'w', encoding='utf-8') as f:
        f.write(validation_css_begin)
        f.write(df_err.to_html(render_links=True, escape=False, table_id="myTable"))
        f.write(validation_css_end)

### create and style html page

In [None]:
index_css = """
    <style>
        body {
            font-family: Arial, sans-serif;
            background: #f4f4f4;
            padding: 40px;
        }
        .card {
            background: #ffffff;
            padding: 20px;
            margin: 15px 0;
            border-radius: 10px;
            box-shadow: 0px 2px 6px rgba(0,0,0,0.1);
            transition: 0.2s;
        }
        .card:hover {
            transform: scale(1.02);
        }
        .card-link {
            text-decoration: none;
            color: #333;
        }
        .card-link h2 {
            margin: 0;
        }
    </style>
"""

In [None]:
pages = [
    ("manannot_validation.html", "manannot"),
    ("dict_validation.html", "dictionary")
]

html = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Index</title>
"""
html += index_css
html += """
</head>
<body>
    <h1>Shawi Validation Reports</h1>
    <ul>
"""

for filename, title in pages:
    html += f'''
        <div class="card">
            <a class="card-link" href="{filename}">
                <h2>{title}</h2>
            </a>
        </div>
    '''

html += """    </ul>
</body>
</html>
"""

with open("tmp/index.html", "w") as f:
    f.write(html)