# Validating finished Shawi Corpus Files

* Install & set up dependencies
* extract schematron from RNG and transform to XSLT
* run schematron-xslt on files

In [1]:
import io
import os
import requests
import pathlib
import re
import sys
import json
import pandas as pd
import linecache as lc
import shutil

from pathlib import Path
from urllib.parse import urlsplit
import saxonche
from zipfile import ZipFile
from lxml import isoschematron, etree

In [2]:
tmpDir = "tmp"
libDir = "lib"
os.makedirs(tmpDir, exist_ok=True)
os.makedirs(libDir, exist_ok=True)
nss = {"tei":"http://www.tei-c.org/ns/1.0"}
dataHome = ".."

# rng schema
rngSchema = dataHome + "/802_tei_odd/out/shawi_corpus.rng"
# dict rng schema
dictRngSchema = dataHome + "/802_tei_odd/out/shawi_dict.rng"
# the path to the annotated TEI transcription files
manannot = dataHome + "/010_manannot"
# the path to the dictionary
dictionary = dataHome + "/vicav_dicts/dc_shawi_eng.xml"

# include those documents in validation which have the following status in //tei:revisionDesc/@status / ignore all others
VALIDATE_DOCS_WITH_STATUS = "done"

with saxonche.PySaxonProcessor(license=False) as proc:
    print(proc.version)
    proc.set_cwd(os.path.dirname(os.path.dirname(os.path.abspath(''))))
    print(proc.cwd)

SaxonC-HE 12.9 from Saxonica
Q:\basexshawi


In [3]:
def downloadAndStore(url, force=False):   
    #  filename of the file to be downloaded
    fn = os.path.basename(url)
    # fn w/o extension
    basename = os.path.splitext(fn)[0]
    # extension 
    ext = os.path.splitext(fn)[1]
    dlFilePath = tmpDir + "/" + fn
    if not os.path.exists(dlFilePath) and not force == True:
        payload = requests.get(url).content
        open(dlFilePath, 'wb').write(payload)
    return dlFilePath

In [4]:
def downloadAndUnzip(url):    
    #  filename of the file to be downloaded
    fn = os.path.basename(url)
    # fn w/o extension
    basename = os.path.splitext(fn)[0]
    # extension 
    ext = os.path.splitext(fn)[1]
    
    if ext != ".zip":
        return "not a zip archive"
    else:
        zipFilePath = downloadAndStore(url)
        # the path where the content should be extracted to
        targetPath = libDir + "/" + basename
        
        
        payload = requests.get(url).content
        open(zipFilePath, 'wb').write(payload)
        ZipFile(zipFilePath).extractall(path=targetPath)
    
    return targetPath

In [5]:
def transform(s, xsl, o, parameters=[]):
    # processor keeps files open on Windows and in doing so prevents moving or copying them
    try:
        with saxonche.PySaxonProcessor(license=False) as proc:
            proc.set_configuration_property("xi", "on")
            saxon = proc.new_xslt30_processor()
            for i in parameters:
                saxon.set_parameter(name=i, value=proc.make_string_value(parameters[i]))
            exec = saxon.compile_stylesheet(stylesheet_file=os.path.abspath(xsl))
            exec.apply_templates_returning_file(source_file=os.path.abspath(s), output_file=os.path.abspath(o))
            if exec.exception_occurred:
                exec.get_error_message
                #for i in range(saxon.exception_count()-1):
                print(saxon.get_error_message())
                print(os.path.abspath(s)+" - "+os.path.abspath(xsl)+" -> "+os.path.abspath(o)+" failed")
            if os.path.exists(os.path.abspath(o)):
                return o
            else: 
                print("there was an error transforming "+s+" with stylesheet "+xsl)
    except Exception as e:
        print("Python Exception during transform:", e)

### Install XSL based schematron validator

In [6]:
_schCompiler = None
def setupSchXSLT():
    global _schCompiler
    if _schCompiler is not None:
        return _schCompiler
    schDLURL = "https://codeberg.org/SchXslt/schxslt/releases/download/v1.10.1/schxslt-1.10.1-xslt-only.zip"
    schHome = downloadAndUnzip(schDLURL)
    _schCompiler = schHome + "/schxslt-1.10.1/2.0/pipeline-for-svrl.xsl"
    if os.path.exists(_schCompiler):
        return _schCompiler
    else: 
        print("error: something went wrong, cannot locate file '" + schCompiler + "'")

In [7]:
setupSchXSLT()

'lib/schxslt-1.10.1-xslt-only/schxslt-1.10.1/2.0/pipeline-for-svrl.xsl'

## Prepare rng2sch stylesheet

Returns path to the xsl that extracts schematron form the RelaxNG schema.
This should only run once as the file gets locked (by saxon) and so further attempts to pring it to the correct location will fail.

In [8]:
_rng2sch = None 
def setupRNG2Sch():
    global _rng2sch
    if _rng2sch is not None:
        return _rng2sch
    RNG2SchtrDL = "https://raw.githubusercontent.com/Schematron/schematron/master/trunk/converters/code/ToSchematron/ExtractSchFromRNG.xsl"
    dltmp = downloadAndStore(RNG2SchtrDL)
    # tweak XSLT 
    with open(dltmp, encoding='utf-8') as inputfile:
        lines = inputfile.read()
    lines = lines.replace( 'http://www.ascc.net/xml/schematron','http://purl.oclc.org/dsdl/schematron/')
    lines = lines.replace( '<sch:schema','<sch:schema queryBinding="xslt2"')
    
    with open(dltmp, 'w', encoding='utf-8') as file:
        file.writelines(lines)
    _rng2sch = libDir+"/"+os.path.basename(dltmp)
    os.replace(dltmp, _rng2sch)
    if os.path.exists(_rng2sch):
        return _rng2sch
    else:
        print("error: something went wrong, cannot locate file '" + newPath + "'")

In [9]:
setupRNG2Sch()

'lib/ExtractSchFromRNG.xsl'

## Extract schematron from RNG and transform to XSLT

In [10]:
def extractSchematron(rng):
    """extracts a schematron document embedded in an rng schema"""
    print("extracting Schematron document from "+rng)
    rng2sch = setupRNG2Sch()
    sch = tmpDir + "/" + os.path.basename(rng) + ".sch"
    if not os.path.exists(sch):
        transform(rng, rng2sch, sch)
    return sch

In [11]:
def compileSchematron(sch):
    """compiles a schematron document to an XSLT stylesheet"""
    outputPath = tmpDir + "/" + os.path.basename(sch) + ".xsl"
    schCompiler = setupSchXSLT()
    transform(sch, schCompiler, outputPath)
    
    if os.path.exists(outputPath):
        print(outputPath)
        return outputPath
    else: 
        print("error: something went wrong, cannot locate file '" + outputPath + "'")

In [12]:
sch = extractSchematron(rngSchema)
schXSL = compileSchematron(sch)
dictSch = extractSchematron(dictRngSchema)
dictSchXSL = compileSchematron(dictSch)

extracting Schematron document from ../802_tei_odd/out/shawi_corpus.rng
tmp/shawi_corpus.rng.sch.xsl
extracting Schematron document from ../802_tei_odd/out/shawi_dict.rng
tmp/shawi_dict.rng.sch.xsl


## Run schematron and relaxNG on files

In [13]:
validationErrors = []
ignoredFiles = []

In [14]:
def schValidate(schXSL, path):
    """validates a document (at path) against an XSL compiled schematron schema (at schXSL)"""
    errs = []
    out = tmpDir + "/validationReports/" + os.path.basename(path)
    try:
        transform(path, schXSL, out)

    except saxonche.PySaxonApiError as e:
        # We swallow parsing errors here as they should have been reported by lxml already
        print("an error occured while running schValidate on "+path)
        return []
    
    report = etree.parse(out)
    successfulReport = report.findall("{http://purl.oclc.org/dsdl/svrl}successful-report")
    failedAssert = report.findall("{http://purl.oclc.org/dsdl/svrl}failed-assert")
    doc = etree.parse(path)
    for s in successfulReport + failedAssert:
        XPath = s.attrib['location'].replace('Q{http://www.tei-c.org/ns/1.0}','tei:').replace('Q{}','')
        # DEBUG
        node = doc.xpath(XPath, namespaces=nss)[0]
        if type(node) is etree._ElementUnicodeResult:
            elt = node.getparent()
        else:
            elt = node

        msg = s.find("{http://purl.oclc.org/dsdl/svrl}text").text

        errObj = {
            "type" : "error",
            "message":  msg,
            "line" : elt.sourceline,
            "source": path,
            "location": XPath,
            #"text": elt.text, #checking text values (for @lemmaRef)
            "stage": "schematron",
            "exceptionType": str(s.tag).replace("{http://purl.oclc.org/dsdl/svrl}",""),
        }
        
        errs.append(errObj)
    return errs
    

In [15]:
def validate(path, rngSchema, schematronSchemaXSL):
    """Validate a document against the rngSchema. Returns a list of dicts of which each one represents a validation (or parsing) error."""
    validationErrors = []
    shutil.copyfile(sch, '../802_tei_odd/out/'+os.path.basename(sch)) #copy file from tmp folder into odd folder to be able to git it
    try:
        doc = etree.parse(path)
    
        # relaxng validation
        relaxng_doc = etree.parse(rngSchema)
        relaxng = etree.RelaxNG(relaxng_doc)
        relaxng.assertValid(doc)
        
        # schematron validation
        schErrs = schValidate(schematronSchemaXSL, path)
        if len(schErrs) >= 1:
            validationErrors = validationErrors + schErrs
    
    except etree.XMLSyntaxError as e:
        valErrObj = {
            "type" : "error",
            "message": str(e), 
            "line": e.lineno,
            "source": path, 
            "location": "n/a",
            "stage" : "parsing", 
            "exceptionType": type(e).__name__
        }
        return valErrObj
        
    except etree.DocumentInvalid as e:
        for error in e.error_log:
            # we ignore rng errors about @schemaLocation since 
            # that is needed for validation in the TEI-enricher
            if error.message != "Invalid attribute schemaLocation for element TEI":
                location = "n/a" if error.path is None else error.path
                valErrObj = {
                    "type" : "error",
                    "message": error.message, 
                    "line": error.line, 
                    "source": path, 
                    "location": location,
                    "stage" : "relaxng", 
                    "exceptionType": type(e).__name__
                }
                # DEBUG
                print(valErrObj)
                validationErrors.append(valErrObj)
        
        # if the document is invalid against the RNG, we still want to run schematron against it
        schErrs = schValidate(schematronSchemaXSL, path)
        if len(schErrs) >= 1:
            validationErrors = validationErrors + schErrs
        
        
    
    return validationErrors

In [16]:
def docStatus(path):
    """returns the status of the document at path; if the document can't be parsed, it returns a dict with the error"""
    try:
        doc = etree.parse(path)
        revisionDesc = doc.xpath("/tei:TEI/tei:teiHeader/tei:revisionDesc", namespaces=nss)[0]
        status = revisionDesc.attrib['status']
        return status
    # report documents which are not well-formed
    except etree.XMLSyntaxError as e:
        print(e)

In [17]:
def validateAndAppend(filepath, rngSchema, schXSL):
    global validationErrors, ignoredFiles
    print("validating " + filepath)
    results = validate(filepath, rngSchema, schXSL)
    print(results)
    len(results)
    if type(results) is list:
        res_errs = filter(lambda x: x['type'] == "error", results)
        res_ignored = filter(lambda x: x['type'] == "ignored", results)
        validationErrors = validationErrors + list(res_errs)
        print(f"{len(list(res_errs))} found / {len(validationErrors)} in total")
    elif type(results) is dict:
        if results['type'] == "ignored":
            ignoredFiles.append(results)
    else:
        print("unknown result type")
        print(results)

In [18]:
# preprorcess and remove all stubs
# validateAndAppend(dictionary, dictRngSchema, dictSchXSL)

In [19]:
for i in os.scandir(manannot):
    if i.name.startswith('Urfa') and i.name.endswith('.xml') and i.is_file():
        filename = os.path.basename(i)
        filepath = manannot + "/" + filename
        status = docStatus(filepath)
        print(filename, status)
        
        # if the document is not finished yet (//tei:revisionDesc/@status != DOCSTATUS_DONE), just ignore it
        if type(status) is str and status != VALIDATE_DOCS_WITH_STATUS:
            ignoredFiles.append({
                "source" : filepath,
                "type" : "ignored",
                "status": status
            })
        # if the document couldn't be parsed, docStatus() returns a dict 
        # with some error information which is appended to the list of 
        # validation errors
        elif type(status) is dict and status["type"] == "error":
            validationErrors.append(status) 
        
        # … otherwise try to validate the document
        else:
            validateAndAppend(filepath, rngSchema, schXSL)      

Urfa-000_Namrud-Harran-2001.xml done
validating ../010_manannot/Urfa-000_Namrud-Harran-2001.xml
[{'type': 'error', 'message': 'target of span should point at xml:id of parent annotationBlock element', 'line': 300, 'source': '../010_manannot/Urfa-000_Namrud-Harran-2001.xml', 'location': '/tei:TEI[1]/tei:text[1]/tei:body[1]/tei:div[1]/tei:div[1]/tei:spanGrp[1]/tei:span[1]', 'stage': 'schematron', 'exceptionType': 'failed-assert'}, {'type': 'error', 'message': 'target of span should point at xml:id of parent annotationBlock element', 'line': 331, 'source': '../010_manannot/Urfa-000_Namrud-Harran-2001.xml', 'location': '/tei:TEI[1]/tei:text[1]/tei:body[1]/tei:div[1]/tei:div[2]/tei:spanGrp[1]/tei:span[1]', 'stage': 'schematron', 'exceptionType': 'failed-assert'}, {'type': 'error', 'message': 'target of span should point at xml:id of parent annotationBlock element', 'line': 370, 'source': '../010_manannot/Urfa-000_Namrud-Harran-2001.xml', 'location': '/tei:TEI[1]/tei:text[1]/tei:body[1]/tei:

In [25]:
# validationErrors

In [21]:
def make_clickable(source, line=None):
    link = source.replace('../','https://github.com/acdh-oeaw/shawi-data/tree/main/')
    if line:
        return f'<a href="{link}#L{line}">{source}</a>'
    else:
        return f'<a href="{link}">{source}</a>'

In [22]:
if len(ignoredFiles) > 0:
    df_ignored = pd.DataFrame(data=ignoredFiles).T
    df_ignored = df_ignored.transpose()
    df_ignored
    ignoredReport = "tmp/ignoredFiles.html"
    df_ignored['link'] = df_ignored.apply(lambda x: make_clickable(x['source']), axis=1)

    with open(ignoredReport, 'w', encoding='utf-8') as f:
        f.write(df_ignored.to_html(render_links=True, escape=False))

In [23]:
if len(validationErrors) > 0:
    df_err = pd.DataFrame(data=validationErrors).T
    df_err = df_err.transpose()
    df_err
    print(f"found {len(validationErrors)} validation errors")
    #df_err = df_err[df_err["message"].str.contains("@lemmaRef is required when xml:lang is 'ar-acm")==False]
    #df_err = df_err[df_err["message"].str.contains("There is no entry in the dictionary for the")==True]
    errorReport = "tmp/validationReport.html"
    df_err['link'] = df_err.apply(lambda x: make_clickable(x['source'], x['line']), axis=1)
    with open(errorReport, 'w', encoding='utf-8') as f:
        f.write(df_err.to_html(render_links=True, escape=False))

found 2038 validation errors


In [24]:
df_err

Unnamed: 0,type,message,line,source,location,stage,exceptionType,link
0,error,target of span should point at xml:id of paren...,300,../010_manannot/Urfa-000_Namrud-Harran-2001.xml,/tei:TEI[1]/tei:text[1]/tei:body[1]/tei:div[1]...,schematron,failed-assert,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
1,error,target of span should point at xml:id of paren...,331,../010_manannot/Urfa-000_Namrud-Harran-2001.xml,/tei:TEI[1]/tei:text[1]/tei:body[1]/tei:div[1]...,schematron,failed-assert,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
2,error,target of span should point at xml:id of paren...,370,../010_manannot/Urfa-000_Namrud-Harran-2001.xml,/tei:TEI[1]/tei:text[1]/tei:body[1]/tei:div[1]...,schematron,failed-assert,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
3,error,target of span should point at xml:id of paren...,402,../010_manannot/Urfa-000_Namrud-Harran-2001.xml,/tei:TEI[1]/tei:text[1]/tei:body[1]/tei:div[1]...,schematron,failed-assert,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
4,error,target of span should point at xml:id of paren...,443,../010_manannot/Urfa-000_Namrud-Harran-2001.xml,/tei:TEI[1]/tei:text[1]/tei:body[1]/tei:div[1]...,schematron,failed-assert,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
...,...,...,...,...,...,...,...,...
2033,error,target of span should point at xml:id of paren...,4849,../010_manannot/Urfa-177_Wedding_in_Former_Tim...,/tei:TEI[1]/tei:text[1]/tei:body[1]/tei:div[1]...,schematron,failed-assert,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
2034,error,target of span should point at xml:id of paren...,4893,../010_manannot/Urfa-177_Wedding_in_Former_Tim...,/tei:TEI[1]/tei:text[1]/tei:body[1]/tei:div[1]...,schematron,failed-assert,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
2035,error,target of span should point at xml:id of paren...,4961,../010_manannot/Urfa-177_Wedding_in_Former_Tim...,/tei:TEI[1]/tei:text[1]/tei:body[1]/tei:div[1]...,schematron,failed-assert,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
2036,error,The 'who' attribute of u must point to a perso...,1254,../010_manannot/Urfa-177_Wedding_in_Former_Tim...,/tei:TEI[1]/tei:text[1]/tei:body[1]/tei:div[1]...,schematron,failed-assert,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
