# Validating finished Shawi Corpus Files

* Install & set up dependencies
* extract schematron from RNG and transform to XSLT
* run schematron-xslt on files

In [1]:
import io
import os
import requests
import pathlib
import re
import sys
import json
import pandas as pd
import linecache as lc
import shutil

from pathlib import Path
from urllib.parse import urlsplit
import saxonche
from zipfile import ZipFile
from lxml import isoschematron, etree

In [None]:
tmpDir = "tmp"
libDir = "lib"
os.makedirs(tmpDir, exist_ok=True)
os.makedirs(libDir, exist_ok=True)
nss = {"tei":"http://www.tei-c.org/ns/1.0"}
# the root of the git repository
dataHome = ".."

# rng schema
rngSchema = dataHome + "/main/802_tei_odd/out/shawi_corpus.rng"
# the path to the annotated TEI transcription files
manannot = dataHome + "/main/010_manannot"

# include those documents in validation which have the following status in //tei:revisionDesc/@status / ignore all others
VALIDATE_DOCS_WITH_STATUS = "done"

with saxonche.PySaxonProcessor(license=False) as proc:
# SaxonC 1.2.1 Python has many known bugs but isn't maintained anymore
# Many of the documented API specs are not working
    print(proc.version)
    proc.set_cwd(os.path.dirname(os.path.abspath('')))
    print(proc.cwd)

SaxonC-HE 12.4 from Saxonica
C:\Users\mrauschsupola\Shawi\shawi-data


In [3]:
def downloadAndStore(url, force=False):   
    #  filename of the file to be downloaded
    fn = os.path.basename(url)
    # fn w/o extension
    basename = os.path.splitext(fn)[0]
    # extension 
    ext = os.path.splitext(fn)[1]
    dlFilePath = tmpDir + "/" + fn
    if not os.path.exists(dlFilePath) and not force == True:
        payload = requests.get(url).content
        open(dlFilePath, 'wb').write(payload)
    return dlFilePath

In [4]:
def downloadAndUnzip(url):    
    #  filename of the file to be downloaded
    fn = os.path.basename(url)
    # fn w/o extension
    basename = os.path.splitext(fn)[0]
    # extension 
    ext = os.path.splitext(fn)[1]
    
    if ext != ".zip":
        return "not a zip archive"
    else:
        zipFilePath = downloadAndStore(url)
        # the path where the content should be extracted to
        targetPath = libDir + "/" + basename
        
        
        payload = requests.get(url).content
        open(zipFilePath, 'wb').write(payload)
        ZipFile(zipFilePath).extractall(path=targetPath)
    
    return targetPath

### Install XSL based schematron validator

In [5]:
_schCompiler = None
def setupSchXSLT():
    global _schCompiler
    if _schCompiler is not None:
        return _schCompiler
    schDLURL = "https://github.com/schxslt/schxslt/releases/download/v1.9.5/schxslt-1.9.5-xslt-only.zip"
    schHome = downloadAndUnzip(schDLURL)
    _schCompiler = schHome + "/schxslt-1.9.5/2.0/pipeline-for-svrl.xsl"
    if os.path.exists(_schCompiler):
        return _schCompiler
    else: 
        print("error: something went wrong, cannot locate file '" + schCompiler + "'")

In [6]:
setupSchXSLT()

'lib/schxslt-1.9.5-xslt-only/schxslt-1.9.5/2.0/pipeline-for-svrl.xsl'

In [7]:
def transform(s, xsl, o, parameters=[]):
    # processor keeps files open on Windows and in doing so prevents moving or copying them
    with saxonche.PySaxonProcessor(license=False) as proc:
        proc.set_configuration_property("xi", "on")
        saxon = proc.new_xslt30_processor()
        for i in parameters:
            saxon.set_parameter(name=i, value=proc.make_string_value(parameters[i]))
        exec = saxon.compile_stylesheet(stylesheet_file=os.path.abspath(xsl))
        exec.apply_templates_returning_file(source_file=os.path.abspath(s), output_file=os.path.abspath(o))
        if exec.exception_occurred:
            exec.get_error_message
            #for i in range(saxon.exception_count()-1):
            print(saxon.get_error_message())
            print(os.path.abspath(s)+" - "+os.path.abspath(xsl)+" -> "+os.path.abspath(o)+" failed")
        if os.path.exists(os.path.abspath(o)):
            return o
        else: 
            print("there was an error transforming "+s+" with stylesheet "+xsl)

In [8]:
def extractSchematron(rng):
    """extracts a schematron document embedded in an rng schema"""
    print("extracting Schematron document from "+rng)
    rng2sch = setupRNG2Sch()
    sch = tmpDir + "/" + os.path.basename(rng) + ".sch"
    if not os.path.exists(sch):
        transform(rng, rng2sch, sch)
    return sch

In [9]:
def compileSchematron(sch):
    """compiles a schematron document to an XSLT stylesheet"""
    outputPath = tmpDir + "/" + os.path.basename(sch) + ".xsl"
    schCompiler = setupSchXSLT()
    
    transform(sch, schCompiler, outputPath)
    if os.path.exists(outputPath):
        return outputPath
    else: 
        print("error: something went wrong, cannot locate file '" + outputPath + "'")

## Prepare rng2sch stylesheet

Returns path to the xsl that extracts schematron form the RelaxNG schema.
This should only run once as the file gets locked (by saxon) and so further attempts to pring it to the correct location will fail.

In [10]:
_rng2sch = None 
def setupRNG2Sch():
    global _rng2sch
    if _rng2sch is not None:
        return _rng2sch
    RNG2SchtrDL = "https://raw.githubusercontent.com/Schematron/schematron/master/trunk/converters/code/ToSchematron/ExtractSchFromRNG.xsl"
    dltmp = downloadAndStore(RNG2SchtrDL)
    # tweak XSLT 
    with open(dltmp, encoding='utf-8') as inputfile:
        lines = inputfile.read()
    lines = lines.replace( 'http://www.ascc.net/xml/schematron','http://purl.oclc.org/dsdl/schematron/')
    lines = lines.replace( '<sch:schema','<sch:schema queryBinding="xslt2"')
    
    with open(dltmp, 'w', encoding='utf-8') as file:
        file.writelines(lines)
    _rng2sch = libDir+"/"+os.path.basename(dltmp)
    os.replace(dltmp, _rng2sch)
    if os.path.exists(_rng2sch):
        return _rng2sch
    else:
        print("error: something went wrong, cannot locate file '" + newPath + "'")

In [11]:
setupRNG2Sch()

'lib/ExtractSchFromRNG.xsl'

In [12]:
def schValidate(sch, path):
    """validates a document (at path) against schematron schema (at sch)"""
    errs = []
    out = tmpDir + "/validationReports/" + os.path.basename(path)
    xsl = compileSchematron(sch)
    try:
        transform(path, xsl, out)

    except saxonche.PySaxonApiError as e:
        # We swallow parsing errors here as they should have been reported by lxml already
        print("an error occured while running schValidate on "+path)
        return []
    
    report = etree.parse(out)
    successfulReport = report.findall("{http://purl.oclc.org/dsdl/svrl}successful-report")
    failedAssert = report.findall("{http://purl.oclc.org/dsdl/svrl}failed-assert")
    doc = etree.parse(path)
    for s in successfulReport + failedAssert:
        XPath = s.attrib['location'].replace('Q{http://www.tei-c.org/ns/1.0}','tei:').replace('Q{}','')
        # DEBUG
        node = doc.xpath(XPath, namespaces=nss)[0]
        if type(node) is etree._ElementUnicodeResult:
            elt = node.getparent()
        else:
            elt = node

        msg = s.find("{http://purl.oclc.org/dsdl/svrl}text").text

        errObj = {
            "type" : "error",
            "message":  msg,
            "line" : elt.sourceline,
            "source": path,
            "location": XPath,
            #"text": elt.text, #checking text values (for @lemmaRef)
            "stage": "schematron",
            "exceptionType": str(s.tag).replace("{http://purl.oclc.org/dsdl/svrl}",""),
        }
        
        errs.append(errObj)
    return errs
    

In [13]:
def validate(path, rngSchema):
    """Validate a document against the rngSchema. Returns a list of dicts of which each one represents a validation (or parsing) error."""
    validationErrors = []
    sch = extractSchematron(rngSchema)
    shutil.copyfile(sch, '../802_tei_odd/out/'+os.path.basename(sch)) #copy file from tmp folder into odd folder to be able to git it
    try:
        doc = etree.parse(path)
    
        # relaxng validation
        relaxng_doc = etree.parse(rngSchema)
        relaxng = etree.RelaxNG(relaxng_doc)
        relaxng.assertValid(doc)
        
        # schematron validation
        schErrs = schValidate(sch, path)
        if len(schErrs) >= 1:
            validationErrors = validationErrors + schErrs
    
    except etree.XMLSyntaxError as e:
        valErrObj = {
            "type" : "error",
            "message": str(e), 
            "line": e.lineno,
            "source": path, 
            "location": "n/a",
            "stage" : "parsing", 
            "exceptionType": type(e).__name__
        }
        return valErrObj
        
    except etree.DocumentInvalid as e:
        for error in e.error_log:
            # we ignore rng errors about @schemaLocation since 
            # that is needed for validation in the TEI-enricher
            if error.message != "Invalid attribute schemaLocation for element TEI":
                location = "n/a" if error.path is None else error.path
                valErrObj = {
                    "type" : "error",
                    "message": error.message, 
                    "line": error.line, 
                    "source": path, 
                    "location": location,
                    "stage" : "relaxng", 
                    "exceptionType": type(e).__name__
                }
                # DEBUG
                print(valErrObj)
                validationErrors.append(valErrObj)
        
        # if the document is invalid against the RNG, we still want to run schematron against it
        schErrs = schValidate(sch, path)
        if len(schErrs) >= 1:
            validationErrors = validationErrors + schErrs
        
        
    
    return validationErrors

In [14]:
validationErrors = []
ignoredFiles = []

In [15]:
def docStatus(path):
    """returns the status of the document at path; if the document can't be parsed, it returns a dict with the error"""
    try:
        doc = etree.parse(path)
        revisionDesc = doc.xpath("/tei:TEI/tei:teiHeader/tei:revisionDesc", namespaces=nss)[0]
        status = revisionDesc.attrib['status']
        return status
    # report documents which are not well-formed
    except etree.XMLSyntaxError as e:
        print(e)

In [16]:
for i in os.scandir(manannot):
    if i.name.startswith('Urfa') and i.name.endswith('.xml') and i.is_file():
        filename = os.path.basename(i)
        filepath = manannot + "/" + filename
        status = docStatus(filepath)
        print(filename, status)
        
        # if the document is not finished yet (//tei:revisionDesc/@status != DOCSTATUS_DONE), just ignore it
        if type(status) is str and status != VALIDATE_DOCS_WITH_STATUS:
            ignoredFiles.append({
                "source" : filepath,
                "type" : "ignored",
                "status": status
            })
        # if the document couldn't be parsed, docStatus() returns a dict 
        # with some error information which is appended to the list of 
        # validation errors
        elif type(status) is dict and status["type"] == "error":
            validationErrors.append(status) 
        
        # … otherwise try to validate the document
        else:
            print("validating " + filepath)
            results = validate(filepath, rngSchema)
            print(results)
            len(results)
            if type(results) is list:
                res_errs = filter(lambda x: x['type'] == "error", results)
                res_ignored = filter(lambda x: x['type'] == "ignored", results)
                validationErrors = validationErrors + list(res_errs)
                print(f"{len(list(res_errs))} found / {len(validationErrors)} in total")
            elif type(results) is dict:
                if results['type'] == "ignored":
                    ignoredFiles.append(results)
            else:
                print("unknown result type")
                print(results)
            

Urfa-000_Cigkofte-Harran-2001.xml generated
Urfa-002_1_Joke_about_a_tribe.xml done
validating ../010_manannot/Urfa-002_1_Joke_about_a_tribe.xml
extracting Schematron document from ../802_tei_odd/out/shawi_corpus.rng
{'type': 'error', 'message': 'Element span has extra content: text', 'line': 398, 'source': '../010_manannot/Urfa-002_1_Joke_about_a_tribe.xml', 'location': '/*/*[2]/*[2]/*/*[7]/*[2]/*', 'stage': 'relaxng', 'exceptionType': 'DocumentInvalid'}
[{'type': 'error', 'message': 'Element span has extra content: text', 'line': 398, 'source': '../010_manannot/Urfa-002_1_Joke_about_a_tribe.xml', 'location': '/*/*[2]/*[2]/*/*[7]/*[2]/*', 'stage': 'relaxng', 'exceptionType': 'DocumentInvalid'}]
0 found / 1 in total
Urfa-002_2_Blood_feud_in_the_past.xml generated
Urfa-002_3_The-incomplete_meal.xml generated
Urfa-002_4_The_bull_in_the_jar.xml generated
Urfa-011_Cemetry-Harran-2010.xml done
validating ../010_manannot/Urfa-011_Cemetry-Harran-2010.xml
extracting Schematron document from ../

In [17]:
def make_clickable(source, line=None):
    link = source.replace('../','https://github.com/acdh-oeaw/shawi-data/tree/main/')
    if line:
        return f'<a href="{link}#L{line}">{source}</a>'
    else:
        return f'<a href="{link}">{source}</a>'

In [18]:
if len(ignoredFiles) > 0:
    df_ignored = pd.DataFrame(data=ignoredFiles).T
    df_ignored = df_ignored.transpose()
    df_ignored
    ignoredReport = "tmp/ignoredFiles.html"
    df_ignored['link'] = df_ignored.apply(lambda x: make_clickable(x['source']), axis=1)

    with open(ignoredReport, 'w', encoding='utf-8') as f:
        f.write(df_ignored.to_html(render_links=True, escape=False))

In [21]:
if len(validationErrors) > 0:
    df_err = pd.DataFrame(data=validationErrors).T
    df_err = df_err.transpose()
    df_err
    print(f"found {len(validationErrors)} validation errors")
    #df_err = df_err[df_err["message"].str.contains("@lemmaRef is required when xml:lang is 'ar-acm")==False]
    #df_err = df_err[df_err["message"].str.contains("There is no entry in the dictionary for the")==True]
    errorReport = "tmp/validationReport.html"
    df_err['link'] = df_err.apply(lambda x: make_clickable(x['source'], x['line']), axis=1)
    with open(errorReport, 'w', encoding='utf-8') as f:
        f.write(df_err.to_html(render_links=True, escape=False))

found 29 validation errors


In [22]:
df_err

Unnamed: 0,type,message,line,source,location,stage,exceptionType,link
0,error,Element span has extra content: text,398,../010_manannot/Urfa-002_1_Joke_about_a_tribe.xml,/*/*[2]/*[2]/*/*[7]/*[2]/*,relaxng,DocumentInvalid,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
1,error,Element w failed to validate attributes,1250,../010_manannot/Urfa-014_Village_of_Qoran-Harr...,/*/*[2]/*[2]/*/*[19]/*[1]/*[41],relaxng,DocumentInvalid,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
2,error,There is no entry in the dictionary for the cu...,1525,../010_manannot/Urfa-024b_Vegetabels_and_Seaso...,/tei:TEI[1]/tei:text[1]/tei:body[1]/tei:div[1]...,schematron,failed-assert,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
3,error,Element span has extra content: text,329,../010_manannot/Urfa-032_Ali-Harran-2010.xml,/*/*[2]/*[2]/*/*[3]/*[2]/*,relaxng,DocumentInvalid,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
4,error,Element span has extra content: text,972,../010_manannot/Urfa-032_Ali-Harran-2010.xml,/*/*[2]/*[2]/*/*[20]/*[2]/*,relaxng,DocumentInvalid,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
5,error,Element span has extra content: text,1040,../010_manannot/Urfa-032_Ali-Harran-2010.xml,/*/*[2]/*[2]/*/*[22]/*[2]/*,relaxng,DocumentInvalid,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
6,error,Element span has extra content: text,1080,../010_manannot/Urfa-032_Ali-Harran-2010.xml,/*/*[2]/*[2]/*/*[23]/*[2]/*,relaxng,DocumentInvalid,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
7,error,Element w failed to validate attributes,1668,../010_manannot/Urfa-032_Ali-Harran-2010.xml,/*/*[2]/*[2]/*/*[40]/*[1]/*[22],relaxng,DocumentInvalid,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
8,error,Element w failed to validate attributes,1673,../010_manannot/Urfa-032_Ali-Harran-2010.xml,/*/*[2]/*[2]/*/*[40]/*[1]/*[27],relaxng,DocumentInvalid,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
9,error,Invalid attribute when for element change,134,../010_manannot/Urfa-054_Real_Friend-Harran-20...,/*/*[1]/*[4]/*,relaxng,DocumentInvalid,"<a href=""https://github.com/acdh-oeaw/shawi-da..."
