Skip to content

Commit

Permalink
Use Exificient to support EXI format for infosets
Browse files Browse the repository at this point in the history
Demonstrates how to use Exificient's EXI library with Daffodil by using
SAX.  Adds support to the daffodil-cli tool to parse and unparse EXI
infosets.
  • Loading branch information
alexanderrevello authored and jadams-tresys committed Jul 23, 2022
1 parent 497e310 commit c521ae3
Show file tree
Hide file tree
Showing 11 changed files with 141 additions and 28 deletions.
1 change: 1 addition & 0 deletions build.sbt
Expand Up @@ -112,6 +112,7 @@ lazy val cli = Project("daffodil-cli", file("daffodil-cli")).config
.dependsOn(tdmlProc, runtime2, sapi, japi, schematron % Runtime, udf % "it->test") // causes runtime2/sapi/japi to be pulled into the helper zip/tar
.settings(commonSettings, nopublish)
.settings(libraryDependencies ++= Dependencies.cli)
.settings(libraryDependencies ++= Dependencies.exi)

lazy val udf = Project("daffodil-udf", file("daffodil-udf")).configs(IntegrationTest)
.settings(commonSettings)
Expand Down
@@ -0,0 +1 @@
�����^[\K���@�L��c����e��
@@ -0,0 +1 @@
�����^[\K���@�Lpt�V���
Expand Up @@ -1300,6 +1300,28 @@ class TestCLIparsing {
}
}

/**
 * Pipes the text "Hello" into the CLI `parse -I exi` subcommand (root e1 of
 * generalSchema.dfdl.xsd), sends the resulting infoset through md5sum, and
 * expects a fixed digest plus a successful exit code.
 */
@Test def test_XXX_CLI_Parsing_SimpleParse_exi(): Unit = {
  val rawSchemaPath = Util.daffodilPath("daffodil-test/src/test/resources/org/apache/daffodil/section00/general/generalSchema.dfdl.xsd")
  // The schema path goes through Util.cmdConvert only when running on Windows.
  val schemaArg =
    if (Util.isWindows) Util.cmdConvert(rawSchemaPath)
    else rawSchemaPath

  val session = Util.start("")

  try {
    // The infoset is not matched directly; its md5 digest is compared instead.
    val template = Util.echoN("Hello") + "| %s parse -I exi -s %s -r e1 | md5sum"
    session.sendLine(String.format(template, Util.binPath, schemaArg))
    session.expect(contains("937b3f96ee0b5cd1ac9f537cf8ddc580"))

    Util.expectExitCode(ExitCode.Success, session)

    // Leave the interactive shell and wait for its output stream to end.
    session.send("exit\n")
    session.expect(eof)
  } finally {
    session.close()
  }
}

@Test def test_CLI_Error_Return_Codes(): Unit = {

val shell = Util.start("")
Expand Down
Expand Up @@ -68,6 +68,27 @@ class TestCLIPerformance {
}
}

/**
 * Runs the CLI `performance` subcommand in parse mode with the EXI infoset
 * type (2 files, 2 threads, root "matrix") and expects both timing summary
 * lines to be printed and the command to exit successfully.
 */
@Test def test_XXX_CLI_Performance_2_Threads_2_Times_exi(): Unit = {
  val schemaPath = Util.daffodilPath("daffodil-test/src/test/resources/org/apache/daffodil/section06/entities/charClassEntities.dfdl.xsd")
  val dataPath = Util.daffodilPath("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input1.txt")
  // Both paths go through Util.cmdConvert only when running on Windows.
  val onWindows = Util.isWindows
  val schemaArg = if (onWindows) Util.cmdConvert(schemaPath) else schemaPath
  val dataArg = if (onWindows) Util.cmdConvert(dataPath) else dataPath

  val session = Util.start("")

  try {
    session.sendLine(
      String.format("%s performance -I exi -N 2 -t 2 -s %s -r matrix %s", Util.binPath, schemaArg, dataArg))

    // The two summary lines are expected in this order.
    Seq("total parse time (sec):", "avg rate (files/sec):")
      .foreach(marker => session.expect(contains(marker)))

    Util.expectExitCode(ExitCode.Success, session)
    session.sendLine("exit")
    session.expect(eof())
  } finally {
    session.close()
  }
}

@Test def test_3394_CLI_Performance_3_Threads_20_Times(): Unit = {
val schemaFile = Util.daffodilPath("daffodil-test/src/test/resources/org/apache/daffodil/section06/entities/charClassEntities.dfdl.xsd")
val inputFile = Util.daffodilPath("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input1.txt")
Expand Down Expand Up @@ -175,6 +196,27 @@ class TestCLIPerformance {
}
}

/**
 * Runs the CLI `performance --unparse` subcommand with the EXI infoset type
 * (2 files, 2 threads, root e3) against the input14.exi fixture and expects
 * both timing summary lines to be printed and the command to exit successfully.
 */
@Test def test_XXX_CLI_Performance_Unparse_2_Threads_2_Times_exi(): Unit = {
  val schemaPath = Util.daffodilPath("daffodil-test/src/test/resources/org/apache/daffodil/section00/general/generalSchema.dfdl.xsd")
  val infosetPath = Util.daffodilPath("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input14.exi")
  // Both paths go through Util.cmdConvert only when running on Windows.
  val onWindows = Util.isWindows
  val schemaArg = if (onWindows) Util.cmdConvert(schemaPath) else schemaPath
  val infosetArg = if (onWindows) Util.cmdConvert(infosetPath) else infosetPath

  val session = Util.start("")

  try {
    val command = String.format(
      "%s performance --unparse -I exi -N 2 -t 2 -s %s -r e3 %s", Util.binPath, schemaArg, infosetArg)
    session.sendLine(command)

    // The two summary lines are expected in this order.
    Seq("total unparse time (sec):", "avg rate (files/sec):")
      .foreach(marker => session.expect(contains(marker)))

    Util.expectExitCode(ExitCode.Success, session)
    session.sendLine("exit")
    session.expect(eof())
  } finally {
    session.close()
  }
}

@Test def test_XXX_CLI_Performance_Unparse_2_Threads_2_Times_null(): Unit = {
val schemaFile = Util.daffodilPath("daffodil-test/src/test/resources/org/apache/daffodil/section00/general/generalSchema.dfdl.xsd")
val inputFile = Util.daffodilPath("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input14.txt")
Expand Down
Expand Up @@ -606,6 +606,28 @@ class TestCLIunparsing {
}
}

/**
 * Unparses the EXI infoset fixture input18.exi with the CLI `unparse -I exi`
 * subcommand (root e1 of generalSchema.dfdl.xsd) and expects the unparsed
 * data to contain "Hello" and the command to exit successfully.
 */
@Test def test_xxxx_CLI_Unparsing_SimpleUnparse_exi(): Unit = {

  val schemaFile = Util.daffodilPath("daffodil-test/src/test/resources/org/apache/daffodil/section00/general/generalSchema.dfdl.xsd")
  val inputFile = Util.daffodilPath("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input18.exi")
  // Both paths go through Util.cmdConvert only when running on Windows.
  val (testSchemaFile, testInputFile) = if (Util.isWindows) (Util.cmdConvert(schemaFile), Util.cmdConvert(inputFile)) else (schemaFile, inputFile)

  val shell = Util.start("")

  try {
    val cmd = String.format("%s unparse -I exi -s %s --root e1 %s", Util.binPath, testSchemaFile, testInputFile)
    shell.sendLine(cmd)
    shell.expect(contains("Hello"))

    Util.expectExitCode(ExitCode.Success, shell)
    shell.send("exit\n")
    shell.expect(eof)
  } finally {
    // Close the shell exactly once, whether or not the expectations above
    // passed. The extra close() that used to end the try block made the
    // success path close the shell twice.
    shell.close()
  }
}

@Test def test_xxxx_CLI_Unparsing_SimpleUnparse_null(): Unit = {

val schemaFile = Util.daffodilPath("daffodil-test/src/test/resources/org/apache/daffodil/section00/general/generalSchema.dfdl.xsd")
Expand Down
56 changes: 39 additions & 17 deletions daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala
Expand Up @@ -46,10 +46,15 @@ import org.rogach.scallop.ArgType
import org.rogach.scallop.ScallopOption
import org.rogach.scallop.ValueConverter
import org.xml.sax.XMLReader
import org.xml.sax.ContentHandler
import org.apache.logging.log4j.Level
import org.apache.logging.log4j.core.config.Configurator
import org.apache.commons.io.IOUtils
import org.apache.commons.io.output.NullOutputStream
import com.siemens.ct.exi.core.helpers.DefaultEXIFactory
import com.siemens.ct.exi.core.FidelityOptions
import com.siemens.ct.exi.main.api.sax.EXIResult
import com.siemens.ct.exi.main.api.sax.EXISource
import org.apache.daffodil.Main.ExitCode
import org.apache.daffodil.api.DFDL
import org.apache.daffodil.api.DFDL.DaffodilUnparseErrorSAXException
Expand Down Expand Up @@ -119,6 +124,7 @@ object InfosetType extends Enumeration {
val SCALA_XML = Value("scala-xml")
val W3CDOM = Value("w3cdom")
val XML = Value("xml")
val EXI = Value("exi")
val NULL = Value("null")
}

Expand Down Expand Up @@ -304,7 +310,7 @@ class CLIConf(arguments: Array[String]) extends scallop.ScallopConf(arguments) {

val config = opt[File](short = 'c', argName = "file", descr = "XML file containing configuration items")
val vars = props[String](name = 'D', keyName = "variable", valueName = "value", descr = "Variables to be used when parsing. Can be prefixed with {namespace}.")
val infosetType = opt[InfosetType.Type](short = 'I', argName = "infoset_type", descr = "Infoset type to output. Use 'xml', 'scala-xml', 'json', 'jdom', 'w3cdom', 'sax', or 'null'. Defaults to 'xml'.", default = Some(InfosetType.XML))
val infosetType = opt[InfosetType.Type](short = 'I', argName = "infoset_type", descr = "Infoset type to output. Use 'xml', 'scala-xml', 'json', 'jdom', 'w3cdom', 'sax', 'exi', or 'null'. Defaults to 'xml'.", default = Some(InfosetType.XML))
val output = opt[String](argName = "file", descr = "Output file to write infoset to. If not given or is -, infoset is written to stdout.")
val parser = opt[File](short = 'P', argName = "file", descr = "Previously saved parser to reuse")
val path = opt[String](argName = "path", descr = "Path from root element to node from which to start parsing", hidden = true)
Expand All @@ -330,6 +336,10 @@ class CLIConf(arguments: Array[String]) extends scallop.ScallopConf(arguments) {
case _ => Right(Unit)
}

validateOpt(infosetType, stream) {
case (Some(InfosetType.EXI), Some(true)) => Left("Streaming mode is not currently supported with EXI infosets.")
case _ => Right(Unit)
}
}

// Unparse Subcommand Options
Expand All @@ -348,7 +358,7 @@ class CLIConf(arguments: Array[String]) extends scallop.ScallopConf(arguments) {

val config = opt[File](short = 'c', argName = "file", descr = "XML file containing configuration items")
val vars = props[String](name = 'D', keyName = "variable", valueName = "value", descr = "Variables to be used when parsing. Can be prefixed with {namespace}.")
val infosetType = opt[InfosetType.Type](short = 'I', argName = "infoset_type", descr = "Infoset type to unparse. Use 'xml', 'scala-xml', 'json', 'jdom', 'w3cdom', 'sax', or 'null'. Defaults to 'xml'.", default = Some(InfosetType.XML))
val infosetType = opt[InfosetType.Type](short = 'I', argName = "infoset_type", descr = "Infoset type to unparse. Use 'xml', 'scala-xml', 'json', 'jdom', 'w3cdom', 'sax', 'exi', or 'null'. Defaults to 'xml'.", default = Some(InfosetType.XML))
val output = opt[String](argName = "file", descr = "Output file to write data to. If not given or is -, data is written to stdout.")
val parser = opt[File](short = 'P', argName = "file", descr = "Previously saved parser to reuse")
val path = opt[String](argName = "path", descr = "Path from root element to node from which to start unparsing", hidden = true)
Expand Down Expand Up @@ -435,7 +445,7 @@ class CLIConf(arguments: Array[String]) extends scallop.ScallopConf(arguments) {

val config = opt[File](short = 'c', argName = "file", descr = "XML file containing configuration items")
val vars = props[String](name = 'D', keyName = "variable", valueName = "value", descr = "Variables to be used when parsing. Can be prefixed with {namespace}.")
val infosetType = opt[InfosetType.Type](short = 'I', argName = "infoset_type", descr = "Infoset type to output or unparse. Use 'xml', 'scala-xml', 'json', 'jdom', 'w3cdom', 'sax', or 'null'. Defaults to 'xml'.", default = Some(InfosetType.XML))
val infosetType = opt[InfosetType.Type](short = 'I', argName = "infoset_type", descr = "Infoset type to output or unparse. Use 'xml', 'scala-xml', 'json', 'jdom', 'w3cdom', 'sax', 'exi', or 'null'. Defaults to 'xml'.", default = Some(InfosetType.XML))
val number = opt[Int](short = 'N', argName = "number", default = Some(1), descr = "Total number of files to process. Defaults to 1.")
val parser = opt[File](short = 'P', argName = "file", descr = "Previously saved parser to reuse")
val path = opt[String](argName = "path", descr = "Path from root element to node from which to start parsing or unparsing", hidden = true)
Expand Down Expand Up @@ -676,14 +686,21 @@ object Main {
val blobSuffix = ".bin"

def getInfosetOutputter(infosetType: InfosetType.Type, os: java.io.OutputStream)
: Either[InfosetOutputter, DaffodilParseOutputStreamContentHandler] = {
: Either[InfosetOutputter, ContentHandler] = {
val outputter = infosetType match {
case InfosetType.XML => Left(new XMLTextInfosetOutputter(os, pretty = true))
case InfosetType.SCALA_XML => Left(new ScalaXMLInfosetOutputter())
case InfosetType.JSON => Left(new JsonInfosetOutputter(os, pretty = true))
case InfosetType.JDOM => Left(new JDOMInfosetOutputter())
case InfosetType.W3CDOM => Left(new W3CDOMInfosetOutputter())
case InfosetType.SAX => Right(new DaffodilParseOutputStreamContentHandler(os, pretty = true))
case InfosetType.SAX => Right(new DaffodilParseOutputStreamContentHandler(os, pretty=true))
case InfosetType.EXI => {
val exiFactory = DefaultEXIFactory.newInstance()
exiFactory.getFidelityOptions.setFidelity(FidelityOptions.FEATURE_PREFIX, true)
val exiResult = new EXIResult()
exiResult.setOutputStream(os)
Right(exiResult.getHandler)
}
case InfosetType.NULL => Left(new NullInfosetOutputter())
}
if (outputter.isLeft) {
Expand Down Expand Up @@ -723,7 +740,7 @@ object Main {
*/
def infosetDataToInputterData(infosetType: InfosetType.Type, data: Either[Array[Byte],InputStream]): AnyRef = {
infosetType match {
case InfosetType.XML | InfosetType.JSON | InfosetType.SAX => data match {
case InfosetType.XML | InfosetType.JSON | InfosetType.SAX | InfosetType.EXI => data match {
case Left(bytes) => bytes
case Right(is) => is
}
Expand Down Expand Up @@ -810,14 +827,14 @@ object Main {
val tl = anyRef.asInstanceOf[ThreadLocal[org.w3c.dom.Document]]
Left(new W3CDOMInfosetInputter(tl.get))
}
case InfosetType.EXI | InfosetType.SAX => {
val dp = processor
Right(dp.newContentHandlerInstance(outChannel))
}
case InfosetType.NULL => {
val events = anyRef.asInstanceOf[Array[NullInfosetInputter.Event]]
Left(new NullInfosetInputter(events))
}
case InfosetType.SAX => {
val dp = processor
Right(dp.newContentHandlerInstance(outChannel))
}
}
}

Expand Down Expand Up @@ -887,8 +904,6 @@ object Main {

val parseResult = eitherOutputterOrHandler match {
case Right(saxContentHandler) =>
// reset in case we are streaming
saxContentHandler.reset()
Timer.getResult("parsing",
parseWithSAX(processor, inStream, saxContentHandler))
case Left(outputter) =>
Expand Down Expand Up @@ -1089,7 +1104,7 @@ object Main {
case bytes: Array[Byte] => new ByteArrayInputStream(bytes)
case is: InputStream => is
}
unparseWithSAX(is, contentHandler)
unparseWithSAX(is, contentHandler, infosetType)
}
})
case Right(data) => Timer.getTimeResult({
Expand Down Expand Up @@ -1218,7 +1233,7 @@ object Main {
case bytes: Array[Byte] => new ByteArrayInputStream(bytes)
case is: InputStream => is
}
Timer.getResult("unparsing", unparseWithSAX(is, contentHandler))
Timer.getResult("unparsing", unparseWithSAX(is, contentHandler, unparseOpts.infosetType.toOption.get))
}

displayDiagnostics(unparseResult)
Expand Down Expand Up @@ -1416,8 +1431,15 @@ object Main {

private def unparseWithSAX(
is: InputStream,
contentHandler: DFDL.DaffodilUnparseContentHandler): UnparseResult = {
val xmlReader = DaffodilSAXParserFactory().newSAXParser.getXMLReader
contentHandler: DFDL.DaffodilUnparseContentHandler,
infosetType: InfosetType.Type): UnparseResult = {
val xmlReader = infosetType match {
case InfosetType.EXI => {
val exiSource = new EXISource()
exiSource.getXMLReader
}
case _ => DaffodilSAXParserFactory().newSAXParser.getXMLReader
}
xmlReader.setContentHandler(contentHandler)
xmlReader.setFeature(XMLUtils.SAX_NAMESPACES_FEATURE, true)
xmlReader.setFeature(XMLUtils.SAX_NAMESPACE_PREFIXES_FEATURE, true)
Expand All @@ -1434,7 +1456,7 @@ object Main {
private def parseWithSAX(
processor: DFDL.DataProcessor,
data: InputSourceDataInputStream,
saxContentHandler: DaffodilParseOutputStreamContentHandler): ParseResult = {
saxContentHandler: ContentHandler): ParseResult = {
val saxXmlRdr = processor.newXMLReaderInstance
saxXmlRdr.setContentHandler(saxContentHandler)
saxXmlRdr.setProperty(XMLUtils.DAFFODIL_SAX_URN_BLOBDIRECTORY, blobDir)
Expand Down
Expand Up @@ -20,11 +20,11 @@ import org.xml.sax.InputSource

import java.net.URI
import scala.xml.Node
import java.io.FileInputStream
import org.apache.daffodil.xml.XMLUtils
import org.apache.commons.io.input.XmlStreamReader

import java.io.File
import java.io.FileInputStream
import java.nio.file.Paths
import org.apache.daffodil.exceptions.Assert
import org.apache.daffodil.equality._
Expand Down
Expand Up @@ -68,28 +68,24 @@ class DaffodilParseOutputStreamContentHandler(out: OutputStream, pretty: Boolean
// if the top of the stack is true, we have guessed we should output a newline
private def outputNewline: Boolean = outputNewlineStack.top

def reset(): Unit = {
override def setDocumentLocator(locator: Locator): Unit = {
// do nothing
}

override def startDocument(): Unit = {
resetIndentation()
writer.flush()
activePrefixMapping = null
currentElementPrefixMapping = null
activePrefixMappingContextStack.clear()
outputNewlineStack.clear()
outputNewlineStack.push(false) //to match initialization state
out.flush()
}

override def setDocumentLocator(locator: Locator): Unit = {
// do nothing
}

override def startDocument(): Unit = {
writer.write("""<?xml version="1.0" encoding="UTF-8"?>""")
}

override def endDocument(): Unit = {
writer.write(System.lineSeparator())
writer.flush()
out.flush()
}

override def startPrefixMapping(prefix: String, uri: String): Unit = {
Expand Down
4 changes: 4 additions & 0 deletions project/Dependencies.scala
Expand Up @@ -58,4 +58,8 @@ object Dependencies {
lazy val schematron = Seq(
"net.sf.saxon" % "Saxon-HE" % "11.3",
)

// Exificient, the EXI (Efficient XML Interchange) library that the
// daffodil-cli project pulls in for the 'exi' infoset type.
lazy val exi = Seq(
"com.siemens.ct.exi" % "exificient" % "1.0.4",
)
}
2 changes: 2 additions & 0 deletions project/Rat.scala
Expand Up @@ -70,10 +70,12 @@ object Rat {
file("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input12.txt"),
file("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input13.txt"),
file("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input14.txt"),
file("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input14.exi"),
file("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input15.txt"),
file("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input16.txt"),
file("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input18.json"),
file("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input18.txt"),
file("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input18.exi"),
file("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/input19.txt"),
file("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/inputBig1M.txt"),
file("daffodil-cli/src/it/resources/org/apache/daffodil/CLI/input/prefix.txt"),
Expand Down

0 comments on commit c521ae3

Please sign in to comment.