-
Notifications
You must be signed in to change notification settings - Fork 1
/
XmlParser.scala
158 lines (136 loc) · 6.99 KB
/
XmlParser.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
package io.github.agolovenko.avro.xml
import io.github.agolovenko.avro.PathEntry.{ArrayEntry, FieldEntry}
import io.github.agolovenko.avro._
import org.apache.avro.Schema
import org.apache.avro.Schema.Type._
import org.apache.avro.generic.GenericData
import scala.jdk.CollectionConverters._
import scala.util.Try
import scala.xml._
class XmlParser(
schema: Schema,
stringParsers: PartialFunction[ParserContext, Any] = PartialFunction.empty,
validations: PartialFunction[ValidationContext, Unit] = PartialFunction.empty,
renameRules: RenameRules = RenameRules.empty
) extends AbstractParser[Elem](stringParsers, validations) {
/** Parses the root XML element into an Avro record.
  *
  * Parsing starts from an empty [[Path]]. The root schema has to be of type RECORD and the
  * element label has to equal the schema name.
  *
  * @param data root XML element
  * @return the record produced by `readAny` for the root schema
  * @throws ParserException if the root schema is not a RECORD, or if the root label differs
  *                         from the schema name
  */
override def apply(data: Elem): GenericData.Record = {
  implicit val path: Path = Path.empty

  // Guard clauses: reject an unsupported schema first, then a mismatching root label.
  if (schema.getType != RECORD)
    throw new ParserException(s"Unsupported root schema of type ${schema.getType}")
  if (schema.getName != data.label)
    throw new ParserException(s"Expected '${schema.getName}' root node, got instead: '${data.label}'")

  val root = readAny(data, None, schema, None)
  root.asInstanceOf[GenericData.Record]
}
/** Reads a value of any supported Avro type by dispatching on `schema.getType`.
  *
  * MAP schemas are rejected with a [[ParserException]]. Every value that is read successfully
  * is passed to `validate` together with its schema before it is returned.
  *
  * @param data         XML nodes holding the value
  * @param attributes   attribute nodes carrying the value, if any
  * @param schema       Avro schema of the value being read
  * @param defaultValue schema default, if any
  */
private def readAny(data: NodeSeq, attributes: Option[Seq[Node]], schema: Schema, defaultValue: Option[Any])(implicit path: Path): Any = {
  val parsed: Any = schema.getType match {
    // composite types
    case RECORD => readRecord(data, schema, defaultValue)
    case ARRAY  => readArray(data, schema, defaultValue)
    case UNION  => readUnion(data, attributes, schema, defaultValue)
    case MAP    => throw new ParserException("'MAP' type is not supported for XML format")
    // types with a dedicated reader
    case ENUM  => readEnum(data, attributes, schema, defaultValue)
    case BYTES => readBytes(data, attributes, schema, defaultValue)
    case FIXED => readFixed(data, attributes, schema, defaultValue)
    case NULL  => readNull(data, attributes, schema, defaultValue)
    // plain text-backed primitives all share the generic reader
    case STRING | INT | LONG | FLOAT | DOUBLE | BOOLEAN => read(data, attributes, schema, defaultValue)
  }

  validate(parsed, schema)
  parsed
}
/** Reads a RECORD from a single XML element.
  *
  * For every schema field the field name is passed through `renameRules`; the renamed name is
  * pushed onto `path` and used to select the child nodes. The path entry is popped again even
  * when reading the field fails. When `data` matches `NoNode` the schema default is used
  * instead; any other shape is rejected.
  *
  * @throws WrongTypeException if `data` is neither a single element nor absent
  */
private def readRecord(data: NodeSeq, schema: Schema, defaultValue: Option[Any])(implicit path: Path): GenericData.Record = {
  // Builds the record by reading each schema field from the children/attributes of `elem`.
  def populate(elem: Elem): GenericData.Record = {
    val record = new GenericData.Record(schema)
    for (field <- schema.getFields.asScala) {
      val xmlName = renameRules(field.name())
      path.push(FieldEntry(xmlName))
      try {
        val children = elem \ xmlName
        // NOTE(review): the attribute is looked up by the schema field name, while the child
        // nodes are looked up by the renamed name - confirm this asymmetry is intended.
        val attribute = elem.attributes.get(field.name()).map(_.toSeq)
        val value     = readAny(children, attribute, field.schema(), Option(field.defaultVal()))
        record.put(field.name(), value)
      } finally {
        path.pop()
        ()
      }
    }
    record
  }

  data match {
    case SingleNode(elem: Elem) => populate(elem)
    case NoNode(_)              => fallbackToDefault(defaultValue, schema).asInstanceOf[GenericData.Record]
    case _                      => throw new WrongTypeException(schema, data.toString())
  }
}
/** Reads an ENUM value.
  *
  * The raw value is obtained through `read` and accepted only if it is contained in the
  * schema's list of enum symbols.
  *
  * @throws WrongTypeException if the value read is not one of the schema's symbols
  */
private def readEnum(data: NodeSeq, attributes: Option[Seq[Node]], schema: Schema, defaultValue: Option[Any])(
    implicit path: Path
): GenericData.EnumSymbol = {
  val candidate = read(data, attributes, schema, defaultValue)
  val isKnown   = schema.getEnumSymbols.contains(candidate)

  if (!isKnown) throw new WrongTypeException(schema, data.toString())
  new GenericData.EnumSymbol(schema, candidate)
}
/** Reads an ARRAY from XML nodes.
  *
  * The expected element label is the element schema's name for RECORD elements, otherwise the
  * field of the current top path entry. Two layouts are accepted:
  *  - a single element with a different label, whose children are taken as the items;
  *  - a sequence of nodes that all carry the expected label, taken as the items themselves.
  * When `data` matches `NoNode` the schema default is used.
  *
  * @throws WrongTypeException if `data` fits none of the accepted layouts
  */
private def readArray(data: NodeSeq, schema: Schema, defaultValue: Option[Any])(implicit path: Path): GenericData.Array[Any] = {
  val itemSchema = schema.getElementType
  val expectedLabel =
    if (itemSchema.getType == RECORD) itemSchema.getName
    else path.peek.field

  data match {
    // wrapper element: the items are its children
    case SingleNode(wrapper: Elem) if wrapper.label != expectedLabel =>
      parseArray(wrapper.child, schema)
    case NoNode(_) =>
      fallbackToDefault(defaultValue, schema).asInstanceOf[GenericData.Array[Any]]
    // repeated elements: the nodes themselves are the items
    case repeated if repeated.forall(_.label == expectedLabel) =>
      parseArray(repeated, schema)
    case _ =>
      throw new WrongTypeException(schema, data.toString())
  }
}
/** Converts the given nodes into an Avro array.
  *
  * Only [[scala.xml.Elem]] nodes are kept; every other node is dropped. Each kept element is
  * read against the array's element schema, without attributes and without a default, while
  * its index is on top of `path`. The path entry is popped even if reading the item fails.
  *
  * @param nodes  candidate item nodes
  * @param schema the ARRAY schema (not the element schema)
  */
private def parseArray(nodes: Seq[Node], schema: Schema)(implicit path: Path): GenericData.Array[Any] = {
  val items      = nodes.collect { case item: Elem => item }
  val itemSchema = schema.getElementType
  val array      = new GenericData.Array[Any](items.size, schema)

  items.iterator.zipWithIndex.foreach {
    case (item, position) =>
      path.push(ArrayEntry(position))
      try array.add(position, readAny(item, None, itemSchema, None))
      finally {
        path.pop()
        ()
      }
  }

  array
}
/** Reads a UNION value by trying its branches in declaration order.
  *
  * The result of the first branch that can be read is returned; later branches are not tried.
  * Only the first branch (index 0) is given `defaultValue`, all others are read without a
  * default.
  *
  * Each branch is attempted at most once: failure messages are recorded while the branches are
  * tried, so building the error for an unreadable union does not re-run (and re-validate)
  * every branch.
  *
  * @throws MissingValueException if no branch matches and `data` is empty
  * @throws WrongTypeException    if no branch matches and `data` is non-empty; carries the
  *                               failure message of every branch
  */
private def readUnion(data: NodeSeq, attributes: Option[Seq[Node]], schema: Schema, defaultValue: Option[Any])(implicit path: Path): Any = {
  // Filled as a side effect of the lazy traversal below; complete once the iterator is
  // exhausted, i.e. exactly when no branch matched.
  val explanations = Vector.newBuilder[String]

  val firstMatch: Option[Any] = schema.getTypes.asScala.iterator.zipWithIndex
    .map {
      case (subSchema, idx) => Try(readAny(data, attributes, subSchema, defaultValue.filter(_ => idx == 0)))
    }
    .flatMap { attempt =>
      attempt.fold[Option[Any]](
        error => {
          explanations += error.getMessage
          None
        },
        // Some rather than Option(...): a branch may legitimately yield null.
        value => Some(value)
      )
    }
    .nextOption()

  firstMatch match {
    case Some(value)          => value
    case None if data.isEmpty => throw new MissingValueException(schema)
    case None                 => throw new WrongTypeException(schema, data.toString(), explanations.result())
  }
}
/** Reads a BYTES value.
  *
  * Pure delegation to `read`; the result is returned unchanged.
  */
private def readBytes(data: NodeSeq, attributes: Option[Seq[Node]], schema: Schema, defaultValue: Option[Any])(implicit path: Path): Any = {
  val raw = read(data, attributes, schema, defaultValue)
  raw
}
/** Reads a FIXED value.
  *
  * The value is obtained through `readBytes`, cast to a byte array and accepted only if its
  * length equals the schema's fixed size.
  *
  * @throws WrongTypeException if the byte count differs from `schema.getFixedSize`
  */
private def readFixed(data: NodeSeq, attributes: Option[Seq[Node]], schema: Schema, defaultValue: Option[Any])(
    implicit path: Path
): GenericData.Fixed = {
  val payload = readBytes(data, attributes, schema, defaultValue).asInstanceOf[Array[Byte]]

  if (payload.length != schema.getFixedSize)
    throw new WrongTypeException(schema, data.toString(), Seq(s"incorrect size: ${payload.length} instead of ${schema.getFixedSize}"))

  new GenericData.Fixed(schema, payload)
}
/** Reads a text-backed value and converts it with `parseString`.
  *
  * If `TextNode.toText(attributes)` yields text, that text is parsed and `data` is not
  * inspected. Otherwise the value is taken from `data`:
  *  - `NoNode`    -> schema default via `fallbackToDefault`;
  *  - `EmptyNode` -> the empty string is parsed;
  *  - `TextNode`  -> the extracted text is parsed.
  *
  * @throws WrongTypeException if `data` matches none of the shapes above
  */
private def read(data: NodeSeq, attributes: Option[Seq[Node]], schema: Schema, defaultValue: Option[Any])(implicit path: Path): Any = {
  // Evaluated only when no attribute text is available.
  def fromNodes: Any =
    data match {
      case NoNode(_)         => fallbackToDefault(defaultValue, schema)
      case EmptyNode(_)      => parseString("", schema)
      case TextNode(content) => parseString(content, schema)
      case _                 => throw new WrongTypeException(schema, data.toString())
    }

  val fromAttribute = TextNode.toText(attributes).map(content => parseString(content, schema))
  fromAttribute.getOrElse(fromNodes)
}
/** Reads a NULL value.
  *
  * A null is only accepted when no attribute is supplied: an absent node falls back to the
  * schema default, an empty node yields `null`. Everything else is rejected.
  *
  * @throws WrongTypeException if an attribute is present or `data` is neither absent nor empty
  */
private def readNull(data: NodeSeq, attributes: Option[Seq[Node]], schema: Schema, defaultValue: Option[Any])(implicit path: Path): Null =
  if (attributes.isDefined) throw new WrongTypeException(schema, data.toString())
  else
    data match {
      case NoNode(_)    => fallbackToDefault(defaultValue, schema).asInstanceOf[Null]
      case EmptyNode(_) => null
      case _            => throw new WrongTypeException(schema, data.toString())
    }
}