Skip to content

Commit

Permalink
DRILL-7631: Updates to the Json Structure Parser
Browse files Browse the repository at this point in the history
Revised how "look-ahead" works. Added support for unknown
types.
  • Loading branch information
paul-rogers committed Mar 9, 2020
1 parent 485b19c commit b2566ef
Show file tree
Hide file tree
Showing 21 changed files with 1,034 additions and 561 deletions.
Expand Up @@ -18,6 +18,9 @@
package org.apache.drill.exec.store.easy.json.parser;


/**
* Abstract base class for all JSON element parsers.
*/
public abstract class AbstractElementParser implements ElementParser {
final JsonStructureParser structParser;
private final ElementParser parent;
Expand Down
Expand Up @@ -18,8 +18,8 @@
package org.apache.drill.exec.store.easy.json.parser;

/**
* Represents one level within an array. The first time the parser sees
* the array element, it will call one of the "Element" methods with the
* Represents one level within an array. The first time the parser sees the array element,
* it will call the {@link #element(ValueDef)} method with the
* look-ahead values visible to the parser. Since JSON is flexible, later
* data shapes may not necessarily follow the first shape. The implementation
* must handle this or throw an error if not supported.
Expand All @@ -44,15 +44,16 @@
* Three JSON-specific cases warrant attention:
* <ol>
* <li>The first occurrence of the array is empty: {@code [ ]}. In this case,
* the structure parser will ask for an element child by providing the
* {@link JsonType#EMPTY} type, which is not very useful, but is all that
* the parser knows. The listener is responsible for implementing some kind of
* "deferred type" logic to wait and see what kind of element appears
* later..</li>
* the structure parser will defer asking for an element parser (and listener)
* until an actual value appears. The array listener is responsible for
* implementing some kind of "deferred type" logic to wait and see what
* kind of element appears later.</li>
* <li>The first occurrence of the array has, as its first element, a
* {@code null} value. The structure parser will ask this listener to create
* an array child for the {@code null} value, but the listener has no type
* information. Again, the listener is responsible for type-deferal.</li>
* information. Since null values must be recorded (so we know how many
* appear in each array), the listener is forced to choose a type. Choose
* wisely as there is no way to know what type will appear in the future.</li>
* <li>A generalized form of the above is that the structure parser only
* knows what it sees on the first element when it asks for an element
* child. In a well-formed file, that first token will predict the type
Expand Down Expand Up @@ -81,8 +82,19 @@
public interface ArrayListener {

/**
* Called at the start of a set of values for an array. That is, called
* when the structure parser accepts the {@code [} token.
* Provide an element listener for the first non-empty value
* seen for the array.
*
* @param valueDef description of the element (without the array
* dimensions)
* @return a listener to consume values of the array element
*/
ValueListener element(ValueDef valueDef);

/**
* Called at the entrance to each level (dimension) of an array.
* That is, called when the structure parser accepts the {@code [}
* token.
*/
void onStart();

Expand All @@ -91,61 +103,16 @@ public interface ArrayListener {
* by its own listener which receives the value of the element (if
* scalar) or element events (if structured.)
*/
void onElement();

/**
* Called at the end of a set of values for an array. That is, called
* when the structure parser accepts the {@code ]} token.
*/
void onEnd();
void onElementStart();

/**
* The first element seen is a scalar, {@code null} or empty. That is,
* {@code [ <scalar>}, {@code [ null} or {@code [ ]}.
*
* @param type the JSON type of the object as given by the token
* which the Jackson parser returned for the value. The type can
* be {@code null}, which means that the parser does not know what
* actual type might occur later
* @return a value listener for the scalar type, or if {@code null},
* perhaps waiting for more information to commit to a type
* Called after each element of the array.
*/
ValueListener scalarElement(JsonType type);
void onElementEnd();

/**
* The first element an array or scalars (or {@code null}.That is,
* {@code [ [+ <scalar>}.
*
* @param arrayDims the number of dimensions observed during the
* first-element parse, not including the surrounding array
* itself. As in all cases, there is no guarantee that
* that this number will remain valid later, and may be wrong if the
* first-seen element was empty: {@code []}.
* @return a listener for the value of the top-level element (which
* the listener can assume will turn out to be an array.)
*/
ValueListener arrayElement(int arrayDims, JsonType type);

/**
* The first element seen for an array is an object. That is,
* <code>[ {</code>.
*
* @return a listener for the value of the top-level element (which
* the listener can assume will turn out to be an object.)
*/
ValueListener objectElement();

/**
* The first element seen is an object array.That is,
* <code>[ [* {</code>.
*
* @param arrayDims the number of dimensions observed during the
* first-element parse, not including the surrounding array
* itself. As in all cases, there is no guarantee that
* that this number will remain valid later, and may be wrong if the
* first-seen element was empty: {@code []}.
* @return a listener for the value of the top-level element (which
* the listener can assume will turn out to be an array.)
* Called at the end of a set of values for an array. That is, called
* when the structure parser accepts the {@code ]} token.
*/
ValueListener objectArrayElement(int arrayDims);
void onEnd();
}
Expand Up @@ -17,13 +17,19 @@
*/
package org.apache.drill.exec.store.easy.json.parser;

import org.apache.drill.exec.store.easy.json.parser.ObjectListener.FieldType;

import com.fasterxml.jackson.core.JsonToken;

/**
* Parses a JSON array, which consists of a list of <i>elements</i>,
* represented by a {@code ValueListener}. There is a single listener
* for all the elements, which are presumed to be of the same type.
* <p>
* The element is created when first encountered, either as part of field
* creation (<code>{a: [10]}</code>) or when later encountered in parsing
* (<code>{a: []} {a: [10]}</code>).
* <p>
* This parser <i>does not</i> attempt to parse an array as a poor-man's
* tuple: {@code [ 101, "fred", 23.45 ]}. The listener could handle this
* case. But, if we need to handle such a case, it would be better to
Expand All @@ -32,13 +38,12 @@
*/
public class ArrayParser extends AbstractElementParser {

private final ArrayListener arrayListener;
private final ValueParser elementParser;
private ValueParser elementParser;
private ArrayListener arrayListener;

public ArrayParser(ValueParser parent, ArrayListener arrayListener, ValueListener elementListener) {
public ArrayParser(ValueParser parent, ArrayListener arrayListener) {
super(parent);
this.arrayListener = arrayListener;
this.elementParser = new ValueParser(this, "[]", elementListener);
}

/**
 * Returns the parser currently bound for the elements of this array.
 *
 * @return the value of the {@code elementParser} field
 */
public ValueParser elementParser() {
  return this.elementParser;
}
Expand All @@ -51,18 +56,58 @@ public void parse(TokenIterator tokenizer) {
arrayListener.onStart();
top: for (;;) {
// Position: [ (value, )* ^ ?
JsonToken token = tokenizer.requireNext();
JsonToken token = tokenizer.requireNext();
switch (token) {
case END_ARRAY:
break top;

default:
tokenizer.unget(token);
arrayListener.onElement();
elementParser.parse(tokenizer);
break;
parseElement(tokenizer);
}
}
arrayListener.onEnd();
}

/**
 * Parses a single element of the array.
 * <p>
 * If no element parser has been bound yet, one is first detected from
 * the token stream. The element parse itself is bracketed by the
 * listener's {@code onElementStart()} and {@code onElementEnd()} events.
 *
 * @param tokenizer the token source positioned at the element
 */
private void parseElement(TokenIterator tokenizer) {
  final boolean needsElementParser = (elementParser == null);
  if (needsElementParser) {
    // No element parser yet: build one before any element event fires.
    detectElement(tokenizer);
  }

  arrayListener.onElementStart();
  elementParser.parse(tokenizer);
  arrayListener.onElementEnd();
}

/**
 * Obtains a value description from
 * {@code ValueDefFactory.lookAhead()} and registers an element
 * for it via {@link #addElement(ValueDef)}.
 *
 * @param tokenizer the token source handed to the look-ahead
 */
private void detectElement(TokenIterator tokenizer) {
  final ValueDef lookAheadDef = ValueDefFactory.lookAhead(tokenizer);
  addElement(lookAheadDef);
}

/**
 * Asks the array listener for an element listener that matches the
 * given description, then binds that listener to a new element parser.
 *
 * @param valueDef description of the element passed to
 * {@code ArrayListener.element()}
 */
public void addElement(ValueDef valueDef) {
  final ValueListener listenerForElement = arrayListener.element(valueDef);
  bindElement(listenerForElement);
}

/**
 * Creates the element parser for this array and attaches the given
 * listener to it. The parser is created with the key {@code "[]"} and
 * {@code FieldType.TYPED}.
 *
 * @param elementListener the listener to bind to the new element parser
 */
public void bindElement(ValueListener elementListener) {
  final ValueParser createdParser = new ValueParser(this, "[]", FieldType.TYPED);
  // Publish the parser on the field first, then bind the listener to it.
  this.elementParser = createdParser;
  createdParser.bindListener(elementListener);
}

/**
 * Replaces the array listener for this parser.
 * <p>
 * When an element parser already exists, it is re-bound to an element
 * listener obtained from the new array listener using
 * {@code ValueDef.UNKNOWN}.
 *
 * @param newListener the array listener to use from now on
 */
public void bindListener(ArrayListener newListener) {
  this.arrayListener = newListener;
  if (elementParser == null) {
    // Nothing to re-bind until an element parser has been created.
    return;
  }
  final ValueListener reboundListener = arrayListener.element(ValueDef.UNKNOWN);
  elementParser.bindListener(reboundListener);
}

/**
 * Expands the structure of this array from a description of the
 * look-ahead value.
 * <p>
 * Nothing is done for an array of at most one dimension whose type is
 * unknown. Otherwise an element is added that is described by the same
 * type with one fewer dimension, and the new element parser is asked
 * to expand its own structure from that reduced description.
 *
 * @param valueDef description of the look-ahead value for this array
 */
public void expandStructure(ValueDef valueDef) {
  // Guard: skip when there is neither an inner dimension nor a known type.
  if (valueDef.dimensions() <= 1 && valueDef.type().isUnknown()) {
    return;
  }

  final ValueDef childDef =
      new ValueDef(valueDef.type(), valueDef.dimensions() - 1);
  addElement(childDef);
  elementParser.expandStructure(childDef);
}
}
Expand Up @@ -55,14 +55,12 @@ public void parse(TokenIterator tokenizer) {
public void parseTail(TokenIterator tokenizer) {

// Parse (field: value)* }

for (;;) {
JsonToken token = tokenizer.requireNext();
switch (token) {

// Not exactly precise, but the JSON parser handles the
// details.

case END_OBJECT:
case END_ARRAY:
return;
Expand Down
Expand Up @@ -204,6 +204,18 @@ private boolean recover() {

/**
 * Returns the current value of the error-recovery counter.
 *
 * @return the value of {@code errorRecoveryCount}
 */
public int recoverableErrorCount() {
  return this.errorRecoveryCount;
}

/**
 * Returns the line number as reported by the tokenizer.
 *
 * @return the result of {@code tokenizer.lineNumber()}
 */
public int lineNumber() {
  final int line = tokenizer.lineNumber();
  return line;
}

/**
 * Returns the column number as reported by the tokenizer.
 *
 * @return the result of {@code tokenizer.columnNumber()}
 */
public int columnNumber() {
  final int column = tokenizer.columnNumber();
  return column;
}

/**
 * Returns the token text as reported by the tokenizer.
 *
 * @return the result of {@code tokenizer.token()}
 */
public String token() {
  final String tokenText = tokenizer.token();
  return tokenText;
}

public void close() {
if (errorRecoveryCount > 0) {
logger.warn("Read JSON input with {} recoverable error(s).",
Expand Down

This file was deleted.

0 comments on commit b2566ef

Please sign in to comment.