Skip to content
Permalink
Browse files
[NO ISSUE][MISC] Improve parser error reporting
Change-Id: I8707d9e2a952693b6501e9e63aff9162a77541cc
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/11543
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Michael Blow <mblow@apache.org>
Reviewed-by: Till Westmann <tillw@apache.org>
  • Loading branch information
mblow committed May 21, 2021
1 parent 528ee18 commit 3b6982ce7fa50ff4005d50847a944ecbaf3ecb30
Show file tree
Hide file tree
Showing 10 changed files with 56 additions and 40 deletions.
@@ -143,11 +143,11 @@
<compilation-unit name="common/malformed-json">
<placeholder name="adapter" value="AZUREBLOB" />
<output-dir compare="Text">common/malformed-json</output-dir>
<expected-error>Parsing error at malformed-data/duplicate-fields.json line 1 field field: Duplicate field 'field'</expected-error>
<expected-error>Parsing error at malformed-data/malformed-json.json line 1 field field: Unexpected character ('}' (code 125)): was expecting double-quote to start field name</expected-error>
<expected-error>Parsing error at malformed-data/malformed-json-2.json line 4 field array_f: Unexpected character (']' (code 93)): expected a valid value (JSON String, Number, Array, Object or token 'null', 'true' or 'false')</expected-error>
<expected-error>Parsing error at malformed-data/malformed-jsonl-1.json line 3 field field2: Unrecognized token 'truee': was expecting (JSON String, Number, Array, Object or token 'null', 'true' or 'false')</expected-error>
<expected-error>Parsing error at malformed-data/malformed-jsonl-2.json line 11 field array_f: Unexpected character (']' (code 93)): expected a valid value (JSON String, Number, Array, Object or token 'null', 'true' or 'false')</expected-error>
<expected-error>Parsing error at malformed-data/duplicate-fields.json line 1 field 'field': Duplicate field 'field'</expected-error>
<expected-error>Parsing error at malformed-data/malformed-json.json line 1 field 'field': Unexpected character ('}' (code 125)): was expecting double-quote to start field name</expected-error>
<expected-error>Parsing error at malformed-data/malformed-json-2.json line 4 field 'array_f': Unexpected character (']' (code 93)): expected a valid value (JSON String, Number, Array, Object or token 'null', 'true' or 'false')</expected-error>
<expected-error>Parsing error at malformed-data/malformed-jsonl-1.json line 3 field 'field2': Unrecognized token 'truee': was expecting (JSON String, Number, Array, Object or token 'null', 'true' or 'false')</expected-error>
<expected-error>Parsing error at malformed-data/malformed-jsonl-2.json line 11 field 'array_f': Unexpected character (']' (code 93)): expected a valid value (JSON String, Number, Array, Object or token 'null', 'true' or 'false')</expected-error>
</compilation-unit>
</test-case>
<test-case FilePath="external-dataset">
@@ -92,11 +92,11 @@
<compilation-unit name="common/malformed-json">
<placeholder name="adapter" value="S3" />
<output-dir compare="Text">common/malformed-json</output-dir>
<expected-error>Parsing error at malformed-data/duplicate-fields.json line 1 field field: Duplicate field 'field'</expected-error>
<expected-error>Parsing error at malformed-data/malformed-json.json line 1 field field: Unexpected character ('}' (code 125)): was expecting double-quote to start field name</expected-error>
<expected-error>Parsing error at malformed-data/malformed-json-2.json line 4 field array_f: Unexpected character (']' (code 93)): expected a valid value (JSON String, Number, Array, Object or token 'null', 'true' or 'false')</expected-error>
<expected-error>Parsing error at malformed-data/malformed-jsonl-1.json line 3 field field2: Unrecognized token 'truee': was expecting (JSON String, Number, Array, Object or token 'null', 'true' or 'false')</expected-error>
<expected-error>Parsing error at malformed-data/malformed-jsonl-2.json line 11 field array_f: Unexpected character (']' (code 93)): expected a valid value (JSON String, Number, Array, Object or token 'null', 'true' or 'false')</expected-error>
<expected-error>Parsing error at malformed-data/duplicate-fields.json line 1 field 'field': Duplicate field 'field'</expected-error>
<expected-error>Parsing error at malformed-data/malformed-json.json line 1 field 'field': Unexpected character ('}' (code 125)): was expecting double-quote to start field name</expected-error>
<expected-error>Parsing error at malformed-data/malformed-json-2.json line 4 field 'array_f': Unexpected character (']' (code 93)): expected a valid value (JSON String, Number, Array, Object or token 'null', 'true' or 'false')</expected-error>
<expected-error>Parsing error at malformed-data/malformed-jsonl-1.json line 3 field 'field2': Unrecognized token 'truee': was expecting (JSON String, Number, Array, Object or token 'null', 'true' or 'false')</expected-error>
<expected-error>Parsing error at malformed-data/malformed-jsonl-2.json line 11 field 'array_f': Unexpected character (']' (code 93)): expected a valid value (JSON String, Number, Array, Object or token 'null', 'true' or 'false')</expected-error>
</compilation-unit>
</test-case>
<test-case FilePath="external-dataset">
@@ -49,17 +49,17 @@
<output-dir compare="Text">common/csv-warnings</output-dir>
<expected-warn>Parsing error at data_dir/no_h_missing_fields.csv line 2 field 3: some fields are missing</expected-warn>
<expected-warn>Parsing error at data_dir/no_h_no_closing_q.csv line 2 field 0: malformed input record ended abruptly</expected-warn>
<expected-warn>Parsing error at line 2 field 0: malformed input record ended abruptly</expected-warn>
<expected-warn>Parsing error at line 2 field 0: malformed input record ended abruptly</expected-warn>

<expected-warn>Parsing error at line 5 field 3: invalid value</expected-warn>
<expected-warn>Parsing error at line 2 field 1: invalid value</expected-warn>
<expected-warn>Parsing error at line 11 field 1: invalid value</expected-warn>
<expected-warn>Parsing error at line 3 field 1: invalid value</expected-warn>
<expected-warn>Parsing error at line 4 field 1: invalid value</expected-warn>
<expected-warn>Parsing error at line 7 field 7: invalid value</expected-warn>
<expected-warn>Parsing error at line 13 field 7: invalid value</expected-warn>
<expected-warn>Parsing error at line 12 field 3: invalid value</expected-warn>
<expected-warn>Parsing error at line 9 field 6: a quote should be in the beginning</expected-warn>
<expected-warn>Parsing error at line 5 field 3: invalid value</expected-warn>
<expected-warn>Parsing error at line 2 field 1: invalid value</expected-warn>
<expected-warn>Parsing error at line 11 field 1: invalid value</expected-warn>
<expected-warn>Parsing error at line 3 field 1: invalid value</expected-warn>
<expected-warn>Parsing error at line 4 field 1: invalid value</expected-warn>
<expected-warn>Parsing error at line 7 field 7: invalid value</expected-warn>
<expected-warn>Parsing error at line 13 field 7: invalid value</expected-warn>
<expected-warn>Parsing error at line 12 field 3: invalid value</expected-warn>
<expected-warn>Parsing error at line 9 field 6: a quote should be in the beginning</expected-warn>

<expected-warn>Parsing error at data_dir/h_invalid_values.csv line 5 field 3: invalid value</expected-warn>
<expected-warn>Parsing error at data_dir/h_invalid_values.csv line 2 field 1: invalid value</expected-warn>
@@ -29,10 +29,10 @@
import org.apache.asterix.external.api.AsterixInputStream;
import org.apache.asterix.external.util.ExternalDataConstants;
import org.apache.asterix.external.util.ExternalDataUtils;
import org.apache.asterix.external.util.ParseUtil;
import org.apache.hyracks.api.context.IHyracksTaskContext;
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.api.exceptions.IWarningCollector;
import org.apache.hyracks.util.ParseUtil;

public class QuotedLineRecordReader extends LineRecordReader {

@@ -40,10 +40,10 @@
import org.apache.asterix.external.api.AsterixInputStream;
import org.apache.asterix.external.util.ExternalDataConstants;
import org.apache.asterix.external.util.ExternalDataUtils;
import org.apache.asterix.external.util.ParseUtil;
import org.apache.hyracks.api.context.IHyracksTaskContext;
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.api.exceptions.IWarningCollector;
import org.apache.hyracks.util.ParseUtil;

public class SemiStructuredRecordReader extends StreamRecordReader {

@@ -18,6 +18,8 @@
*/
package org.apache.asterix.external.parser;

import static org.apache.hyracks.api.exceptions.ErrorCode.PARSING_ERROR;

import java.io.DataOutput;
import java.io.IOException;
import java.util.BitSet;
@@ -46,6 +48,7 @@
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.api.util.ExceptionUtils;
import org.apache.hyracks.data.std.api.IMutableValueStorage;
import org.apache.hyracks.util.ParseUtil;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParseException;
@@ -433,18 +436,13 @@ protected HyracksDataException createException(Exception e) {
}
long lineNum = lineNumber.getAsLong() + jsonParser.getCurrentLocation().getLineNr() - 1;
JsonStreamContext parsingContext = jsonParser.getParsingContext();
String fieldName = "N/A";
while (parsingContext != null) {
String currentFieldName = parsingContext.getCurrentName();
if (currentFieldName != null) {
fieldName = currentFieldName;
break;
}
String fieldName = null;
while (parsingContext != null && fieldName == null) {
fieldName = parsingContext.getCurrentName();
parsingContext = parsingContext.getParent();
}

return HyracksDataException.create(org.apache.hyracks.api.exceptions.ErrorCode.PARSING_ERROR,
dataSourceName.get(), lineNum, fieldName, msg);
final String locationDetails = ParseUtil.asLocationDetailString(dataSourceName.get(), lineNum, fieldName);
return HyracksDataException.create(PARSING_ERROR, locationDetails, msg);
}
return new RuntimeDataException(ErrorCode.RECORD_READER_MALFORMED_INPUT_STREAM, e);
}
@@ -38,7 +38,6 @@
import org.apache.asterix.external.api.IRecordDataParser;
import org.apache.asterix.external.api.IStreamDataParser;
import org.apache.asterix.external.util.ExternalDataConstants;
import org.apache.asterix.external.util.ParseUtil;
import org.apache.asterix.om.base.AMutableString;
import org.apache.asterix.om.typecomputer.impl.TypeComputeUtils;
import org.apache.asterix.om.types.ARecordType;
@@ -52,6 +51,7 @@
import org.apache.hyracks.dataflow.common.data.parsers.IValueParser;
import org.apache.hyracks.dataflow.common.data.parsers.IValueParserFactory;
import org.apache.hyracks.dataflow.std.file.FieldCursorForDelimitedDataParser;
import org.apache.hyracks.util.ParseUtil;

public class DelimitedDataParser extends AbstractDataParser implements IStreamDataParser, IRecordDataParser<char[]> {

@@ -16,7 +16,9 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.asterix.external.util;
package org.apache.hyracks.util;

import java.util.StringJoiner;

import org.apache.hyracks.api.exceptions.ErrorCode;
import org.apache.hyracks.api.exceptions.IWarningCollector;
@@ -29,7 +31,24 @@ private ParseUtil() {

public static void warn(IWarningCollector warningCollector, String dataSourceName, long lineNum, int fieldNum,
String warnMessage) {
warningCollector
.warn(Warning.of(null, ErrorCode.PARSING_ERROR, dataSourceName, lineNum, fieldNum, warnMessage));
warningCollector.warn(Warning.of(null, ErrorCode.PARSING_ERROR,
asLocationDetailString(dataSourceName, lineNum, fieldNum), warnMessage));
}

/**
 * Builds the location fragment of a parsing error message, e.g.
 * {@code at data.json line 3 field 'name'}.
 * <p>
 * Each component is optional and is appended only when it carries information;
 * components are separated by a single space. When no component qualifies, the
 * result is {@code at N/A}.
 *
 * @param dataSource      name of the data source; skipped when {@code null} or empty
 * @param lineNum         line number; skipped when negative
 * @param fieldIdentifier a {@link Number} (rendered bare) or a non-empty {@link String}
 *                        (rendered in single quotes); any other value is skipped
 * @return the location details, always starting with {@code "at "}
 */
public static String asLocationDetailString(String dataSource, long lineNum, Object fieldIdentifier) {
    // The "at " prefix is emitted by the joiner itself; the empty value bypasses
    // the prefix, so it has to spell out the complete fallback text.
    final StringJoiner location = new StringJoiner(" ", "at ", "");
    location.setEmptyValue("at N/A");

    final boolean hasDataSource = dataSource != null && !dataSource.isEmpty();
    if (hasDataSource) {
        location.add(dataSource);
    }

    if (lineNum >= 0) {
        location.add("line " + lineNum);
    }

    if (fieldIdentifier instanceof Number) {
        // numeric identifiers (field positions) are printed without quotes
        location.add("field " + fieldIdentifier);
    } else if (fieldIdentifier instanceof String) {
        final String fieldName = (String) fieldIdentifier;
        if (!fieldName.isEmpty()) {
            // named fields are quoted
            location.add("field '" + fieldName + "'");
        }
    }

    return location.toString();
}
}
@@ -141,7 +141,7 @@
121 = A numeric type promotion error has occurred: %1$s
122 = Encountered an error while printing the plan: %1$s
123 = Insufficient memory is provided for the join operators, please increase the join memory budget.
124 = Parsing error at %1$s line %2$s field %3$s: %4$s
124 = Parsing error %s: %s
125 = Invalid inverted list type traits: %1$s
126 = Illegal state. %1$s

@@ -23,9 +23,8 @@
import java.util.Arrays;
import java.util.function.Supplier;

import org.apache.hyracks.api.exceptions.ErrorCode;
import org.apache.hyracks.api.exceptions.IWarningCollector;
import org.apache.hyracks.api.exceptions.Warning;
import org.apache.hyracks.util.ParseUtil;

public class FieldCursorForDelimitedDataParser {

@@ -448,6 +447,6 @@ public void eliminateDoubleQuote() {
}

private void warn(String message) {
warnings.warn(Warning.of(null, ErrorCode.PARSING_ERROR, dataSourceName.get(), lineCount, fieldCount, message));
ParseUtil.warn(warnings, dataSourceName.get(), lineCount, fieldCount, message);
}
}

0 comments on commit 3b6982c

Please sign in to comment.