Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DRILL-4653.json - Malformed JSON should not stop the entire query from progressing #518

Closed
wants to merge 9 commits into from
Expand Up @@ -135,6 +135,9 @@ public interface ExecConstants {
BooleanValidator JSON_EXTENDED_TYPES = new BooleanValidator("store.json.extended_types", false);
BooleanValidator JSON_WRITER_UGLIFY = new BooleanValidator("store.json.writer.uglify", false);
BooleanValidator JSON_WRITER_SKIPNULLFIELDS = new BooleanValidator("store.json.writer.skip_null_fields", true);
String JSON_READER_SKIP_INVALID_RECORDS_FLAG = "store.json.reader.skip_invalid_records";
BooleanValidator JSON_SKIP_MALFORMED_RECORDS_VALIDATOR = new BooleanValidator(JSON_READER_SKIP_INVALID_RECORDS_FLAG, false);


DoubleValidator TEXT_ESTIMATED_ROW_SIZE = new RangeDoubleValidator(
"store.text.estimated_row_size_bytes", 1, Long.MAX_VALUE, 100.0);
Expand Down
Expand Up @@ -101,6 +101,7 @@ public class SystemOptionManager extends BaseOptionManager implements AutoClosea
ExecConstants.JSON_WRITER_UGLIFY,
ExecConstants.JSON_WRITER_SKIPNULLFIELDS,
ExecConstants.JSON_READ_NUMBERS_AS_DOUBLE_VALIDATOR,
ExecConstants.JSON_SKIP_MALFORMED_RECORDS_VALIDATOR,
ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL_VALIDATOR,
ExecConstants.MONGO_READER_ALL_TEXT_MODE_VALIDATOR,
ExecConstants.MONGO_READER_READ_NUMBERS_AS_DOUBLE_VALIDATOR,
Expand Down
Expand Up @@ -20,16 +20,15 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import com.google.common.collect.Lists;

import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.exception.OutOfMemoryException;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.ops.OperatorContext;
import org.apache.drill.exec.physical.base.GroupScan;
import org.apache.drill.exec.physical.impl.OutputMutator;
import org.apache.drill.exec.store.AbstractRecordReader;
import org.apache.drill.exec.store.dfs.DrillFileSystem;
Expand Down Expand Up @@ -64,6 +63,8 @@ public class JSONRecordReader extends AbstractRecordReader {
private final boolean enableAllTextMode;
private final boolean readNumbersAsDouble;
private final boolean unionEnabled;
private long parseErrorCount;
private final boolean skipMalformedJSONRecords;

/**
* Create a JSON Record Reader that uses a file based input stream.
Expand Down Expand Up @@ -109,11 +110,11 @@ private JSONRecordReader(final FragmentContext fragmentContext, final String inp

this.fileSystem = fileSystem;
this.fragmentContext = fragmentContext;

// only enable all text mode if we aren't using embedded content mode.
this.enableAllTextMode = embeddedContent == null && fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_ALL_TEXT_MODE_VALIDATOR);
this.readNumbersAsDouble = embeddedContent == null && fragmentContext.getOptions().getOption(ExecConstants.JSON_READ_NUMBERS_AS_DOUBLE_VALIDATOR);
this.unionEnabled = embeddedContent == null && fragmentContext.getOptions().getOption(ExecConstants.ENABLE_UNION_TYPE);
this.skipMalformedJSONRecords = fragmentContext.getOptions().getOption(ExecConstants.JSON_SKIP_MALFORMED_RECORDS_VALIDATOR);
setColumns(columns);
}

Expand All @@ -122,7 +123,8 @@ public String toString() {
return super.toString()
+ "[hadoopPath = " + hadoopPath
+ ", recordCount = " + recordCount
+ ", runningRecordCount = " + runningRecordCount + ", ...]";
+ ", parseErrorCount = " + parseErrorCount
+ ", runningRecordCount = " + runningRecordCount + ", ...]";
}

@Override
Expand Down Expand Up @@ -189,39 +191,33 @@ private long currentRecordNumberInFile() {
public int next() {
writer.allocate();
writer.reset();

recordCount = 0;
ReadState write = null;
// Stopwatch p = new Stopwatch().start();
try{
outside: while(recordCount < DEFAULT_ROWS_PER_BATCH) {
writer.setPosition(recordCount);
write = jsonReader.write(writer);

if(write == ReadState.WRITE_SUCCEED) {
// logger.debug("Wrote record.");
recordCount++;
}else{
// logger.debug("Exiting.");
break outside;
}

outside: while(recordCount < DEFAULT_ROWS_PER_BATCH){
try
{
writer.setPosition(recordCount);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems this is still doing indent of 4. We use 2 spaces (see https://drill.apache.org/docs/apache-drill-contribution-guidelines/ scroll down to Step 2). Did it pass the mvn command line build without checkstyle violations ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aman,
maven checkstyle:checkstyle did not report any errors before I did my last
check in. I have changed to reflect 2 spaces for indendation.

On Thu, Jun 16, 2016 at 2:22 PM, Aman Sinha notifications@github.com
wrote:

In
exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/JSONRecordReader.java
#518 (comment):

  •  outside: while(recordCount < DEFAULT_ROWS_PER_BATCH) {
    
  •    writer.setPosition(recordCount);
    

- write = jsonReader.write(writer);

  •    if(write == ReadState.WRITE_SUCCEED) {
    
    -// logger.debug("Wrote record.");
  •      recordCount++;
    
  •    }else{
    
    -// logger.debug("Exiting.");
  •      break outside;
    

- }

  • outside: while(recordCount < DEFAULT_ROWS_PER_BATCH){
  • try
  •  {
    
  •        writer.setPosition(recordCount);
    

seems this is still doing indent of 4. We use 2 spaces (see
https://drill.apache.org/docs/apache-drill-contribution-guidelines/
scroll down to Step 2). Did it pass the mvn command line build without
checkstyle violations ?


You are receiving this because you commented.
Reply to this email directly, view it on GitHub
https://github.com/apache/drill/pull/518/files/56a16fe3f2bbd1554d65cef6faeaeb63ba41f9a2#r67426381,
or mute the thread
https://github.com/notifications/unsubscribe/ABsaHw_13TKQS-3mIEDhxkCiglpCBA3Sks5qMb6IgaJpZM4IzZkt
.

write = jsonReader.write(writer);
if(write == ReadState.WRITE_SUCCEED)
{
recordCount++;
}else
{
break outside;
}
}
catch(Exception ex)
{
++parseErrorCount;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the indentations seem to be off here as well as other places.. can you make sure the indentations match the rest of the code ?

logger.error("Error parsing JSON in " + hadoopPath.getName() + " : line nos :" + (recordCount+parseErrorCount));
if(skipMalformedJSONRecords == false){
handleAndRaise("Error parsing JSON", ex);}
}

jsonReader.ensureAtLeastOneField(writer);

writer.setValueCount(recordCount);
// p.stop();
// System.out.println(String.format("Wrote %d records in %dms.", recordCount, p.elapsed(TimeUnit.MILLISECONDS)));

updateRunningCount();
return recordCount;

} catch (final Exception e) {
handleAndRaise("Error parsing JSON", e);
}
// this is never reached
return 0;
jsonReader.ensureAtLeastOneField(writer);
writer.setValueCount(recordCount);
updateRunningCount();
return recordCount;
}

private void updateRunningCount() {
Expand Down
Expand Up @@ -20,6 +20,7 @@
import org.apache.drill.BaseTestQuery;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.exec.proto.UserBitShared;
import org.apache.drill.exec.ExecConstants;
import org.junit.Test;
import org.junit.Assert;

Expand Down Expand Up @@ -179,4 +180,43 @@ public void testNestedFilter() throws Exception {
.sqlBaselineQuery(baselineQuery)
.go();
}

@Test // See DRILL-4653
public void testSkippingInvalidJSONRecords() throws Exception {
try
{
String set = "alter session set `" + ExecConstants.JSON_READER_SKIP_INVALID_RECORDS_FLAG+ "` = true";

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these should be indented inside the try block with 2 spaces. It is best to set the indent level in your IDE (I can help with Eclipse if you are using it; if you are using IntelliJ I can find out from other developers using IntelliJ).

String query = "select count(*) from cp.`jsoninput/DRILL-4653.json`";
testNoResult(set);
testBuilder()
.unOrdered()
.sqlQuery(query)
.sqlBaselineQuery(query)
.build().run();
}
finally
{
String set = "alter session set `" + ExecConstants.JSON_READER_SKIP_INVALID_RECORDS_FLAG+ "` = false";
testNoResult(set);
}
}

@Test // See DRILL-4653
public void testNotSkippingInvalidJSONRecords() throws Exception {
try
{
String query = "select count(*) from cp.`jsoninput/DRILL-4653.json`";
testBuilder()
.unOrdered()
.sqlQuery(query)
.sqlBaselineQuery(query)
.build().run();
}
catch(Exception ex)
{
// do nothing just return
return;
}
throw new Exception("testNotSkippingInvalidJSONRecords");
}
}
40 changes: 40 additions & 0 deletions exec/java-exec/src/test/resources/jsoninput/DRILL-4653.json
@@ -0,0 +1,40 @@
{ "integer : 2010,
"float" : 17.4,
"x": {
"y": "kevin",
"z": "paul"
},
"z": [
{"orange" : "yellow" , "pink": "red"},
{"pink" : "purple" }
],
"l": [4,2],
"rl": [ [2,1], [4,6] ]
}
{ "integer : -2002,
"float" : -1.2
}
{ "integer" : 2001,
"float" : 1.2,
"x": {
"y": "bill",
"z": "peter"
},
"z": [
{"pink" : "lilac" }
],
"l": [4,2],
"rl": [ [2,1], [4,6] ]
}
{ "integer" : 6005,
"float" : 1.2,
"x": {
"y": "mike",
"z": "mary"
},
"z": [
{"orange" : "stucco" }
],
"l": [4,2],
"rl": [ [2,1], [4,6] ]
}