New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DRILL-4653.json - Malformed JSON should not stop the entire query from progressing #518
Changes from 3 commits
4fc70fa
8d5e059
56a16fe
e5a9a5b
5d5bca4
0ed75ef
6ed7330
45de6f7
8b492eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,16 +20,15 @@ | |
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.util.List; | ||
|
||
import com.google.common.collect.Lists; | ||
|
||
import org.apache.drill.common.exceptions.ExecutionSetupException; | ||
import org.apache.drill.common.exceptions.UserException; | ||
import org.apache.drill.common.expression.SchemaPath; | ||
import org.apache.drill.exec.ExecConstants; | ||
import org.apache.drill.exec.exception.OutOfMemoryException; | ||
import org.apache.drill.exec.ops.FragmentContext; | ||
import org.apache.drill.exec.ops.OperatorContext; | ||
import org.apache.drill.exec.physical.base.GroupScan; | ||
import org.apache.drill.exec.physical.impl.OutputMutator; | ||
import org.apache.drill.exec.store.AbstractRecordReader; | ||
import org.apache.drill.exec.store.dfs.DrillFileSystem; | ||
|
@@ -64,6 +63,8 @@ public class JSONRecordReader extends AbstractRecordReader { | |
private final boolean enableAllTextMode; | ||
private final boolean readNumbersAsDouble; | ||
private final boolean unionEnabled; | ||
private long parseErrorCount; | ||
private final boolean skipMalformedJSONRecords; | ||
|
||
/** | ||
* Create a JSON Record Reader that uses a file based input stream. | ||
|
@@ -109,11 +110,11 @@ private JSONRecordReader(final FragmentContext fragmentContext, final String inp | |
|
||
this.fileSystem = fileSystem; | ||
this.fragmentContext = fragmentContext; | ||
|
||
// only enable all text mode if we aren't using embedded content mode. | ||
this.enableAllTextMode = embeddedContent == null && fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_ALL_TEXT_MODE_VALIDATOR); | ||
this.readNumbersAsDouble = embeddedContent == null && fragmentContext.getOptions().getOption(ExecConstants.JSON_READ_NUMBERS_AS_DOUBLE_VALIDATOR); | ||
this.unionEnabled = embeddedContent == null && fragmentContext.getOptions().getOption(ExecConstants.ENABLE_UNION_TYPE); | ||
this.skipMalformedJSONRecords = fragmentContext.getOptions().getOption(ExecConstants.JSON_SKIP_MALFORMED_RECORDS_VALIDATOR); | ||
setColumns(columns); | ||
} | ||
|
||
|
@@ -122,7 +123,8 @@ public String toString() { | |
return super.toString() | ||
+ "[hadoopPath = " + hadoopPath | ||
+ ", recordCount = " + recordCount | ||
+ ", runningRecordCount = " + runningRecordCount + ", ...]"; | ||
+ ", parseErrorCount = " + parseErrorCount | ||
+ ", runningRecordCount = " + runningRecordCount + ", ...]"; | ||
} | ||
|
||
@Override | ||
|
@@ -189,39 +191,33 @@ private long currentRecordNumberInFile() { | |
public int next() { | ||
writer.allocate(); | ||
writer.reset(); | ||
|
||
recordCount = 0; | ||
ReadState write = null; | ||
// Stopwatch p = new Stopwatch().start(); | ||
try{ | ||
outside: while(recordCount < DEFAULT_ROWS_PER_BATCH) { | ||
writer.setPosition(recordCount); | ||
write = jsonReader.write(writer); | ||
|
||
if(write == ReadState.WRITE_SUCCEED) { | ||
// logger.debug("Wrote record."); | ||
recordCount++; | ||
}else{ | ||
// logger.debug("Exiting."); | ||
break outside; | ||
} | ||
|
||
outside: while(recordCount < DEFAULT_ROWS_PER_BATCH){ | ||
try | ||
{ | ||
writer.setPosition(recordCount); | ||
write = jsonReader.write(writer); | ||
if(write == ReadState.WRITE_SUCCEED) | ||
{ | ||
recordCount++; | ||
}else | ||
{ | ||
break outside; | ||
} | ||
} | ||
catch(Exception ex) | ||
{ | ||
++parseErrorCount; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the indentations seem to be off here as well as other places.. can you make sure the indentations match the rest of the code ? |
||
logger.error("Error parsing JSON in " + hadoopPath.getName() + " : line nos :" + (recordCount+parseErrorCount)); | ||
if(skipMalformedJSONRecords == false){ | ||
handleAndRaise("Error parsing JSON", ex);} | ||
} | ||
|
||
jsonReader.ensureAtLeastOneField(writer); | ||
|
||
writer.setValueCount(recordCount); | ||
// p.stop(); | ||
// System.out.println(String.format("Wrote %d records in %dms.", recordCount, p.elapsed(TimeUnit.MILLISECONDS))); | ||
|
||
updateRunningCount(); | ||
return recordCount; | ||
|
||
} catch (final Exception e) { | ||
handleAndRaise("Error parsing JSON", e); | ||
} | ||
// this is never reached | ||
return 0; | ||
jsonReader.ensureAtLeastOneField(writer); | ||
writer.setValueCount(recordCount); | ||
updateRunningCount(); | ||
return recordCount; | ||
} | ||
|
||
private void updateRunningCount() { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
import org.apache.drill.BaseTestQuery; | ||
import org.apache.drill.common.exceptions.UserException; | ||
import org.apache.drill.exec.proto.UserBitShared; | ||
import org.apache.drill.exec.ExecConstants; | ||
import org.junit.Test; | ||
import org.junit.Assert; | ||
|
||
|
@@ -179,4 +180,43 @@ public void testNestedFilter() throws Exception { | |
.sqlBaselineQuery(baselineQuery) | ||
.go(); | ||
} | ||
|
||
@Test // See DRILL-4653 | ||
public void testSkippingInvalidJSONRecords() throws Exception { | ||
try | ||
{ | ||
String set = "alter session set `" + ExecConstants.JSON_READER_SKIP_INVALID_RECORDS_FLAG+ "` = true"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these should be indented inside the try block with 2 spaces. It is best to set the indent level in your IDE (I can help with Eclipse if you are using it; if you are using IntelliJ I can find out from other developers using IntelliJ). |
||
String query = "select count(*) from cp.`jsoninput/DRILL-4653.json`"; | ||
testNoResult(set); | ||
testBuilder() | ||
.unOrdered() | ||
.sqlQuery(query) | ||
.sqlBaselineQuery(query) | ||
.build().run(); | ||
} | ||
finally | ||
{ | ||
String set = "alter session set `" + ExecConstants.JSON_READER_SKIP_INVALID_RECORDS_FLAG+ "` = false"; | ||
testNoResult(set); | ||
} | ||
} | ||
|
||
@Test // See DRILL-4653 | ||
public void testNotSkippingInvalidJSONRecords() throws Exception { | ||
try | ||
{ | ||
String query = "select count(*) from cp.`jsoninput/DRILL-4653.json`"; | ||
testBuilder() | ||
.unOrdered() | ||
.sqlQuery(query) | ||
.sqlBaselineQuery(query) | ||
.build().run(); | ||
} | ||
catch(Exception ex) | ||
{ | ||
// do nothing just return | ||
return; | ||
} | ||
throw new Exception("testNotSkippingInvalidJSONRecords"); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
{ "integer : 2010, | ||
"float" : 17.4, | ||
"x": { | ||
"y": "kevin", | ||
"z": "paul" | ||
}, | ||
"z": [ | ||
{"orange" : "yellow" , "pink": "red"}, | ||
{"pink" : "purple" } | ||
], | ||
"l": [4,2], | ||
"rl": [ [2,1], [4,6] ] | ||
} | ||
{ "integer : -2002, | ||
"float" : -1.2 | ||
} | ||
{ "integer" : 2001, | ||
"float" : 1.2, | ||
"x": { | ||
"y": "bill", | ||
"z": "peter" | ||
}, | ||
"z": [ | ||
{"pink" : "lilac" } | ||
], | ||
"l": [4,2], | ||
"rl": [ [2,1], [4,6] ] | ||
} | ||
{ "integer" : 6005, | ||
"float" : 1.2, | ||
"x": { | ||
"y": "mike", | ||
"z": "mary" | ||
}, | ||
"z": [ | ||
{"orange" : "stucco" } | ||
], | ||
"l": [4,2], | ||
"rl": [ [2,1], [4,6] ] | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
seems this is still doing indent of 4. We use 2 spaces (see https://drill.apache.org/docs/apache-drill-contribution-guidelines/ scroll down to Step 2). Did it pass the mvn command line build without checkstyle violations ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Aman,
maven checkstyle:checkstyle did not report any errors before I did my last
check in. I have changed to reflect 2 spaces for indendation.
On Thu, Jun 16, 2016 at 2:22 PM, Aman Sinha notifications@github.com
wrote: