New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DRILL-4653.json - Malformed JSON should not stop the entire query from progressing #518
Changes from 1 commit
4fc70fa
8d5e059
56a16fe
e5a9a5b
5d5bca4
0ed75ef
6ed7330
45de6f7
8b492eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,6 +64,8 @@ public class JSONRecordReader extends AbstractRecordReader { | |
private final boolean enableAllTextMode; | ||
private final boolean readNumbersAsDouble; | ||
private final boolean unionEnabled; | ||
private int parseErrorCount; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should be long instead of int, since parseErrorCount is cumulative; in the worst case it could be as large as runningRecordCount. |
||
private final boolean skipMalformedJSONRecords; | ||
|
||
/** | ||
* Create a JSON Record Reader that uses a file based input stream. | ||
|
@@ -114,6 +116,7 @@ private JSONRecordReader(final FragmentContext fragmentContext, final String inp | |
this.enableAllTextMode = embeddedContent == null && fragmentContext.getOptions().getOption(ExecConstants.JSON_READER_ALL_TEXT_MODE_VALIDATOR); | ||
this.readNumbersAsDouble = embeddedContent == null && fragmentContext.getOptions().getOption(ExecConstants.JSON_READ_NUMBERS_AS_DOUBLE_VALIDATOR); | ||
this.unionEnabled = embeddedContent == null && fragmentContext.getOptions().getOption(ExecConstants.ENABLE_UNION_TYPE); | ||
this.skipMalformedJSONRecords = fragmentContext.getOptions().getOption(ExecConstants.JSON_SKIP_MALFORMED_RECORDS_VALIDATOR); | ||
setColumns(columns); | ||
} | ||
|
||
|
@@ -122,7 +125,8 @@ public String toString() { | |
return super.toString() | ||
+ "[hadoopPath = " + hadoopPath | ||
+ ", recordCount = " + recordCount | ||
+ ", runningRecordCount = " + runningRecordCount + ", ...]"; | ||
+ ", parseErrorCount = " + parseErrorCount | ||
+ ", runningRecordCount = " + runningRecordCount + ", ...]"; | ||
} | ||
|
||
@Override | ||
|
@@ -189,26 +193,33 @@ private long currentRecordNumberInFile() { | |
public int next() { | ||
writer.allocate(); | ||
writer.reset(); | ||
|
||
recordCount = 0; | ||
ReadState write = null; | ||
// Stopwatch p = new Stopwatch().start(); | ||
try{ | ||
outside: while(recordCount < DEFAULT_ROWS_PER_BATCH) { | ||
writer.setPosition(recordCount); | ||
write = jsonReader.write(writer); | ||
|
||
if(write == ReadState.WRITE_SUCCEED) { | ||
// try | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please remove these commented-out lines. |
||
// { | ||
outside: while(recordCount < DEFAULT_ROWS_PER_BATCH){ | ||
try{ | ||
writer.setPosition(recordCount); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It seems this is still using an indent of 4. We use 2 spaces (see https://drill.apache.org/docs/apache-drill-contribution-guidelines/ and scroll down to Step 2). Did it pass the mvn command-line build without checkstyle violations? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Aman, On Thu, Jun 16, 2016 at 2:22 PM, Aman Sinha notifications@github.com
|
||
write = jsonReader.write(writer); | ||
if(write == ReadState.WRITE_SUCCEED) { | ||
// logger.debug("Wrote record."); | ||
recordCount++; | ||
}else{ | ||
recordCount++; | ||
}else{ | ||
// logger.debug("Exiting."); | ||
break outside; | ||
} | ||
|
||
break outside; | ||
} | ||
} | ||
catch(Exception ex) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Minor style convention: can you put the catch() on the previous line, next to the closing brace of the try block? |
||
{ | ||
if(skipMalformedJSONRecords == false){ | ||
handleAndRaise("Error parsing JSON", ex); | ||
} | ||
++parseErrorCount; | ||
} | ||
} | ||
|
||
jsonReader.ensureAtLeastOneField(writer); | ||
jsonReader.ensureAtLeastOneField(writer); | ||
|
||
writer.setValueCount(recordCount); | ||
// p.stop(); | ||
|
@@ -217,11 +228,11 @@ public int next() { | |
updateRunningCount(); | ||
return recordCount; | ||
|
||
} catch (final Exception e) { | ||
handleAndRaise("Error parsing JSON", e); | ||
} | ||
// } catch (final Exception e) { | ||
// handleAndRaise("Error parsing JSON", e); | ||
// } | ||
// this is never reached | ||
return 0; | ||
//return 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please uncomment this (it is best practice, since the function has a return type). |
||
} | ||
|
||
private void updateRunningCount() { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
import org.apache.drill.BaseTestQuery; | ||
import org.apache.drill.common.exceptions.UserException; | ||
import org.apache.drill.exec.proto.UserBitShared; | ||
import org.apache.drill.exec.ExecConstants; | ||
import org.junit.Test; | ||
import org.junit.Assert; | ||
|
||
|
@@ -179,4 +180,27 @@ public void testNestedFilter() throws Exception { | |
.sqlBaselineQuery(baselineQuery) | ||
.go(); | ||
} | ||
|
||
@Test // See DRILL-4653 | ||
public void testSkippingInvalidJSONRecords() throws Exception { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For both of these tests, could you please use the testBuilder() framework? This is the recommended way to write unit tests; you can see an example in one of the other tests in this file. |
||
String set = "alter session set `" + ExecConstants.JSON_READER_SKIP_MALFORMED_RECORDS_FLAG+ "` = true"; | ||
testNoResult(set); | ||
test("select count(*) from cp.`jsoninput/DRILL-4653.json`"); | ||
set = "alter session set `" + ExecConstants.JSON_READER_SKIP_MALFORMED_RECORDS_FLAG+ "` = false"; | ||
testNoResult(set); | ||
} | ||
|
||
@Test // See DRILL-4653 | ||
public void testNotSkippingInvalidJSONRecords() throws Exception { | ||
try | ||
{ | ||
test("select count(*) from cp.`jsoninput/DRILL-4653.json`"); | ||
} | ||
catch(Exception ex) | ||
{ | ||
// do nothing just return | ||
return; | ||
} | ||
throw new Exception("testNotSkippingInvalidJSONRecords"); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
{ "integer" : 2010, | ||
"float" : 17.4, | ||
"x": { | ||
"y": "kevin", | ||
"z": "paul" | ||
}, | ||
"z": [ | ||
{"orange" : "yellow" , "pink": "red"}, | ||
{"pink" : "purple" } | ||
], | ||
"l": [4,2], | ||
"rl": [ [2,1], [4,6] ] | ||
} | ||
{ "integer : -2002, | ||
"float" : -1.2 | ||
} | ||
{ "integer" : 2001, | ||
"float" : 1.2, | ||
"x": { | ||
"y": "bill", | ||
"z": "peter" | ||
}, | ||
"z": [ | ||
{"pink" : "lilac" } | ||
], | ||
"l": [4,2], | ||
"rl": [ [2,1], [4,6] ] | ||
} | ||
{ "integer" : 6005, | ||
"float" : 1.2, | ||
"x": { | ||
"y": "mike", | ||
"z": "mary" | ||
}, | ||
"z": [ | ||
{"orange" : "stucco" } | ||
], | ||
"l": [4,2], | ||
"rl": [ [2,1], [4,6] ] | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you change this to 'skip_invalid_records' so that the name is somewhat consistent with the similar future option in DRILL-3764? In the future, the JSON option would likely be subsumed by the new global option.