-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PARQUET-227 Enforce that unions have only 1 set value, tolerate bad records in read path
See https://issues.apache.org/jira/browse/PARQUET-227 Author: Alex Levenson <alexlevenson@twitter.com> Closes #153 from isnotinvain/alexlevenson/double-union and squashes the following commits: ef4d36f [Alex Levenson] fix package names e201deb [Alex Levenson] Merge branch 'master' into alexlevenson/double-union 01694fa [Alex Levenson] Forgot a break in a switch statement 2f31321 [Alex Levenson] Merge branch 'master' into alexlevenson/double-union 9292274 [Alex Levenson] Add in ShouldNeverHappenException which I forgot to check in 8d61515 [Alex Levenson] Address first round of comments 4d71bcb [Alex Levenson] Merge branch 'master' into alexlevenson/double-union 8f9334c [Alex Levenson] Some cleanup and fixes 8153bc9 [Alex Levenson] Enforce that unions have only 1 set value, tolerate bad records in read path
- Loading branch information
1 parent
b287d35
commit 9993450
Showing
15 changed files
with
665 additions
and
35 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
40 changes: 40 additions & 0 deletions
40
parquet-common/src/main/java/org/apache/parquet/ShouldNeverHappenException.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
package org.apache.parquet; | ||
|
||
/** | ||
* Used in code blocks that should be unreachable, but the compiler does | ||
* not know this, for example the default clause of an exhaustive switch statement. | ||
*/ | ||
public class ShouldNeverHappenException extends ParquetRuntimeException { | ||
public ShouldNeverHappenException() { | ||
} | ||
|
||
public ShouldNeverHappenException(String message, Throwable cause) { | ||
super(message, cause); | ||
} | ||
|
||
public ShouldNeverHappenException(String message) { | ||
super(message); | ||
} | ||
|
||
public ShouldNeverHappenException(Throwable cause) { | ||
super(cause); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
87 changes: 87 additions & 0 deletions
87
parquet-hadoop/src/main/java/org/apache/parquet/hadoop/UnmaterializableRecordCounter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
package org.apache.parquet.hadoop; | ||
|
||
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.parquet.Log; | ||
import org.apache.parquet.io.ParquetDecodingException; | ||
import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; | ||
|
||
// Essentially taken from: | ||
// https://github.com/twitter/elephant-bird/blob/master/core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoRecordReader.java#L124 | ||
|
||
/** | ||
* Tracks number of records that cannot be materialized and throws ParquetDecodingException | ||
* if the rate of errors crosses a limit.<p> These types of errors are meant | ||
* to be recoverable record conversion errors, such as a union missing a value, or schema | ||
* mismatch and so on. It's not meant to recover from corruptions in the parquet | ||
* columns themselves. | ||
* | ||
* The intention is to skip over very rare file corruption or bugs where | ||
* the write path has allowed invalid records into the file, but still catch large | ||
* numbers of failures. Not turned on by default (by default, no errors are tolerated). | ||
*/ | ||
public class UnmaterializableRecordCounter { | ||
|
||
/* Tolerated percent bad records */ | ||
public static final String BAD_RECORD_THRESHOLD_CONF_KEY = "parquet.read.bad.record.threshold"; | ||
|
||
private static final Log LOG = Log.getLog(UnmaterializableRecordCounter.class); | ||
|
||
private static final float DEFAULT_THRESHOLD = 0f; | ||
|
||
private long numErrors; | ||
|
||
private final double errorThreshold; // max fraction of errors allowed | ||
private final long totalNumRecords; // how many records are we going to see total? | ||
|
||
public UnmaterializableRecordCounter(Configuration conf, long totalNumRecords) { | ||
this( | ||
conf.getFloat(BAD_RECORD_THRESHOLD_CONF_KEY, DEFAULT_THRESHOLD), | ||
totalNumRecords | ||
); | ||
} | ||
|
||
public UnmaterializableRecordCounter(double errorThreshold, long totalNumRecords) { | ||
this.errorThreshold = errorThreshold; | ||
this.totalNumRecords = totalNumRecords; | ||
numErrors = 0; | ||
} | ||
|
||
public void incErrors(RecordMaterializationException cause) throws ParquetDecodingException { | ||
numErrors++; | ||
|
||
LOG.warn(String.format("Error while reading an input record (%s out of %s): ", | ||
numErrors, totalNumRecords), cause); | ||
|
||
if (numErrors > 0 && errorThreshold <= 0) { // no errors are tolerated | ||
throw new ParquetDecodingException("Error while decoding records", cause); | ||
} | ||
|
||
double errRate = numErrors/(double)totalNumRecords; | ||
|
||
if (errRate > errorThreshold) { | ||
String message = String.format("Decoding error rate of at least %s/%s crosses configured threshold of %s", | ||
numErrors, totalNumRecords, errorThreshold); | ||
LOG.error(message); | ||
throw new ParquetDecodingException(message, cause); | ||
} | ||
} | ||
} |
69 changes: 69 additions & 0 deletions
69
parquet-scrooge/src/test/java/org/apache/parquet/scrooge/TestCorruptScroogeRecords.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.apache.parquet.scrooge; | ||
|
||
import java.io.ByteArrayInputStream; | ||
import java.io.ByteArrayOutputStream; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
import org.apache.hadoop.fs.Path; | ||
import org.apache.hadoop.mapreduce.Job; | ||
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; | ||
import org.apache.thrift.protocol.TBinaryProtocol.Factory; | ||
import org.apache.thrift.protocol.TProtocol; | ||
import org.apache.thrift.transport.TIOStreamTransport; | ||
|
||
import org.apache.parquet.hadoop.thrift.TestCorruptThriftRecords; | ||
import org.apache.parquet.hadoop.thrift.ThriftReadSupport; | ||
import org.apache.parquet.scrooge.test.StructWithUnionV2; | ||
import org.apache.parquet.scrooge.test.StructWithUnionV2$; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
public class TestCorruptScroogeRecords extends TestCorruptThriftRecords { | ||
|
||
@Override | ||
public void setupJob(Job job, Path path) throws Exception { | ||
job.setInputFormatClass(ParquetScroogeInputFormat.class); | ||
ParquetScroogeInputFormat.setInputPaths(job, path); | ||
ParquetScroogeInputFormat.setThriftClass(job.getConfiguration(), StructWithUnionV2.class); | ||
|
||
|
||
ThriftReadSupport.setRecordConverterClass(job.getConfiguration(), ScroogeRecordConverter.class); | ||
|
||
job.setMapperClass(ReadMapper.class); | ||
job.setNumReduceTasks(0); | ||
job.setOutputFormatClass(NullOutputFormat.class); | ||
} | ||
|
||
@Override | ||
protected void assertEqualsExcepted(List<org.apache.parquet.thrift.test.compat.StructWithUnionV2> expected, List<Object> found) throws Exception { | ||
List<StructWithUnionV2> scroogeExpected = new ArrayList<StructWithUnionV2>(); | ||
for (org.apache.parquet.thrift.test.compat.StructWithUnionV2 tbase : expected) { | ||
ByteArrayOutputStream baos = new ByteArrayOutputStream(); | ||
TProtocol out = new Factory().getProtocol(new TIOStreamTransport(baos)); | ||
tbase.write(out); | ||
TProtocol in = new Factory().getProtocol(new TIOStreamTransport(new ByteArrayInputStream(baos.toByteArray()))); | ||
scroogeExpected.add(StructWithUnionV2$.MODULE$.decode(in)); | ||
} | ||
assertEquals(scroogeExpected, found); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.