From 09e9e1240d5cecca8f1811ea789931411aef8582 Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Thu, 28 Sep 2017 11:47:01 -0700 Subject: [PATCH] ORC-241. Make an unknown file format an exception. Fixes #173 Signed-off-by: Owen O'Malley --- .../apache/orc/UnknownFormatException.java | 52 +++++++++++++++++++ .../java/org/apache/orc/impl/ReaderImpl.java | 41 ++++++--------- .../org/apache/orc/TestVectorOrcFile.java | 15 ++++++ 3 files changed, 82 insertions(+), 26 deletions(-) create mode 100644 java/core/src/java/org/apache/orc/UnknownFormatException.java diff --git a/java/core/src/java/org/apache/orc/UnknownFormatException.java b/java/core/src/java/org/apache/orc/UnknownFormatException.java new file mode 100644 index 0000000000..9ebdae7cdf --- /dev/null +++ b/java/core/src/java/org/apache/orc/UnknownFormatException.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.orc; + +import com.google.protobuf.TextFormat; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; + +/** + * An IOException whose message states that a file was written by a future ORC + * version and is not readable by this version of ORC. It keeps the path, the + * version string and the postscript it was constructed with. + */ +public class UnknownFormatException extends IOException { + private final Path path; + private final String versionString; + private final OrcProto.PostScript postscript; + + /** + * Build the exception; the message combines the path, the version string + * and TextFormat.shortDebugString of the postscript. + * @param path the file that could not be read + * @param versionString the version reported for the file + * @param postscript the postscript of the file + */ + public UnknownFormatException(Path path, String versionString, + OrcProto.PostScript postscript) { + super(path + " was written by a future ORC version " + + versionString + ". This file is not readable by this version of ORC.\n"+ + "Postscript: " + TextFormat.shortDebugString(postscript)); + this.path = path; + this.versionString = versionString; + this.postscript = postscript; + } + + /** + * @return the path given to the constructor + */ + public Path getPath() { + return path; + } + + /** + * @return the version string given to the constructor + */ + public String getVersionString() { + return versionString; + } + + /** + * @return the postscript given to the constructor + */ + public OrcProto.PostScript getPostscript() { + return postscript; + } +} diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java index 130048bb5d..e3f45ec7c1 100644 --- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -21,7 +21,6 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.List; @@ -40,6 +39,7 @@ import org.apache.orc.FileFormatException; import org.apache.orc.StripeInformation; import org.apache.orc.StripeStatistics; +import org.apache.orc.UnknownFormatException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -135,7 +135,7 @@ public long getNumberOfRows() { @Override public List getMetadataKeys() { - List result = new ArrayList(); + List result = new ArrayList<>(); for(OrcProto.UserMetadataItem item: userMetadata) { result.add(item.getName()); } @@ -244,7 +244,6 @@ public TypeDescription getSchema() { * @param path the filename for error messages * @param psLen the postscript length * @param buffer the tail of the file - * @throws IOException */ protected static void ensureOrcFooter(FSDataInputStream in, Path path, @@ -277,7 +276,6 @@ protected static void ensureOrcFooter(FSDataInputStream in, * files or RC files as ORC files. * @param psLen the postscript length * @param buffer the tail of the file - * @throws IOException */ protected static void ensureOrcFooter(ByteBuffer buffer, int psLen) throws IOException { int magicLength = OrcFile.MAGIC.length(); @@ -317,25 +315,16 @@ private static String versionString(List version) { /** * Check to see if this ORC file is from a future version and if so, - * warn the user that we may not be able to read all of the column encodings. + * throw an UnknownFormatException describing the file. - * @param log the logger to write any error message to * @param path the data source path for error messages - * @param version the version of hive that wrote the file. 
+ * @param postscript the parsed postscript + * @throws UnknownFormatException if getFileVersion reports the postscript's + * version list as OrcFile.Version.FUTURE + */ - protected static void checkOrcVersion(Logger log, Path path, - List version) { - if (version.size() >= 1) { - int major = version.get(0); - int minor = 0; - if (version.size() >= 2) { - minor = version.get(1); - } - if (major > OrcFile.Version.CURRENT.getMajor() || - (major == OrcFile.Version.CURRENT.getMajor() && - minor > OrcFile.Version.CURRENT.getMinor())) { - log.warn(path + " was written by a future Hive version " + - versionString(version) + - ". This file may not be readable by this version of Hive."); - } + protected static void checkOrcVersion(Path path, + OrcProto.PostScript postscript + ) throws IOException { + List version = postscript.getVersionList(); + if (getFileVersion(version) == OrcFile.Version.FUTURE) { + throw new UnknownFormatException(path, versionString(version), + postscript); } } @@ -343,7 +332,6 @@ protected static void checkOrcVersion(Logger log, Path path, * Constructor that let's the user specify additional options. * @param path pathname for file * @param options options for reading - * @throws IOException */ public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException { FileSystem fs = options.getFilesystem(); @@ -378,6 +366,7 @@ public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException { tail = extractFileTail(fs, path, options.getMaxLength()); options.orcTail(tail); } else { + checkOrcVersion(path, orcTail.getPostScript()); tail = orcTail; } this.compressionKind = tail.getCompressionKind(); @@ -440,7 +429,7 @@ private static OrcProto.PostScript extractPostScript(ByteBuffer bb, Path path, CodedInputStream in = CodedInputStream.newInstance( bb.array(), bb.arrayOffset() + psAbsOffset, psLen); OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in); - checkOrcVersion(LOG, path, ps.getVersionList()); + checkOrcVersion(path, ps); // Check compression codec. 
switch (ps.getCompression()) { @@ -472,7 +461,7 @@ public static OrcTail extractFileTail(ByteBuffer buffer, long fileLength, long m OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(psBuffer); int footerSize = (int) ps.getFooterLength(); CompressionKind kind = CompressionKind.valueOf(ps.getCompression().name()); - OrcProto.FileTail.Builder fileTailBuilder = null; + OrcProto.FileTail.Builder fileTailBuilder; CompressionCodec codec = OrcCodecPool.getCodec(kind); try { OrcProto.Footer footer = extractFooter(buffer, @@ -603,7 +592,7 @@ protected OrcTail extractFileTail(FileSystem fs, Path path, buffer.position(footerOffset); ByteBuffer footerBuffer = buffer.slice(); buffer.reset(); - OrcProto.Footer footer = null; + OrcProto.Footer footer; CompressionCodec codec = OrcCodecPool.getCodec(compressionKind); try { footer = extractFooter(footerBuffer, 0, footerSize, codec, bufferSize); diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java index 6fe132b711..62e3c05460 100644 --- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java +++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java @@ -3274,4 +3274,19 @@ public void testZeroByteOrcFile() throws Exception { TypeDescription.fromString("struct<>").createRowBatch(); assertEquals(false, reader.rows().nextBatch(batch)); } + + /** + * Reading the example file version1999.orc must fail with an + * UnknownFormatException that reports the path, the version string 19.99 + * and the file's postscript. + */ + @Test + public void testFutureOrcFile() throws Exception { + Path futureFile = new Path(exampleDir, "version1999.orc"); + try { + // createReader is expected to throw, so its result is not kept + OrcFile.createReader(futureFile, OrcFile.readerOptions(conf)); + assertTrue("no exception for bad version", false); + } catch (UnknownFormatException uf) { + assertEquals("path is correct", "version1999.orc", uf.getPath().getName()); + assertEquals("19.99", uf.getVersionString()); + OrcProto.PostScript ps = uf.getPostscript(); + assertEquals("ORC", ps.getMagic()); + assertEquals(OrcProto.CompressionKind.NONE, ps.getCompression()); + } + } }