From 014ee16da477fe21c498c7037366f046511a233c Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Tue, 6 Dec 2016 09:34:55 -0800 Subject: [PATCH] ORC-116. Add examples of reading and writing ORC files from Java. Fixes #269 Signed-off-by: Owen O'Malley --- java/core/pom.xml | 1 - java/examples/pom.xml | 116 ++++++++++++++++++ java/examples/src/assembly/uber.xml | 33 +++++ java/examples/src/findbugs/exclude.xml | 19 +++ .../apache/orc/examples/AdvancedWriter.java | 98 +++++++++++++++ .../org/apache/orc/examples/CoreReader.java | 68 ++++++++++ .../org/apache/orc/examples/CoreWriter.java | 63 ++++++++++ .../java/org/apache/orc/examples/Driver.java | 104 ++++++++++++++++ java/pom.xml | 37 ++++-- java/shims/pom.xml | 6 + 10 files changed, 534 insertions(+), 11 deletions(-) create mode 100644 java/examples/pom.xml create mode 100644 java/examples/src/assembly/uber.xml create mode 100644 java/examples/src/findbugs/exclude.xml create mode 100644 java/examples/src/java/org/apache/orc/examples/AdvancedWriter.java create mode 100644 java/examples/src/java/org/apache/orc/examples/CoreReader.java create mode 100644 java/examples/src/java/org/apache/orc/examples/CoreWriter.java create mode 100644 java/examples/src/java/org/apache/orc/examples/Driver.java diff --git a/java/core/pom.xml b/java/core/pom.xml index bb35070f27..dc3655c024 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -58,7 +58,6 @@ org.apache.hadoop hadoop-hdfs - provided org.apache.hive diff --git a/java/examples/pom.xml b/java/examples/pom.xml new file mode 100644 index 0000000000..762709c303 --- /dev/null +++ b/java/examples/pom.xml @@ -0,0 +1,116 @@ + + + + 4.0.0 + + org.apache.orc + orc + 1.6.0-SNAPSHOT + ../pom.xml + + + orc-examples + jar + ORC Examples + + + + + + + + org.apache.orc + orc-core + + + + + com.google.guava + guava + runtime + + + commons-cli + commons-cli + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + + + org.apache.hadoop + hadoop-hdfs + ${hadoop.version} + + + 
org.apache.hive + hive-storage-api + + + + + ${basedir}/src/java + ${basedir}/src/test + + + ${basedir}/src/test/resources + + + + + org.codehaus.mojo + build-helper-maven-plugin + + + org.apache.maven.plugins + maven-compiler-plugin + + + maven-assembly-plugin + + + + org.apache.orc.examples.Driver + + + + src/assembly/uber.xml + + + + + make-assembly + package + + single + + + + + + + + + + cmake + + ${build.dir}/examples + + + + diff --git a/java/examples/src/assembly/uber.xml b/java/examples/src/assembly/uber.xml new file mode 100644 index 0000000000..014eab951b --- /dev/null +++ b/java/examples/src/assembly/uber.xml @@ -0,0 +1,33 @@ + + + uber + + jar + + false + + + / + true + true + runtime + + + + + metaInf-services + + + diff --git a/java/examples/src/findbugs/exclude.xml b/java/examples/src/findbugs/exclude.xml new file mode 100644 index 0000000000..fd215c0e8f --- /dev/null +++ b/java/examples/src/findbugs/exclude.xml @@ -0,0 +1,19 @@ + + + + + + + diff --git a/java/examples/src/java/org/apache/orc/examples/AdvancedWriter.java b/java/examples/src/java/org/apache/orc/examples/AdvancedWriter.java new file mode 100644 index 0000000000..ced67d3cf3 --- /dev/null +++ b/java/examples/src/java/org/apache/orc/examples/AdvancedWriter.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.examples; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.orc.OrcFile; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +/** + * This example shows how to write compound data types in ORC. + * + */ +public class AdvancedWriter { + public static void main(Configuration conf, String[] args) throws IOException { + Path testFilePath = new Path("advanced-example.orc"); + + TypeDescription schema = + TypeDescription.fromString("struct<first:int,second:int,third:map<string,int>>"); + + Writer writer = + OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf).setSchema(schema)); + + VectorizedRowBatch batch = schema.createRowBatch(); + LongColumnVector first = (LongColumnVector) batch.cols[0]; + LongColumnVector second = (LongColumnVector) batch.cols[1]; + + // Define the map column. 
Its key and value child vectors must be cast as well + MapColumnVector map = (MapColumnVector) batch.cols[2]; + BytesColumnVector mapKey = (BytesColumnVector) map.keys; + LongColumnVector mapValue = (LongColumnVector) map.values; + + // Each map has 5 elements + final int MAP_SIZE = 5; + final int BATCH_SIZE = batch.getMaxSize(); + + // Ensure the map is big enough + mapKey.ensureSize(BATCH_SIZE * MAP_SIZE, false); + mapValue.ensureSize(BATCH_SIZE * MAP_SIZE, false); + + // add 1500 rows to file + for(int r=0; r < 1500; ++r) { + int row = batch.size++; + + first.vector[row] = r; + second.vector[row] = r * 3; + + map.offsets[row] = map.childCount; + map.lengths[row] = MAP_SIZE; + map.childCount += MAP_SIZE; + + for (int mapElem = (int) map.offsets[row]; + mapElem < map.offsets[row] + MAP_SIZE; ++mapElem) { + String key = "row " + r + "." + (mapElem - map.offsets[row]); + mapKey.setVal(mapElem, key.getBytes(StandardCharsets.UTF_8)); + mapValue.vector[mapElem] = mapElem; + } + if (row == BATCH_SIZE - 1) { + writer.addRowBatch(batch); + batch.reset(); + } + } + if (batch.size != 0) { + writer.addRowBatch(batch); + batch.reset(); + } + writer.close(); } + + public static void main(String[] args) throws IOException { + main(new Configuration(), args); + } +} \ No newline at end of file diff --git a/java/examples/src/java/org/apache/orc/examples/CoreReader.java b/java/examples/src/java/org/apache/orc/examples/CoreReader.java new file mode 100644 index 0000000000..7ee5f44027 --- /dev/null +++ b/java/examples/src/java/org/apache/orc/examples/CoreReader.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.examples; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; + +import java.io.IOException; + +public class CoreReader { + public static void main(Configuration conf, String[] args) throws IOException { + // Get the information from the file footer + Reader reader = OrcFile.createReader(new Path("my-file.orc"), + OrcFile.readerOptions(conf)); + System.out.println("File schema: " + reader.getSchema()); + System.out.println("Row count: " + reader.getNumberOfRows()); + + // Pick the schema we want to read using schema evolution + TypeDescription readSchema = + TypeDescription.fromString("struct<z:int,y:string,x:bigint>"); + // Read the row data + VectorizedRowBatch batch = readSchema.createRowBatch(); + RecordReader rowIterator = reader.rows(reader.options() + .schema(readSchema)); + LongColumnVector z = (LongColumnVector) batch.cols[0]; + BytesColumnVector y = (BytesColumnVector) batch.cols[1]; + LongColumnVector x = (LongColumnVector) batch.cols[2]; + while (rowIterator.nextBatch(batch)) { + for(int row=0; row < batch.size; ++row) { + int zRow = z.isRepeating ? 0: row; + int xRow = x.isRepeating ? 0: row; + System.out.println("z: " + + (z.noNulls || !z.isNull[zRow] ? 
z.vector[zRow] : null)); + System.out.println("y: " + y.toString(row)); + System.out.println("x: " + + (x.noNulls || !x.isNull[xRow] ? x.vector[xRow] : null)); + } + } + rowIterator.close(); + } + + public static void main(String[] args) throws IOException { + main(new Configuration(), args); + } +} \ No newline at end of file diff --git a/java/examples/src/java/org/apache/orc/examples/CoreWriter.java b/java/examples/src/java/org/apache/orc/examples/CoreWriter.java new file mode 100644 index 0000000000..77bb2c0ff5 --- /dev/null +++ b/java/examples/src/java/org/apache/orc/examples/CoreWriter.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.orc.examples; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.orc.OrcFile; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +public class CoreWriter { + public static void main(Configuration conf, String[] args) throws IOException { + TypeDescription schema = + TypeDescription.fromString("struct<x:int,y:string>"); + Writer writer = OrcFile.createWriter(new Path("my-file.orc"), + OrcFile.writerOptions(conf) + .setSchema(schema)); + VectorizedRowBatch batch = schema.createRowBatch(); + LongColumnVector x = (LongColumnVector) batch.cols[0]; + BytesColumnVector y = (BytesColumnVector) batch.cols[1]; + for(int r=0; r < 10000; ++r) { + int row = batch.size++; + x.vector[row] = r; + byte[] buffer = ("Last-" + (r * 3)).getBytes(StandardCharsets.UTF_8); + y.setRef(row, buffer, 0, buffer.length); + // If the batch is full, write it out and start over. + if (batch.size == batch.getMaxSize()) { + writer.addRowBatch(batch); + batch.reset(); + } + } + if (batch.size != 0) { + writer.addRowBatch(batch); + } + writer.close(); + } + + public static void main(String[] args) throws IOException { + main(new Configuration(), args); + } +} \ No newline at end of file diff --git a/java/examples/src/java/org/apache/orc/examples/Driver.java b/java/examples/src/java/org/apache/orc/examples/Driver.java new file mode 100644 index 0000000000..ca17b35857 --- /dev/null +++ b/java/examples/src/java/org/apache/orc/examples/Driver.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.examples; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.hadoop.conf.Configuration; + +import java.util.Map; +import java.util.Properties; + +/** + * Driver program for the java ORC examples. + */ +public class Driver { + + @SuppressWarnings("static-access") + static Options createOptions() { + Options result = new Options(); + + result.addOption("h", "help", false, "Print help message"); + result.addOption("D", "define", true, "Set a configuration property"); + return result; + } + + static class DriverOptions { + final CommandLine genericOptions; + final String command; + final String[] commandArgs; + + DriverOptions(String[] args) throws ParseException { + genericOptions = new DefaultParser().parse(createOptions(), args, true); + String[] unprocessed = genericOptions.getArgs(); + if (unprocessed.length == 0) { + command = null; + commandArgs = new String[0]; + } else { + command = unprocessed[0]; + if (genericOptions.hasOption('h')) { + commandArgs = new String[]{"-h"}; + } else { + commandArgs = new String[unprocessed.length - 1]; + System.arraycopy(unprocessed, 1, commandArgs, 0, commandArgs.length); + } + } + } + } + + public static void main(String[] args) throws Exception { + DriverOptions options = new DriverOptions(args); + + if (options.command == null) { + System.err.println("ORC Java Examples"); + System.err.println(); + System.err.println("usage: java -jar orc-examples-*.jar [--help]" + + " [--define X=Y] <command> <args>"); + 
System.err.println(); + System.err.println("Commands:"); + System.err.println(" write - write a sample ORC file"); + System.err.println(" read - read a sample ORC file"); + System.err.println(" write2 - write a sample ORC file with a map"); + System.err.println(); + System.err.println("To get more help, provide -h to the command"); + System.exit(1); + } + Configuration conf = new Configuration(); + String[] confSettings = options.genericOptions.getOptionValues("D"); + if (confSettings != null) { + for (String param : confSettings) { + String[] parts = param.split("=", 2); + conf.set(parts[0], parts.length == 2 ? parts[1] : ""); // a bare "-D key" defines key as empty, like java -Dkey + } + } + if ("read".equals(options.command)) { + CoreReader.main(conf, options.commandArgs); + } else if ("write".equals(options.command)) { + CoreWriter.main(conf, options.commandArgs); + } else if ("write2".equals(options.command)) { + AdvancedWriter.main(conf, options.commandArgs); + } else { + System.err.println("Unknown subcommand: " + options.command); + System.exit(1); + } + } +} diff --git a/java/pom.xml b/java/pom.xml index 67e20f84a8..d43466b11b 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -58,6 +58,7 @@ core mapreduce tools + examples @@ -324,19 +325,19 @@ org.codehaus.mojo findbugs-maven-plugin - + org.apache.rat apache-rat-plugin - - + + benchmark - bench + bench @@ -348,6 +349,16 @@ org.apache.orc orc-shims 1.6.0-SNAPSHOT + + + org.apache.hadoop + hadoop-common + + + org.apache.hadoop + hadoop-hdfs + + org.apache.orc @@ -392,8 +403,8 @@ 3.9 - commons-beanutils - commons-beanutils + commons-beanutils + commons-beanutils @@ -418,8 +429,8 @@ 0.10 - io.airlift - slice + io.airlift + slice @@ -442,8 +453,8 @@ jersey-json - commons-beanutils - commons-beanutils-core + commons-beanutils + commons-beanutils-core commons-daemon @@ -585,6 +596,12 @@ org.apache.hive hive-storage-api ${storage-api.version} + + + org.apache.hadoop + hadoop-hdfs + + org.apache.zookeeper diff --git a/java/shims/pom.xml b/java/shims/pom.xml index 5777ff9b23..b6d3b730e4 100644 --- 
a/java/shims/pom.xml +++ b/java/shims/pom.xml @@ -40,11 +40,17 @@ org.apache.hadoop hadoop-common ${hadoop.version} + provided org.apache.hadoop hadoop-hdfs ${hadoop.version} + provided + + + org.slf4j + slf4j-api