From f0ec71231768e254d752f0a0c0817ec6468af9b7 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 11 Apr 2018 13:40:22 -0700 Subject: [PATCH 1/2] add docs for c++ --- site/_data/docs.yml | 4 +- site/_docs/core-cpp.md | 278 +++++++++++++++++++++++++ site/_docs/cpp-tools.md | 268 ++++++++++++++++++++++++ site/_docs/{tools.md => java-tools.md} | 97 +-------- 4 files changed, 558 insertions(+), 89 deletions(-) create mode 100644 site/_docs/core-cpp.md create mode 100644 site/_docs/cpp-tools.md rename site/_docs/{tools.md => java-tools.md} (76%) diff --git a/site/_data/docs.yml b/site/_data/docs.yml index 70087af7c7..9730ac3eba 100644 --- a/site/_data/docs.yml +++ b/site/_data/docs.yml @@ -24,10 +24,12 @@ - title: Using ORC Core docs: - core-java + - core-cpp - title: Tools docs: - - tools + - cpp-tools + - java-tools - title: Format Specification docs: diff --git a/site/_docs/core-cpp.md b/site/_docs/core-cpp.md new file mode 100644 index 0000000000..038d0462c2 --- /dev/null +++ b/site/_docs/core-cpp.md @@ -0,0 +1,278 @@ +--- +layout: docs +title: Using Core C++ +permalink: /docs/core-cpp.html +--- + +The C++ Core ORC API reads and writes ORC files into its own +orc::ColumnVectorBatch vectorized classes. + +## Vectorized Row Batch + +Data is passed to ORC as instances of orc::ColumnVectorBatch +that contain the data for a batch of rows. The focus is on speed and +accessing the data fields directly. `numElements` is the number +of rows. ColumnVectorBatch is the parent type of the different +kinds of columns and has some fields that are shared across +all of the column types. In particular, the `hasNulls` flag +indicates if there is any null in this column for this batch. For columns +where `hasNulls == true` the `notNull` buffer is false if that +value is null.
+ +~~~ cpp +namespace orc { + struct ColumnVectorBatch { + // the number of current occupied slots + uint64_t numElements; + // an array of capacity length marking non-null values + DataBuffer notNull; + // whether there are any null values + bool hasNulls; + ... + } +} +~~~ + +The subtypes of ColumnVectorBatch are: + +| ORC Type | ColumnVectorBatch | +| -------- | ------------- | +| array | ListVectorBatch | +| binary | StringVectorBatch | +| bigint | LongVectorBatch | +| boolean | LongVectorBatch | +| char | StringVectorBatch | +| date | LongVectorBatch | +| decimal | Decimal64VectorBatch, Decimal128VectorBatch | +| double | DoubleVectorBatch | +| float | DoubleVectorBatch | +| int | LongVectorBatch | +| map | MapVectorBatch | +| smallint | LongVectorBatch | +| string | StringVectorBatch | +| struct | StructVectorBatch | +| timestamp | TimestampVectorBatch | +| tinyint | LongVectorBatch | +| uniontype | UnionVectorBatch | +| varchar | StringVectorBatch | + +LongVectorBatch handles all of the integer types (boolean, bigint, +date, int, smallint, and tinyint). The data is represented as a +buffer of int64_t where each value is sign-extended as necessary. + +~~~ cpp + struct LongVectorBatch: public ColumnVectorBatch { + DataBuffer data; + ... + }; +~~~ + +TimestampVectorBatch handles timestamp values. The data is +represented as two buffers of int64_t for seconds and nanoseconds +respectively. Note that we always assume data is in GMT timezone; +therefore it is user's responsibility to convert wall clock time +from local timezone to GMT. + +~~~ cpp + struct TimestampVectorBatch: public ColumnVectorBatch { + DataBuffer data; + DataBuffer nanoseconds; + ... + }; +~~~ + +DoubleVectorBatch handles all of the floating point types +(double, and float). The data is represented as a buffer of doubles. + +~~~ cpp + struct DoubleVectorBatch: public ColumnVectorBatch { + DataBuffer data; + ... 
+ }; +~~~ + +Decimal64VectorBatch handles decimal columns with precision no +greater than 18. Decimal128VectorBatch handles the others. The data +is represented as a buffer of int64_t and orc::Int128 respectively. + +~~~ cpp + struct Decimal64VectorBatch: public ColumnVectorBatch { + DataBuffer values; + ... + }; + + struct Decimal128VectorBatch: public ColumnVectorBatch { + DataBuffer values; + ... + }; +~~~ + +StringVectorBatch handles all of the binary types (binary, +char, string, and varchar). The data is represented as a char* buffer, +and a length buffer. + +~~~ cpp + struct StringVectorBatch: public ColumnVectorBatch { + DataBuffer data; + DataBuffer length; + ... + }; +~~~ + +StructVectorBatch handles the struct columns and represents +the data as a buffer of `ColumnVectorBatch`. + +~~~ cpp + struct StructVectorBatch: public ColumnVectorBatch { + std::vector fields; + ... + }; +~~~ + +UnionVectorBatch handles the union columns. It uses `tags` +to indicate which subtype has the value and `offsets` indicates +the offset in child batch of that subtype. An individual +`ColumnVectorBatch` is used for each subtype. + +~~~ cpp + struct UnionVectorBatch: public ColumnVectorBatch { + DataBuffer tags; + DataBuffer offsets; + std::vector children; + ... + }; +~~~ + +ListVectorBatch handles the array columns and represents +the data as a buffer of integers for the offsets and a +`ColumnVectorBatch` for the children values. + +~~~ cpp + struct ListVectorBatch: public ColumnVectorBatch { + /** + * The offset of the first element of each list. + * The length of list i is startOffset[i+1] - startOffset[i]. + */ + DataBuffer offsets; + // the concatenated elements + ORC_UNIQUE_PTR elements; + ... + }; +~~~ + +MapVectorBatch handles the map columns and represents the data +as a buffer of integers for the offsets and two `ColumnVectorBatch`s +for the keys and values.
+ +~~~ cpp + struct MapVectorBatch: public ColumnVectorBatch { + /** + * The offset of the first element of each list. + * The length of list i is startOffset[i+1] - startOffset[i]. + */ + DataBuffer offsets; + ORC_UNIQUE_PTR keys; + ORC_UNIQUE_PTR elements; + ... + }; +~~~ + +## Writing ORC Files + +To write an ORC file, you need to include `OrcFile.hh` and define +the schema; then use `orc::OutputStream` and `orc::WriterOptions` +to create an `orc::Writer` with the desired filename. This example +sets the required schema parameter, but there are many other +options to control the ORC writer. + +~~~ cpp +ORC_UNIQUE_PTR outStream = + writeLocalFile("my-file.orc"); +ORC_UNIQUE_PTR schema( + Type::buildTypeFromString("struct")); +WriterOptions options; +ORC_UNIQUE_PTR writer = + createWriter(*schema, outStream.get(), options); +~~~ + +Now you need to create a row batch, set the data, and write it to the file +as the batch fills up. When the file is done, close the `Writer`. + +~~~ cpp +uint64_t batchSize = 1024, rowCount = 10000; +ORC_UNIQUE_PTR batch = + writer->createRowBatch(batchSize); +StructVectorBatch *root = + dynamic_cast(batch.get()); +LongVectorBatch *x = + dynamic_cast(root->fields[0]); +LongVectorBatch *y = + dynamic_cast(root->fields[1]); + +uint64_t rows = 0; +for (uint64_t i = 0; i < rowCount; ++i) { + x->data[rows] = i; + y->data[rows] = i * 3; + rows++; + + if (rows == batchSize) { + root->numElements = rows; + x->numElements = rows; + y->numElements = rows; + + writer->add(*batch); + rows = 0; + } +} + +if (rows != 0) { + root->numElements = rows; + x->numElements = rows; + y->numElements = rows; + + writer->add(*batch); + rows = 0; +} + +writer->close(); +~~~ + +## Reading ORC Files + +To read ORC files, include the `OrcFile.hh` file to create an `orc::Reader` +that contains the metadata about the file. There are a few options to +the `orc::Reader`, but far fewer than the writer and none of them are +required.
The reader has methods for getting the number of rows, +schema, compression, etc. from the file. + +~~~ cpp +ORC_UNIQUE_PTR inStream = + readLocalFile("my-file.orc"); +ReaderOptions options; +ORC_UNIQUE_PTR reader = + createReader(inStream, options); +~~~ + +To get the data, create an `orc::RowReader` object. By default, +the RowReader reads all rows and all columns, but there are +options to control the data that is read. + +~~~ cpp +RowReaderOptions rowReaderOptions; +ORC_UNIQUE_PTR rowReader = + reader->createRowReader(rowReaderOptions); +ORC_UNIQUE_PTR batch = + rowReader->createRowBatch(1024); +~~~ + +With an `orc::RowReader` the user can ask for the next batch until there +are no more left. The reader will stop the batch at certain boundaries, +so the returned batch may not be full, but it will always contain some rows. + +~~~ cpp +while (rowReader->next(*batch)) { + for (uint64_t r = 0; r < batch->numElements; ++r) { + ... process row r from batch + } +} +~~~ diff --git a/site/_docs/cpp-tools.md b/site/_docs/cpp-tools.md new file mode 100644 index 0000000000..d4d6e7570f --- /dev/null +++ b/site/_docs/cpp-tools.md @@ -0,0 +1,268 @@ +--- +layout: docs +title: C++ Tools +permalink: /docs/cpp-tools.html +--- + +## orc-contents + +Displays the contents of the ORC file as a JSON document. With the +`columns` argument only the selected columns are printed. + +~~~ shell +% orc-contents [--columns=1,2,...]
+~~~ + +If you run it on the example file TestOrcFile.test1.orc, you'll see (without +the line breaks within each record): + +~~~ shell +% orc-contents examples/TestOrcFile.test1.orc +{"boolean1": false, "byte1": 1, "short1": 1024, "int1": 65536, \\ + "long1": 9223372036854775807, "float1": 1, "double1": -15, \\ + "bytes1": [0, 1, 2, 3, 4], "string1": "hi", "middle": \\ + {"list": [{"int1": 1, "string1": "bye"}, \\ + {"int1": 2, "string1": "sigh"}]}, \\ + "list": [{"int1": 3, "string1": "good"}, \\ + {"int1": 4, "string1": "bad"}], \\ + "map": []} +{"boolean1": true, "byte1": 100, "short1": 2048, "int1": 65536, + "long1": 9223372036854775807, "float1": 2, "double1": -5, \\ + "bytes1": [], "string1": "bye", \\ + "middle": {"list": [{"int1": 1, "string1": "bye"}, \\ + {"int1": 2, "string1": "sigh"}]}, \\ + "list": [{"int1": 100000000, "string1": "cat"}, \\ + {"int1": -100000, "string1": "in"}, \\ + {"int1": 1234, "string1": "hat"}], \\ + "map": [{"key": "chani", "value": {"int1": 5, "string1": "chani"}}, \\ + {"key": "mauddib", \\ + "value": {"int1": 1, "string1": "mauddib"}}]} +~~~ + +## orc-metadata + +Displays the metadata of the ORC file as a JSON document. With the +`verbose` option additional information about the layout of the file +is also printed. + +For diagnosing problems, it is useful to use the '--raw' option that +prints the protocol buffers from the ORC file directly rather than +interpreting them. 
+ +~~~ shell +% orc-metadata [-v] [--raw] +~~~ + +If you run it on the example file TestOrcFile.test1.orc, you'll see: + +~~~ shell +% orc-metadata examples/TestOrcFile.test1.orc +{ "name": "../examples/TestOrcFile.test1.orc", + "type": "struct>>,list:array>,map:map< +string,struct>>", + "rows": 2, + "stripe count": 1, + "format": "0.12", "writer version": "HIVE-8732", + "compression": "zlib", "compression block": 10000, + "file length": 1711, + "content": 1015, "stripe stats": 250, "footer": 421, "postscript": 24, + "row index stride": 10000, + "user metadata": { + }, + "stripes": [ + { "stripe": 0, "rows": 2, + "offset": 3, "length": 1012, + "index": 570, "data": 243, "footer": 199 + } + ] +} +~~~ + +## csv-import + +Imports a CSV file into an ORC file using the specified schema. +Compound types are not yet supported. The `delimiter` option sets +the delimiter of the input CSV file and is `,` by default. The `stripe` +option sets the stripe size and is 128MB by default. The `block` +option sets the compression block size and is 64KB by default. The `batch` +option sets the number of rows per batch and is 1024 by default. + +~~~ shell +% csv-import [--delimiter=] [--stripe=] + [--block=] [--batch=] + +~~~ + +If you run it on the example file TestCSVFileImport.test10rows.csv, +you'll see: + +~~~ shell +% csv-import "struct" + examples/TestCSVFileImport.test10rows.csv /tmp/test.orc +[2018-04-11 11:12:16] Start importing Orc file... +[2018-04-11 11:12:16] Finish importing Orc file. +[2018-04-11 11:12:16] Total writer elasped time: 0.001352s. +[2018-04-11 11:12:16] Total writer CPU time: 0.001339s. +~~~ + +## orc-scan + +Scans and displays the row count of the ORC file. The `batch` option +sets the batch size, which is 1024 rows by default. This tool is useful +for checking whether the ORC file is damaged.
+ +~~~ shell +% orc-scan [--batch=] +~~~ + +If you run it on the example file TestOrcFile.test1.orc, you'll see: + +~~~ shell +% orc-scan examples/TestOrcFile.test1.orc +Rows: 2 +Batches: 1 +~~~ + +## orc-statistics + +Displays the file-level and stripe-level column statistics of the ORC file. +The `withIndex` option also includes column statistics for each row group. + +~~~ shell +% orc-statistics [--withIndex] +~~~ + +If you run it on the example file TestOrcFile.columnProjection.orc, +you'll see: + +~~~ shell +% orc-statistics examples/TestOrcFile.columnProjection.orc +File examples/TestOrcFile.columnProjection.orc has 3 columns +*** Column 0 *** +Column has 21000 values and has null value: no + +*** Column 1 *** +Data type: Integer +Values: 21000 +Has null: no +Minimum: -2147439072 +Maximum: 2147257982 +Sum: 268482658568 + +*** Column 2 *** +Data type: String +Values: 21000 +Has null: no +Minimum: 100119c272d7db89 +Maximum: fffe9f6f23b287f3 +Total length: 334559 + +File examples/TestOrcFile.columnProjection.orc has 5 stripes +*** Stripe 0 *** + +--- Column 0 --- +Column has 5000 values and has null value: no + +--- Column 1 --- +Data type: Integer +Values: 5000 +Has null: no +Minimum: -2145365268 +Maximum: 2147025027 +Sum: -29841423854 + +--- Column 2 --- +Data type: String +Values: 5000 +Has null: no +Minimum: 1005350489418be2 +Maximum: fffbb8718c92b09f +Total length: 79644 + +*** Stripe 1 *** + +--- Column 0 --- +Column has 5000 values and has null value: no + +--- Column 1 --- +Data type: Integer +Values: 5000 +Has null: no +Minimum: -2147115959 +Maximum: 2147257982 +Sum: 108604887785 + +--- Column 2 --- +Data type: String +Values: 5000 +Has null: no +Minimum: 100119c272d7db89 +Maximum: fff0ae41d41e6afc +Total length: 79640 + +*** Stripe 2 *** + +--- Column 0 --- +Column has 5000 values and has null value: no + +--- Column 1 --- +Data type: Integer +Values: 5000 +Has null: no +Minimum: -2145932387 +Maximum: 2145877119 +Sum: 70064190848 + +---
Column 2 --- +Data type: String +Values: 5000 +Has null: no +Minimum: 10130af874ae036c +Maximum: fffe9f6f23b287f3 +Total length: 79645 + +*** Stripe 3 *** + +--- Column 0 --- +Column has 5000 values and has null value: no + +--- Column 1 --- +Data type: Integer +Values: 5000 +Has null: no +Minimum: -2147439072 +Maximum: 2147074354 +Sum: 104681356482 + +--- Column 2 --- +Data type: String +Values: 5000 +Has null: no +Minimum: 102547d48ed06518 +Maximum: fffa47c57dc7b69a +Total length: 79689 + +*** Stripe 4 *** + +--- Column 0 --- +Column has 1000 values and has null value: no + +--- Column 1 --- +Data type: Integer +Values: 1000 +Has null: no +Minimum: -2141222223 +Maximum: 2145816096 +Sum: 14973647307 + +--- Column 2 --- +Data type: String +Values: 1000 +Has null: no +Minimum: 1059d81c9025a217 +Maximum: ffc17f0e35e1a6c0 +Total length: 15941 +~~~ \ No newline at end of file diff --git a/site/_docs/tools.md b/site/_docs/java-tools.md similarity index 76% rename from site/_docs/tools.md rename to site/_docs/java-tools.md index 04ff4fdbae..38559553ed 100644 --- a/site/_docs/tools.md +++ b/site/_docs/java-tools.md @@ -1,90 +1,11 @@ --- layout: docs -title: Tools -permalink: /docs/tools.html +title: Java Tools +permalink: /docs/java-tools.html --- -## orc-contents - -Displays the contents of the ORC file as a JSON document. With the -`columns` argument only the selected columns are printed. - -~~~ shell -% orc-contents [--columns=1,2,...] 
-~~~ - -If you run it on the example file TestOrcFile.test1.orc, you'll see (without -the line breaks within each record): - -~~~ shell -% orc-contents examples/TestOrcFile.test1.orc -{"boolean1": false, "byte1": 1, "short1": 1024, "int1": 65536, \\ - "long1": 9223372036854775807, "float1": 1, "double1": -15, \\ - "bytes1": [0, 1, 2, 3, 4], "string1": "hi", "middle": \\ - {"list": [{"int1": 1, "string1": "bye"}, \\ - {"int1": 2, "string1": "sigh"}]}, \\ - "list": [{"int1": 3, "string1": "good"}, \\ - {"int1": 4, "string1": "bad"}], \\ - "map": []} -{"boolean1": true, "byte1": 100, "short1": 2048, "int1": 65536, - "long1": 9223372036854775807, "float1": 2, "double1": -5, \\ - "bytes1": [], "string1": "bye", \\ - "middle": {"list": [{"int1": 1, "string1": "bye"}, \\ - {"int1": 2, "string1": "sigh"}]}, \\ - "list": [{"int1": 100000000, "string1": "cat"}, \\ - {"int1": -100000, "string1": "in"}, \\ - {"int1": 1234, "string1": "hat"}], \\ - "map": [{"key": "chani", "value": {"int1": 5, "string1": "chani"}}, \\ - {"key": "mauddib", \\ - "value": {"int1": 1, "string1": "mauddib"}}]} -~~~ - -## orc-metadata - -Displays the metadata of the ORC file as a JSON document. With the -`verbose` option additional information about the layout of the file -is also printed. - -For diagnosing problems, it is useful to use the '--raw' option that -prints the protocol buffers from the ORC file directly rather than -interpreting them. 
- -~~~ shell -% orc-metadata [-v] [--raw] -~~~ - -If you run it on the example file TestOrcFile.test1.orc, you'll see: - -~~~ shell -% orc-metadata examples/TestOrcFile.test1.orc -{ "name": "../examples/TestOrcFile.test1.orc", - "type": "struct>>,list:array>,map:map< -string,struct>>", - "rows": 2, - "stripe count": 1, - "format": "0.12", "writer version": "HIVE-8732", - "compression": "zlib", "compression block": 10000, - "file length": 1711, - "content": 1015, "stripe stats": 250, "footer": 421, "postscript": 24, - "row index stride": 10000, - "user metadata": { - }, - "stripes": [ - { "stripe": 0, "rows": 2, - "offset": 3, "length": 1012, - "index": 570, "data": 243, "footer": 199 - } - ] -} -~~~ - -## Java ORC Tools - -In addition to the C++ tools above, there is an ORC tools jar that -packages several useful utilities and the necessary Java dependencies +In addition to the C++ tools, there is an ORC tools jar that packages +several useful utilities and the necessary Java dependencies (including Hadoop) into a single package. The Java ORC tool jar supports both the local file system and HDFS. @@ -102,7 +23,7 @@ The command line looks like: % java -jar orc-tools-X.Y.Z-uber.jar ~~~ -### Java Meta +## Java Meta The meta command prints the metadata about the given ORC file and is equivalent to the Hive ORC File Dump command. @@ -280,19 +201,19 @@ Padding ratio: 0% ______________________________________________________________________ ~~~ -### Java Data +## Java Data The data command prints the data in an ORC file as a JSON document. Each record is printed as a JSON object on a line. Each record is annotated with the fieldnames and a JSON representation that depends on the field's type. -### Java Scan +## Java Scan The scan command reads the contents of the file without printing anything. It is primarily intendend for benchmarking the Java reader without including the cost of printing the data out. 
-### Java Convert +## Java Convert The convert command reads several JSON files and converts them into a single ORC file. @@ -309,7 +230,7 @@ single ORC file. The automatic JSON schema discovery is equivalent to the json-schema tool below. -### Java JSON Schema +## Java JSON Schema The JSON Schema discovery tool processes a set of JSON documents and produces a schema that encompasses all of the records in all of the From 69dbdcb23896e1b20ca6037071102c25af22600a Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 11 Apr 2018 13:43:08 -0700 Subject: [PATCH 2/2] remove comments in code block --- site/_docs/core-cpp.md | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/site/_docs/core-cpp.md b/site/_docs/core-cpp.md index 038d0462c2..4b9f683252 100644 --- a/site/_docs/core-cpp.md +++ b/site/_docs/core-cpp.md @@ -22,11 +22,8 @@ value is null. ~~~ cpp namespace orc { struct ColumnVectorBatch { - // the number of current occupied slots uint64_t numElements; - // an array of capacity length marking non-null values DataBuffer notNull; - // whether there are any null values bool hasNulls; ... } @@ -149,12 +146,7 @@ the data as a buffer of integers for the offsets and a ~~~ cpp struct ListVectorBatch: public ColumnVectorBatch { - /** - * The offset of the first element of each list. - * The length of list i is startOffset[i+1] - startOffset[i]. - */ DataBuffer offsets; - // the concatenated elements ORC_UNIQUE_PTR elements; ... }; @@ -166,10 +158,6 @@ for the keys and values. ~~~ cpp struct MapVectorBatch: public ColumnVectorBatch { - /** - * The offset of the first element of each list. - * The length of list i is startOffset[i+1] - startOffset[i]. - */ DataBuffer offsets; ORC_UNIQUE_PTR keys; ORC_UNIQUE_PTR elements;