From f0ec71231768e254d752f0a0c0817ec6468af9b7 Mon Sep 17 00:00:00 2001
From: Gang Wu <gang.w@alibaba-inc.com>
Date: Wed, 11 Apr 2018 13:40:22 -0700
Subject: [PATCH 1/2] add docs for c++

---
 site/_data/docs.yml                    |   4 +-
 site/_docs/core-cpp.md                 | 278 +++++++++++++++++++++++++
 site/_docs/cpp-tools.md                | 268 ++++++++++++++++++++++++
 site/_docs/{tools.md => java-tools.md} |  97 +--------
 4 files changed, 558 insertions(+), 89 deletions(-)
 create mode 100644 site/_docs/core-cpp.md
 create mode 100644 site/_docs/cpp-tools.md
 rename site/_docs/{tools.md => java-tools.md} (76%)
diff --git a/site/_data/docs.yml b/site/_data/docs.yml
index 70087af7c7..9730ac3eba 100644
--- a/site/_data/docs.yml
+++ b/site/_data/docs.yml
@@ -24,10 +24,12 @@
 - title: Using ORC Core
   docs:
   - core-java
+  - core-cpp
 
 - title: Tools
   docs:
-  - tools
+  - cpp-tools
+  - java-tools
 
 - title: Format Specification
   docs:
diff --git a/site/_docs/core-cpp.md b/site/_docs/core-cpp.md
new file mode 100644
index 0000000000..038d0462c2
--- /dev/null
+++ b/site/_docs/core-cpp.md
@@ -0,0 +1,278 @@
+---
+layout: docs
+title: Using Core C++
+permalink: /docs/core-cpp.html
+---
+
+The C++ Core ORC API reads and writes ORC files into its own
+orc::ColumnVectorBatch vectorized classes.
+
+## Vectorized Row Batch
+
+Data is passed to ORC as instances of orc::ColumnVectorBatch
+that contain the data for a batch of rows. The focus is on speed and
+accessing the data fields directly. `numElements` is the number
+of rows. ColumnVectorBatch is the parent type of the different
+kinds of columns and has some fields that are shared across
+all of the column types. In particular, the `hasNulls` flag indicates
+whether there is any null in this column for this batch. For columns
+where `hasNulls == true`, the entry in the `notNull` buffer is false if that
+value is null.
+
+~~~ cpp
+namespace orc {
+  struct ColumnVectorBatch {
+    // the number of current occupied slots
+    uint64_t numElements;
+    // an array of capacity length marking non-null values
+    DataBuffer<char> notNull;
+    // whether there are any null values
+    bool hasNulls;
+    ...
+  }
+}
+~~~
+
+The subtypes of ColumnVectorBatch are:
+
+| ORC Type | ColumnVectorBatch |
+| -------- | ------------- |
+| array | ListVectorBatch |
+| binary | StringVectorBatch |
+| bigint | LongVectorBatch |
+| boolean | LongVectorBatch |
+| char | StringVectorBatch |
+| date | LongVectorBatch |
+| decimal | Decimal64VectorBatch, Decimal128VectorBatch |
+| double | DoubleVectorBatch |
+| float | DoubleVectorBatch |
+| int | LongVectorBatch |
+| map | MapVectorBatch |
+| smallint | LongVectorBatch |
+| string | StringVectorBatch |
+| struct | StructVectorBatch |
+| timestamp | TimestampVectorBatch |
+| tinyint | LongVectorBatch |
+| uniontype | UnionVectorBatch |
+| varchar | StringVectorBatch |
+
+LongVectorBatch handles all of the integer types (boolean, bigint,
+date, int, smallint, and tinyint). The data is represented as a
+buffer of int64_t where each value is sign-extended as necessary.
+
+~~~ cpp
+  struct LongVectorBatch: public ColumnVectorBatch {
+    DataBuffer<int64_t> data;
+    ...
+  };
+~~~
+
+TimestampVectorBatch handles timestamp values. The data is
+represented as two buffers of int64_t for seconds and nanoseconds
+respectively. Note that we always assume data is in the GMT timezone;
+therefore it is the user's responsibility to convert wall clock time
+from the local timezone to GMT.
+
+~~~ cpp
+  struct TimestampVectorBatch: public ColumnVectorBatch {
+    DataBuffer<int64_t> data;
+    DataBuffer<int64_t> nanoseconds;
+    ...
+  };
+~~~
+
+DoubleVectorBatch handles all of the floating point types
+(double and float). The data is represented as a buffer of doubles.
+
+~~~ cpp
+  struct DoubleVectorBatch: public ColumnVectorBatch {
+    DataBuffer<double> data;
+    ...
+  };
+~~~
+
+Decimal64VectorBatch handles decimal columns with precision no
+greater than 18. Decimal128VectorBatch handles the others. The data
+is represented as a buffer of int64_t and orc::Int128 respectively.
+
+~~~ cpp
+  struct Decimal64VectorBatch: public ColumnVectorBatch {
+    DataBuffer<int64_t> values;
+    ...
+  };
+
+  struct Decimal128VectorBatch: public ColumnVectorBatch {
+    DataBuffer<Int128> values;
+    ...
+  };
+~~~
+
+StringVectorBatch handles all of the binary types (binary,
+char, string, and varchar). The data is represented as a char* buffer,
+and a length buffer.
+
+~~~ cpp
+  struct StringVectorBatch: public ColumnVectorBatch {
+    DataBuffer<char*> data;
+    DataBuffer<int64_t> length;
+    ...
+  };
+~~~
+
+StructVectorBatch handles the struct columns and represents
+the data as a buffer of `ColumnVectorBatch`.
+
+~~~ cpp
+  struct StructVectorBatch: public ColumnVectorBatch {
+    std::vector<ColumnVectorBatch*> fields;
+    ...
+  };
+~~~
+
+UnionVectorBatch handles the union columns. It uses `tags`
+to indicate which subtype has the value and `offsets` to indicate
+the offset in the child batch of that subtype. An individual
+`ColumnVectorBatch` is used for each subtype.
+
+~~~ cpp
+  struct UnionVectorBatch: public ColumnVectorBatch {
+    DataBuffer<unsigned char> tags;
+    DataBuffer<uint64_t> offsets;
+    std::vector<ColumnVectorBatch*> children;
+    ...
+  };
+~~~
+
+ListVectorBatch handles the array columns and represents
+the data as a buffer of integers for the offsets and a
+`ColumnVectorBatch` for the children values.
+
+~~~ cpp
+  struct ListVectorBatch: public ColumnVectorBatch {
+    /**
+     * The offset of the first element of each list.
+     * The length of list i is startOffset[i+1] - startOffset[i].
+     */
+    DataBuffer<int64_t> offsets;
+    // the concatenated elements
+    ORC_UNIQUE_PTR<ColumnVectorBatch> elements;
+    ...
+  };
+~~~
+
+MapVectorBatch handles the map columns and represents the data
+as a buffer of integers for the offsets and two `ColumnVectorBatch`s
+for the keys and values.
+
+~~~ cpp
+  struct MapVectorBatch: public ColumnVectorBatch {
+    /**
+     * The offset of the first element of each list.
+     * The length of list i is startOffset[i+1] - startOffset[i].
+     */
+    DataBuffer<int64_t> offsets;
+    ORC_UNIQUE_PTR<ColumnVectorBatch> keys;
+    ORC_UNIQUE_PTR<ColumnVectorBatch> elements;
+    ...
+  };
+~~~
+
+## Writing ORC Files
+
+To write an ORC file, you need to include `OrcFile.hh` and define
+the schema; then use `orc::OutputStream` and `orc::WriterOptions`
+to create an `orc::Writer` with the desired filename. This example
+sets the required schema parameter, but there are many other
+options to control the ORC writer.
+
+~~~ cpp
+ORC_UNIQUE_PTR<OutputStream> outStream =
+  writeLocalFile("my-file.orc");
+ORC_UNIQUE_PTR<Type> schema(
+  Type::buildTypeFromString("struct<x:int,y:int>"));
+WriterOptions options;
+ORC_UNIQUE_PTR<Writer> writer =
+  createWriter(*schema, outStream.get(), options);
+~~~
+
+Now you need to create a row batch, set the data, and write it to the file
+as the batch fills up. When the file is done, close the `Writer`.
+
+~~~ cpp
+uint64_t batchSize = 1024, rowCount = 10000;
+ORC_UNIQUE_PTR<ColumnVectorBatch> batch =
+  writer->createRowBatch(batchSize);
+StructVectorBatch *root =
+  dynamic_cast<StructVectorBatch *>(batch.get());
+LongVectorBatch *x =
+  dynamic_cast<LongVectorBatch *>(root->fields[0]);
+LongVectorBatch *y =
+  dynamic_cast<LongVectorBatch *>(root->fields[1]);
+
+uint64_t rows = 0;
+for (uint64_t i = 0; i < rowCount; ++i) {
+  x->data[rows] = i;
+  y->data[rows] = i * 3;
+  rows++;
+
+  if (rows == batchSize) {
+    root->numElements = rows;
+    x->numElements = rows;
+    y->numElements = rows;
+
+    writer->add(*batch);
+    rows = 0;
+  }
+}
+
+if (rows != 0) {
+  root->numElements = rows;
+  x->numElements = rows;
+  y->numElements = rows;
+
+  writer->add(*batch);
+  rows = 0;
+}
+
+writer->close();
+~~~
+
+## Reading ORC Files
+
+To read ORC files, include the `OrcFile.hh` file and create an `orc::Reader`
+that contains the metadata about the file. There are a few options to
+the `orc::Reader`, but far fewer than the writer and none of them are
+required. The reader has methods for getting the number of rows,
+schema, compression, etc. from the file.
+
+~~~ cpp
+ORC_UNIQUE_PTR<InputStream> inStream =
+  readLocalFile("my-file.orc");
+ReaderOptions options;
+ORC_UNIQUE_PTR<Reader> reader =
+  createReader(inStream, options);
+~~~
+
+To get the data, create an `orc::RowReader` object. By default,
+the RowReader reads all rows and all columns, but there are
+options to control the data that is read.
+
+~~~ cpp
+RowReaderOptions rowReaderOptions;
+ORC_UNIQUE_PTR<RowReader> rowReader =
+  reader->createRowReader(rowReaderOptions);
+ORC_UNIQUE_PTR<ColumnVectorBatch> batch =
+  rowReader->createRowBatch(1024);
+~~~
+
+With an `orc::RowReader` the user can ask for the next batch until there
+are no more left. The reader will stop the batch at certain boundaries,
+so the returned batch may not be full, but it will always contain some rows.
+
+~~~ cpp
+while (rowReader->next(*batch)) {
+  for (uint64_t r = 0; r < batch->numElements; ++r) {
+    ... process row r from batch
+  }
+}
+~~~
diff --git a/site/_docs/cpp-tools.md b/site/_docs/cpp-tools.md
new file mode 100644
index 0000000000..d4d6e7570f
--- /dev/null
+++ b/site/_docs/cpp-tools.md
@@ -0,0 +1,268 @@
+---
+layout: docs
+title: C++ Tools
+permalink: /docs/cpp-tools.html
+---
+
+## orc-contents
+
+Displays the contents of the ORC file as a JSON document. With the
+`columns` argument only the selected columns are printed.
+
+~~~ shell
+% orc-contents  [--columns=1,2,...] <filename>
+~~~
+
+If you run it on the example file TestOrcFile.test1.orc, you'll see (without
+the line breaks within each record):
+
+~~~ shell
+% orc-contents examples/TestOrcFile.test1.orc
+{"boolean1": false, "byte1": 1, "short1": 1024, "int1": 65536, \\
+ "long1": 9223372036854775807, "float1": 1, "double1": -15, \\
+ "bytes1": [0, 1, 2, 3, 4], "string1": "hi", "middle": \\
+    {"list": [{"int1": 1, "string1": "bye"}, \\
+              {"int1": 2, "string1": "sigh"}]}, \\
+ "list": [{"int1": 3, "string1": "good"}, \\
+          {"int1": 4, "string1": "bad"}], \\
+ "map": []}
+{"boolean1": true, "byte1": 100, "short1": 2048, "int1": 65536, \\
+ "long1": 9223372036854775807, "float1": 2, "double1": -5, \\
+ "bytes1": [], "string1": "bye", \\
+ "middle": {"list": [{"int1": 1, "string1": "bye"}, \\
+                     {"int1": 2, "string1": "sigh"}]}, \\
+ "list": [{"int1": 100000000, "string1": "cat"}, \\
+          {"int1": -100000, "string1": "in"}, \\
+          {"int1": 1234, "string1": "hat"}], \\
+ "map": [{"key": "chani", "value": {"int1": 5, "string1": "chani"}}, \\
+         {"key": "mauddib", \\
+          "value": {"int1": 1, "string1": "mauddib"}}]}
+~~~
+
+## orc-metadata
+
+Displays the metadata of the ORC file as a JSON document. With the
+`verbose` option additional information about the layout of the file
+is also printed.
+
+For diagnosing problems, it is useful to use the '--raw' option that
+prints the protocol buffers from the ORC file directly rather than
+interpreting them.
+
+~~~ shell
+% orc-metadata [-v] [--raw] <filename>
+~~~
+
+If you run it on the example file TestOrcFile.test1.orc, you'll see:
+
+~~~ shell
+% orc-metadata examples/TestOrcFile.test1.orc
+{ "name": "../examples/TestOrcFile.test1.orc",
+  "type": "struct<boolean1:boolean,byte1:tinyint,short1:smallint,
+int1:int,long1:bigint,float1:float,double1:double,bytes1:binary,
+string1:string,middle:struct<list:array<struct<int1:int,string1:
+string>>>,list:array<struct<int1:int,string1:string>>,map:map<
+string,struct<int1:int,string1:string>>>",
+  "rows": 2,
+  "stripe count": 1,
+  "format": "0.12", "writer version": "HIVE-8732",
+  "compression": "zlib", "compression block": 10000,
+  "file length": 1711,
+  "content": 1015, "stripe stats": 250, "footer": 421, "postscript": 24,
+  "row index stride": 10000,
+  "user metadata": {
+  },
+  "stripes": [
+    { "stripe": 0, "rows": 2,
+      "offset": 3, "length": 1012,
+      "index": 570, "data": 243, "footer": 199
+    }
+  ]
+}
+~~~
+
+## csv-import
+
+Imports a CSV file into an ORC file using the specified schema.
+Compound types are not yet supported. The `delimiter` option sets
+the delimiter of the input CSV file and is `,` by default. The `stripe`
+option sets the stripe size and is 128MB by default. The `block`
+option sets the compression block size, which is 64KB by default. The
+`batch` option sets the number of rows per batch and is 1024 by default.
+
+~~~ shell
+% csv-import [--delimiter=<character>] [--stripe=<size>]
+             [--block=<size>] [--batch=<size>]
+             <schema> <inputCSVFile> <outputORCFile>
+~~~
+
+If you run it on the example file TestCSVFileImport.test10rows.csv,
+you'll see:
+
+~~~ shell
+% csv-import "struct<a:bigint,b:string,c:double>"
+             examples/TestCSVFileImport.test10rows.csv /tmp/test.orc
+[2018-04-11 11:12:16] Start importing Orc file...
+[2018-04-11 11:12:16] Finish importing Orc file.
+[2018-04-11 11:12:16] Total writer elasped time: 0.001352s.
+[2018-04-11 11:12:16] Total writer CPU time: 0.001339s.
+~~~
+
+## orc-scan
+
+Scans and displays the row count of the ORC file. Use the `batch` option
+to set the batch size, which is 1024 rows by default. This tool is useful
+for checking whether the ORC file is damaged.
+
+~~~ shell
+% orc-scan [--batch=<size>] <filename>
+~~~
+
+If you run it on the example file TestOrcFile.test1.orc, you'll see:
+
+~~~ shell
+% orc-scan examples/TestOrcFile.test1.orc
+Rows: 2
+Batches: 1
+~~~
+
+## orc-statistics
+
+Displays the file-level and stripe-level column statistics of the ORC file.
+Use the `withIndex` option to include column statistics for each row group.
+
+~~~ shell
+% orc-statistics [--withIndex] <filename>
+~~~
+
+If you run it on the example file TestOrcFile.columnProjection.orc,
+you'll see:
+
+~~~ shell
+% orc-statistics examples/TestOrcFile.columnProjection.orc
+File examples/TestOrcFile.columnProjection.orc has 3 columns
+*** Column 0 ***
+Column has 21000 values and has null value: no
+
+*** Column 1 ***
+Data type: Integer
+Values: 21000
+Has null: no
+Minimum: -2147439072
+Maximum: 2147257982
+Sum: 268482658568
+
+*** Column 2 ***
+Data type: String
+Values: 21000
+Has null: no
+Minimum: 100119c272d7db89
+Maximum: fffe9f6f23b287f3
+Total length: 334559
+
+File examples/TestOrcFile.columnProjection.orc has 5 stripes
+*** Stripe 0 ***
+
+--- Column 0 ---
+Column has 5000 values and has null value: no
+
+--- Column 1 ---
+Data type: Integer
+Values: 5000
+Has null: no
+Minimum: -2145365268
+Maximum: 2147025027
+Sum: -29841423854
+
+--- Column 2 ---
+Data type: String
+Values: 5000
+Has null: no
+Minimum: 1005350489418be2
+Maximum: fffbb8718c92b09f
+Total length: 79644
+
+*** Stripe 1 ***
+
+--- Column 0 ---
+Column has 5000 values and has null value: no
+
+--- Column 1 ---
+Data type: Integer
+Values: 5000
+Has null: no
+Minimum: -2147115959
+Maximum: 2147257982
+Sum: 108604887785
+
+--- Column 2 ---
+Data type: String
+Values: 5000
+Has null: no
+Minimum: 100119c272d7db89
+Maximum: fff0ae41d41e6afc
+Total length: 79640
+
+*** Stripe 2 ***
+
+--- Column 0 ---
+Column has 5000 values and has null value: no
+
+--- Column 1 ---
+Data type: Integer
+Values: 5000
+Has null: no
+Minimum: -2145932387
+Maximum: 2145877119
+Sum: 70064190848
+
+--- Column 2 ---
+Data type: String
+Values: 5000
+Has null: no
+Minimum: 10130af874ae036c
+Maximum: fffe9f6f23b287f3
+Total length: 79645
+
+*** Stripe 3 ***
+
+--- Column 0 ---
+Column has 5000 values and has null value: no
+
+--- Column 1 ---
+Data type: Integer
+Values: 5000
+Has null: no
+Minimum: -2147439072
+Maximum: 2147074354
+Sum: 104681356482
+
+--- Column 2 ---
+Data type: String
+Values: 5000
+Has null: no
+Minimum: 102547d48ed06518
+Maximum: fffa47c57dc7b69a
+Total length: 79689
+
+*** Stripe 4 ***
+
+--- Column 0 ---
+Column has 1000 values and has null value: no
+
+--- Column 1 ---
+Data type: Integer
+Values: 1000
+Has null: no
+Minimum: -2141222223
+Maximum: 2145816096
+Sum: 14973647307
+
+--- Column 2 ---
+Data type: String
+Values: 1000
+Has null: no
+Minimum: 1059d81c9025a217
+Maximum: ffc17f0e35e1a6c0
+Total length: 15941
+~~~
\ No newline at end of file
diff --git a/site/_docs/tools.md b/site/_docs/java-tools.md
similarity index 76%
rename from site/_docs/tools.md
rename to site/_docs/java-tools.md
index 04ff4fdbae..38559553ed 100644
--- a/site/_docs/tools.md
+++ b/site/_docs/java-tools.md
@@ -1,90 +1,11 @@
 ---
 layout: docs
-title: Tools
-permalink: /docs/tools.html
+title: Java Tools
+permalink: /docs/java-tools.html
 ---
 
-## orc-contents
-
-Displays the contents of the ORC file as a JSON document. With the
-`columns` argument only the selected columns are printed.
-
-~~~ shell
-% orc-contents  [--columns=1,2,...] <filename>
-~~~
-
-If you run it on the example file TestOrcFile.test1.orc, you'll see (without
-the line breaks within each record):
-
-~~~ shell
-% orc-contents examples/TestOrcFile.test1.orc
-{"boolean1": false, "byte1": 1, "short1": 1024, "int1": 65536, \\
- "long1": 9223372036854775807, "float1": 1, "double1": -15, \\
- "bytes1": [0, 1, 2, 3, 4], "string1": "hi", "middle": \\
-    {"list": [{"int1": 1, "string1": "bye"}, \\
-              {"int1": 2, "string1": "sigh"}]}, \\
- "list": [{"int1": 3, "string1": "good"}, \\
-          {"int1": 4, "string1": "bad"}], \\
- "map": []}
-{"boolean1": true, "byte1": 100, "short1": 2048, "int1": 65536,
- "long1": 9223372036854775807, "float1": 2, "double1": -5, \\
- "bytes1": [], "string1": "bye", \\
- "middle": {"list": [{"int1": 1, "string1": "bye"}, \\
-                     {"int1": 2, "string1": "sigh"}]}, \\
- "list": [{"int1": 100000000, "string1": "cat"}, \\
-          {"int1": -100000, "string1": "in"}, \\
-          {"int1": 1234, "string1": "hat"}], \\
- "map": [{"key": "chani", "value": {"int1": 5, "string1": "chani"}}, \\
-         {"key": "mauddib", \\
-          "value": {"int1": 1, "string1": "mauddib"}}]}
-~~~
-
-## orc-metadata
-
-Displays the metadata of the ORC file as a JSON document. With the
-`verbose` option additional information about the layout of the file
-is also printed.
-
-For diagnosing problems, it is useful to use the '--raw' option that
-prints the protocol buffers from the ORC file directly rather than
-interpreting them.
-
-~~~ shell
-% orc-metadata [-v] [--raw] <filename>
-~~~
-
-If you run it on the example file TestOrcFile.test1.orc, you'll see:
-
-~~~ shell
-% orc-metadata examples/TestOrcFile.test1.orc
-{ "name": "../examples/TestOrcFile.test1.orc",
-  "type": "struct<boolean1:boolean,byte1:tinyint,short1:smallint,
-int1:int,long1:bigint,float1:float,double1:double,bytes1:binary,
-string1:string,middle:struct<list:array<struct<int1:int,string1:
-string>>>,list:array<struct<int1:int,string1:string>>,map:map<
-string,struct<int1:int,string1:string>>>",
-  "rows": 2,
-  "stripe count": 1,
-  "format": "0.12", "writer version": "HIVE-8732",
-  "compression": "zlib", "compression block": 10000,
-  "file length": 1711,
-  "content": 1015, "stripe stats": 250, "footer": 421, "postscript": 24,
-  "row index stride": 10000,
-  "user metadata": {
-  },
-  "stripes": [
-    { "stripe": 0, "rows": 2,
-      "offset": 3, "length": 1012,
-      "index": 570, "data": 243, "footer": 199
-    }
-  ]
-}
-~~~
-
-## Java ORC Tools
-
-In addition to the C++ tools above, there is an ORC tools jar that
-packages several useful utilities and the necessary Java dependencies
+In addition to the C++ tools, there is an ORC tools jar that packages
+several useful utilities and the necessary Java dependencies
 (including Hadoop) into a single package. The Java ORC tool jar
 supports both the local file system and HDFS.
 
@@ -102,7 +23,7 @@ The command line looks like:
 % java -jar orc-tools-X.Y.Z-uber.jar <sub-command> <args>
 ~~~
 
-### Java Meta
+## Java Meta
 
 The meta command prints the metadata about the given ORC file and is
 equivalent to the Hive ORC File Dump command.
@@ -280,19 +201,19 @@ Padding ratio: 0%
 ______________________________________________________________________
 ~~~
 
-### Java Data
+## Java Data
 
 The data command prints the data in an ORC file as a JSON document. Each
 record is printed as a JSON object on a line. Each record is annotated with
 the fieldnames and a JSON representation that depends on the field's type.
 
-### Java Scan
+## Java Scan
 
 The scan command reads the contents of the file without printing anything. It
 is primarily intendend for benchmarking the Java reader without including the
 cost of printing the data out.
 
-### Java Convert
+## Java Convert
 
 The convert command reads several JSON files and converts them into a
 single ORC file.
@@ -309,7 +230,7 @@ single ORC file.
 The automatic JSON schema discovery is equivalent to the json-schema tool
 below.
 
-### Java JSON Schema
+## Java JSON Schema
 
 The JSON Schema discovery tool processes a set of JSON documents and
 produces a schema that encompasses all of the records in all of the

From 69dbdcb23896e1b20ca6037071102c25af22600a Mon Sep 17 00:00:00 2001
From: Gang Wu <gang.w@alibaba-inc.com>
Date: Wed, 11 Apr 2018 13:43:08 -0700
Subject: [PATCH 2/2] remove comments in code block

---
 site/_docs/core-cpp.md | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/site/_docs/core-cpp.md b/site/_docs/core-cpp.md
index 038d0462c2..4b9f683252 100644
--- a/site/_docs/core-cpp.md
+++ b/site/_docs/core-cpp.md
@@ -22,11 +22,8 @@ value is null.
 ~~~ cpp
 namespace orc {
   struct ColumnVectorBatch {
-    // the number of current occupied slots
     uint64_t numElements;
-    // an array of capacity length marking non-null values
     DataBuffer<char> notNull;
-    // whether there are any null values
     bool hasNulls;
     ...
   }
@@ -149,12 +146,7 @@ the data as a buffer of integers for the offsets and a
 
 ~~~ cpp
   struct ListVectorBatch: public ColumnVectorBatch {
-    /**
-     * The offset of the first element of each list.
-     * The length of list i is startOffset[i+1] - startOffset[i].
-     */
     DataBuffer<int64_t> offsets;
-    // the concatenated elements
     ORC_UNIQUE_PTR<ColumnVectorBatch> elements;
     ...
   };
@@ -166,10 +158,6 @@ for the keys and values.
 
 ~~~ cpp
   struct MapVectorBatch: public ColumnVectorBatch {
-    /**
-     * The offset of the first element of each list.
-     * The length of list i is startOffset[i+1] - startOffset[i].
-     */
     DataBuffer<int64_t> offsets;
     ORC_UNIQUE_PTR<ColumnVectorBatch> keys;
     ORC_UNIQUE_PTR<ColumnVectorBatch> elements;