src/parquet/arrow/reader.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef PARQUET_ARROW_READER_H
#define PARQUET_ARROW_READER_H

#include <memory>
#include <vector>

#include "parquet/api/reader.h"
#include "parquet/api/schema.h"

#include "arrow/io/interfaces.h"

namespace arrow {

class Array;
class MemoryPool;
class RecordBatchReader;
class Status;
class Table;
}  // namespace arrow

namespace parquet {

namespace arrow {

class ColumnChunkReader;
class ColumnReader;
class RowGroupReader;

// Arrow read adapter class for deserializing Parquet files as Arrow row
// batches.
//
// This interfaces caters for different use cases and thus provides different
// interfaces. In its most simplistic form, we cater for a user that wants to
// read the whole Parquet at once with the FileReader::ReadTable method.
//
// More advanced users that also want to implement parallelism on top of each
// single Parquet files should do this on the RowGroup level. For this, they can
// call FileReader::RowGroup(i)->ReadTable to receive only the specified
// RowGroup as a table.
//
// In the most advanced situation, where a consumer wants to independently read
// RowGroups in parallel and consume each column individually, they can call
// FileReader::RowGroup(i)->Column(j)->Read and receive an arrow::Column
// instance.
//
// TODO(wesm): nested data does not always make sense with this user
// interface unless you are only reading a single leaf node from a branch of
// a table. For example:
//
// repeated group data {
//   optional group record {
//     optional int32 val1;
//     optional byte_array val2;
//     optional bool val3;
//   }
//   optional int32 val4;
// }
//
// In the Parquet file, there are 3 leaf nodes:
//
// * data.record.val1
// * data.record.val2
// * data.record.val3
// * data.val4
//
// When materializing this data in an Arrow array, we would have:
//
// data: list<struct<
//   record: struct<
//    val1: int32,
//    val2: string (= list<uint8>),
//    val3: bool,
//   >,
//   val4: int32
// >>
//
// However, in the Parquet format, each leaf node has its own repetition and
// definition levels describing the structure of the intermediate nodes in
// this array structure. Thus, we will need to scan the leaf data for a group
// of leaf nodes part of the same type tree to create a single result Arrow
// nested array structure.
//
// This is additionally complicated "chunky" repeated fields or very large byte
// arrays
class PARQUET_EXPORT FileReader {
 public:
  FileReader(::arrow::MemoryPool* pool, std::unique_ptr<ParquetFileReader> reader);

  // Since the distribution of columns amongst a Parquet file's row groups may
  // be uneven (the number of values in each column chunk can be different), we
  // provide a column-oriented read interface. The ColumnReader hides the
  // details of paging through the file's row groups and yielding
  // fully-materialized arrow::Array instances
  //
  // Returns error status if the column of interest is not flat.
  ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out);

  /// \brief Return arrow schema by apply selection of column indices.
  /// \returns error status if passed wrong indices.
  ::arrow::Status GetSchema(const std::vector<int>& indices,
                            std::shared_ptr<::arrow::Schema>* out);

  // Read column as a whole into an Array.
  ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::Array>* out);

  // NOTE: Experimental API
  // Reads a specific top level schema field into an Array
  // The index i refers the index of the top level schema field, which may
  // be nested or flat - e.g.
  //
  // 0 foo.bar
  //   foo.bar.baz
  //   foo.qux
  // 1 foo2
  // 2 foo3
  //
  // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc
  ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out);

  // NOTE: Experimental API
  // Reads a specific top level schema field into an Array, while keeping only chosen
  // leaf columns.
  // The index i refers the index of the top level schema field, which may
  // be nested or flat, and indices vector refers to the leaf column indices - e.g.
  //
  // i  indices
  // 0  0        foo.bar
  // 0  1        foo.bar.baz
  // 0  2        foo.qux
  // 1  3        foo2
  // 2  4        foo3
  //
  // i=0 indices={0,2} will read a partial struct with foo.bar and foo.quox columns
  // i=1 indices={3} will read foo2 column
  // i=1 indices={2} will result in out=nullptr
  // leaf indices which are unrelated to the schema field are ignored
  ::arrow::Status ReadSchemaField(int i, const std::vector<int>& indices,
                                  std::shared_ptr<::arrow::Array>* out);

  /// \brief Return a RecordBatchReader of row groups selected from row_group_indices, the
  ///    ordering in row_group_indices matters.
  /// \returns error Status if row_group_indices contains invalid index
  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
                                       std::shared_ptr<::arrow::RecordBatchReader>* out);

  /// \brief Return a RecordBatchReader of row groups selected from row_group_indices,
  ///     whose columns are selected by column_indices. The ordering in row_group_indices
  ///     and column_indices matter.
  /// \returns error Status if either row_group_indices or column_indices contains invalid
  ///    index
  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
                                       const std::vector<int>& column_indices,
                                       std::shared_ptr<::arrow::RecordBatchReader>* out);

  // Read a table of columns into a Table
  ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out);

  // Read a table of columns into a Table. Read only the indicated column
  // indices (relative to the schema)
  ::arrow::Status ReadTable(const std::vector<int>& column_indices,
                            std::shared_ptr<::arrow::Table>* out);

  ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
                               std::shared_ptr<::arrow::Table>* out);

  ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out);

  ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
                                const std::vector<int>& column_indices,
                                std::shared_ptr<::arrow::Table>* out);

  ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
                                std::shared_ptr<::arrow::Table>* out);

  /// \brief Scan file contents with one thread, return number of rows
  ::arrow::Status ScanContents(std::vector<int> columns, const int32_t column_batch_size,
                               int64_t* num_rows);

  /// \brief Return a reader for the RowGroup, this object must not outlive the
  ///   FileReader.
  std::shared_ptr<RowGroupReader> RowGroup(int row_group_index);

  int num_row_groups() const;

  const ParquetFileReader* parquet_reader() const;

  /// Set the number of threads to use during reads of multiple columns. By
  /// default only 1 thread is used
  /// \deprecated Use set_use_threads instead.
  void set_num_threads(int num_threads);

  /// Set whether to use multiple threads during reads of multiple columns.
  /// By default only one thread is used.
  void set_use_threads(bool use_threads);

  virtual ~FileReader();

 private:
  friend ColumnChunkReader;
  friend RowGroupReader;

  class PARQUET_NO_EXPORT Impl;
  std::unique_ptr<Impl> impl_;
};

class PARQUET_EXPORT RowGroupReader {
 public:
  std::shared_ptr<ColumnChunkReader> Column(int column_index);

  ::arrow::Status ReadTable(const std::vector<int>& column_indices,
                            std::shared_ptr<::arrow::Table>* out);
  ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out);

  virtual ~RowGroupReader();

 private:
  friend FileReader;
  RowGroupReader(FileReader::Impl* reader, int row_group_index);

  FileReader::Impl* impl_;
  int row_group_index_;
};

class PARQUET_EXPORT ColumnChunkReader {
 public:
  ::arrow::Status Read(std::shared_ptr<::arrow::Array>* out);

  virtual ~ColumnChunkReader();

 private:
  friend RowGroupReader;
  ColumnChunkReader(FileReader::Impl* impl, int row_group_index, int column_index);

  FileReader::Impl* impl_;
  int column_index_;
  int row_group_index_;
};

// At this point, the column reader is a stream iterator. It only knows how to
// read the next batch of values for a particular column from the file until it
// runs out.
//
// We also do not expose any internal Parquet details, such as row groups. This
// might change in the future.
class PARQUET_EXPORT ColumnReader {
 public:
  class PARQUET_NO_EXPORT ColumnReaderImpl;
  virtual ~ColumnReader();

  // Scan the next array of the indicated size. The actual size of the
  // returned array may be less than the passed size depending how much data is
  // available in the file.
  //
  // When all the data in the file has been exhausted, the result is set to
  // nullptr.
  //
  // Returns Status::OK on a successful read, including if you have exhausted
  // the data available in the file.
  ::arrow::Status NextBatch(int64_t batch_size, std::shared_ptr<::arrow::Array>* out);

 private:
  std::unique_ptr<ColumnReaderImpl> impl_;
  explicit ColumnReader(std::unique_ptr<ColumnReaderImpl> impl);

  friend class FileReader;
  friend class PrimitiveImpl;
  friend class StructImpl;
};

// Helper function to create a file reader from an implementation of an Arrow
// readable file
//
// metadata : separately-computed file metadata, can be nullptr
PARQUET_EXPORT
::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file,
                         ::arrow::MemoryPool* allocator,
                         const ReaderProperties& properties,
                         const std::shared_ptr<FileMetaData>& metadata,
                         std::unique_ptr<FileReader>* reader);

PARQUET_EXPORT
::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file,
                         ::arrow::MemoryPool* allocator,
                         std::unique_ptr<FileReader>* reader);

}  // namespace arrow
}  // namespace parquet

#endif  // PARQUET_ARROW_READER_H