diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 983ff8d8a9a4..1aa21244114e 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -31,16 +31,11 @@ from pyarrow.lib cimport (Array, Schema, NativeFile, get_reader, get_writer) from pyarrow.compat import tobytes, frombytes +from pyarrow.formatting import indent from pyarrow.lib import ArrowException, NativeFile, _stringify_path import six - -try: - from textwrap import indent -except ImportError: - def indent(text, prefix): - lines = [prefix + line for line in text.splitlines(True)] - return ''.join(lines) +import warnings cdef class RowGroupStatistics: @@ -95,49 +90,42 @@ cdef class RowGroupStatistics: else: raise ValueError('Unknown physical ParquetType') - property has_min_max: - - def __get__(self): - return self.statistics.get().HasMinMax() - - property min: - - def __get__(self): - raw_physical_type = self.statistics.get().physical_type() - encode_min = self.statistics.get().EncodeMin() - - min_value = FormatStatValue(raw_physical_type, encode_min.c_str()) - return self._cast_statistic(min_value) - - property max: - - def __get__(self): - raw_physical_type = self.statistics.get().physical_type() - encode_max = self.statistics.get().EncodeMax() - - max_value = FormatStatValue(raw_physical_type, encode_max.c_str()) - return self._cast_statistic(max_value) + @property + def has_min_max(self): + return self.statistics.get().HasMinMax() - property null_count: + @property + def min(self): + raw_physical_type = self.statistics.get().physical_type() + encode_min = self.statistics.get().EncodeMin() - def __get__(self): - return self.statistics.get().null_count() + min_value = FormatStatValue(raw_physical_type, encode_min.c_str()) + return self._cast_statistic(min_value) - property distinct_count: + @property + def max(self): + raw_physical_type = self.statistics.get().physical_type() + encode_max = self.statistics.get().EncodeMax() - def __get__(self): - return self.statistics.get().distinct_count() + max_value = FormatStatValue(raw_physical_type, encode_max.c_str()) + return self._cast_statistic(max_value) - property num_values: + @property + def null_count(self): + return self.statistics.get().null_count() - def __get__(self): - return self.statistics.get().num_values() + @property + def distinct_count(self): + return self.statistics.get().distinct_count() - property physical_type: + @property + def num_values(self): + return self.statistics.get().num_values() - def __get__(self): - physical_type = self.statistics.get().physical_type() - return physical_type_name_from_enum(physical_type) + @property + def physical_type(self): + raw_physical_type = self.statistics.get().physical_type() + return physical_type_name_from_enum(raw_physical_type) cdef class ColumnChunkMetaData: @@ -157,7 +145,7 @@ cdef class ColumnChunkMetaData: return """{0} file_offset: {1} file_path: {2} - type: {3} + physical_type: {3} num_values: {4} path_in_schema: {5} is_stats_set: {6} @@ -168,12 +156,11 @@ cdef class ColumnChunkMetaData: has_dictionary_page: {10} dictionary_page_offset: {11} data_page_offset: {12} - index_page_offset: {13} - total_compressed_size: {14} - total_uncompressed_size: {15}""".format(object.__repr__(self), + total_compressed_size: {13} + total_uncompressed_size: {14}""".format(object.__repr__(self), self.file_offset, self.file_path, - self.type, + self.physical_type, self.num_values, self.path_in_schema, self.is_stats_set, @@ -183,90 +170,80 @@ cdef class ColumnChunkMetaData: self.has_dictionary_page, self.dictionary_page_offset, self.data_page_offset, - self.index_page_offset, self.total_compressed_size, self.total_uncompressed_size) - property file_offset: - - def __get__(self): - return self.metadata.file_offset() - - property file_path: - - def __get__(self): - return frombytes(self.metadata.file_path()) - - property type: - - def __get__(self): - return physical_type_name_from_enum(self.metadata.type()) - - property num_values: - - def __get__(self): - return self.metadata.num_values() - - property path_in_schema: - - def __get__(self): - path = self.metadata.path_in_schema().get().ToDotString() - return frombytes(path) - - property is_stats_set: - - def __get__(self): - return self.metadata.is_stats_set() + @property + def file_offset(self): + return self.metadata.file_offset() - property statistics: + @property + def file_path(self): + return frombytes(self.metadata.file_path()) - def __get__(self): - if not self.metadata.is_stats_set(): - return None - statistics = RowGroupStatistics() - statistics.init(self.metadata.statistics()) - return statistics + @property + def physical_type(self): + return physical_type_name_from_enum(self.metadata.type()) - property compression: + @property + def num_values(self): + return self.metadata.num_values() - def __get__(self): - return self.metadata.compression() + @property + def path_in_schema(self): + path = self.metadata.path_in_schema().get().ToDotString() + return frombytes(path) - property encodings: + @property + def is_stats_set(self): + return self.metadata.is_stats_set() - def __get__(self): - return map(encoding_name_from_enum, - self.metadata.encodings()) + @property + def statistics(self): + if not self.metadata.is_stats_set(): + return None + statistics = RowGroupStatistics() + statistics.init(self.metadata.statistics()) + return statistics - property has_dictionary_page: + @property + def compression(self): + return compression_name_from_enum(self.metadata.compression()) - def __get__(self): - return self.metadata.has_dictionary_page() + @property + def encodings(self): + return tuple(map(encoding_name_from_enum, self.metadata.encodings())) - property dictionary_page_offset: + @property + def has_dictionary_page(self): + return bool(self.metadata.has_dictionary_page()) - def __get__(self): + @property + def dictionary_page_offset(self): + if self.has_dictionary_page: return self.metadata.dictionary_page_offset() + else: + return None - property data_page_offset: - - def __get__(self): - return self.metadata.data_page_offset() - - property index_page_offset: - - def __get__(self): - return self.metadata.index_page_offset() + @property + def data_page_offset(self): + return self.metadata.data_page_offset() - property total_compressed_size: + @property + def has_index_page(self): + raise NotImplementedError('not supported in parquet-cpp') - def __get__(self): - return self.metadata.total_compressed_size() + @property + def index_page_offset(self): + raise NotImplementedError("parquet-cpp doesn't return valid values") - property total_uncompressed_size: + @property + def total_compressed_size(self): + return self.metadata.total_compressed_size() - def __get__(self): - return self.metadata.total_uncompressed_size() + @property + def total_uncompressed_size(self): + return self.metadata.total_uncompressed_size() cdef class RowGroupMetaData: @@ -275,10 +252,7 @@ cdef class RowGroupMetaData: CRowGroupMetaData* metadata FileMetaData parent - def __cinit__(self): - pass - - cdef void init_from_file(self, FileMetaData parent, int i): + def __cinit__(self, FileMetaData parent, int i): if i < 0 or i >= parent.num_row_groups: raise IndexError('{0} out of bounds'.format(i)) self.up_metadata = parent._metadata.RowGroup(i) @@ -299,20 +273,17 @@ cdef class RowGroupMetaData: self.num_rows, self.total_byte_size) - property num_columns: - - def __get__(self): - return self.metadata.num_columns() - - property num_rows: - - def __get__(self): - return self.metadata.num_rows() + @property + def num_columns(self): + return self.metadata.num_columns() - property total_byte_size: + @property + def num_rows(self): + return self.metadata.num_rows() - def __get__(self): - return self.metadata.total_byte_size() + @property + def total_byte_size(self): + return self.metadata.total_byte_size() cdef class FileMetaData: @@ -343,72 +314,56 @@ cdef class FileMetaData: @property def schema(self): - if self._schema is not None: - return self._schema - - cdef ParquetSchema schema = ParquetSchema() - schema.init_from_filemeta(self) - self._schema = schema - return schema - - property serialized_size: - - def __get__(self): - return self._metadata.size() - - property num_columns: - - def __get__(self): - return self._metadata.num_columns() + if self._schema is None: + self._schema = ParquetSchema(self) + return self._schema - property num_rows: - - def __get__(self): - return self._metadata.num_rows() + @property + def serialized_size(self): + return self._metadata.size() - property num_row_groups: + @property + def num_columns(self): + return self._metadata.num_columns() - def __get__(self): - return self._metadata.num_row_groups() + @property + def num_rows(self): + return self._metadata.num_rows() - property format_version: + @property + def num_row_groups(self): + return self._metadata.num_row_groups() - def __get__(self): - cdef ParquetVersion version = self._metadata.version() - if version == ParquetVersion_V1: - return '1.0' - if version == ParquetVersion_V2: - return '2.0' - else: - print('Unrecognized file version, assuming 1.0: {0}' - .format(version)) - return '1.0' + @property + def format_version(self): + cdef ParquetVersion version = self._metadata.version() + if version == ParquetVersion_V1: + return '1.0' + if version == ParquetVersion_V2: + return '2.0' + else: + warnings.warn('Unrecognized file version, assuming 1.0: {}' + .format(version)) + return '1.0' - property created_by: + @property + def created_by(self): + return frombytes(self._metadata.created_by()) - def __get__(self): - return frombytes(self._metadata.created_by()) + @property + def metadata(self): + cdef: + unordered_map[c_string, c_string] metadata + const CKeyValueMetadata* underlying_metadata + underlying_metadata = self._metadata.key_value_metadata().get() + if underlying_metadata != NULL: + underlying_metadata.ToUnorderedMap(&metadata) + return metadata + else: + return None def row_group(self, int i): - """ - - """ - cdef RowGroupMetaData result = RowGroupMetaData() - result.init_from_file(self, i) - return result - - property metadata: - - def __get__(self): - cdef: - unordered_map[c_string, c_string] metadata - const CKeyValueMetadata* underlying_metadata - underlying_metadata = self._metadata.key_value_metadata().get() - if underlying_metadata != NULL: - underlying_metadata.ToUnorderedMap(&metadata) - return metadata - else: - return None + return RowGroupMetaData(self, i) cdef class ParquetSchema: @@ -416,8 +371,9 @@ cdef class ParquetSchema: FileMetaData parent # the FileMetaData owning the SchemaDescriptor const SchemaDescriptor* schema - def __cinit__(self): - self.schema = NULL + def __cinit__(self, FileMetaData container): + self.parent = container + self.schema = container._metadata.schema() def __repr__(self): cdef const ColumnDescriptor* descr @@ -434,20 +390,15 @@ cdef class ParquetSchema: {1} """.format(object.__repr__(self), '\n'.join(elements)) - cdef init_from_filemeta(self, FileMetaData container): - self.parent = container - self.schema = container._metadata.schema() - def __len__(self): return self.schema.num_columns() def __getitem__(self, i): return self.column(i) - property names: - - def __get__(self): - return [self[i].name for i in range(len(self))] + @property + def names(self): + return [self[i].name for i in range(len(self))] def to_arrow_schema(self): """ @@ -457,8 +408,7 @@ cdef class ParquetSchema: ------- schema : pyarrow.Schema """ - cdef: - shared_ptr[CSchema] sp_arrow_schema + cdef shared_ptr[CSchema] sp_arrow_schema with nogil: check_status(FromParquetSchema( @@ -467,6 +417,12 @@ cdef class ParquetSchema: return pyarrow_wrap_schema(sp_arrow_schema) + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + def equals(self, ParquetSchema other): """ Returns True if the Parquet schemas are equal @@ -477,9 +433,7 @@ cdef class ParquetSchema: if i < 0 or i >= len(self): raise IndexError('{0} out of bounds'.format(i)) - cdef ColumnSchema col = ColumnSchema() - col.init_from_schema(self, i) - return col + return ColumnSchema(self, i) cdef class ColumnSchema: @@ -487,13 +441,16 @@ cdef class ColumnSchema: ParquetSchema parent const ColumnDescriptor* descr - def __cinit__(self): - self.descr = NULL - - cdef init_from_schema(self, ParquetSchema schema, int i): + def __cinit__(self, ParquetSchema schema, int i): self.parent = schema self.descr = schema.schema.Column(i) + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + def equals(self, ColumnSchema other): """ Returns True if the column schemas are equal @@ -520,52 +477,43 @@ cdef class ColumnSchema: self.max_repetition_level, physical_type, logical_type) - property name: - - def __get__(self): - return frombytes(self.descr.name()) - - property path: - - def __get__(self): - return frombytes(self.descr.path().get().ToDotString()) - - property max_definition_level: - - def __get__(self): - return self.descr.max_definition_level() - - property max_repetition_level: + @property + def name(self): + return frombytes(self.descr.name()) - def __get__(self): - return self.descr.max_repetition_level() + @property + def path(self): + return frombytes(self.descr.path().get().ToDotString()) - property physical_type: + @property + def max_definition_level(self): + return self.descr.max_definition_level() - def __get__(self): - return physical_type_name_from_enum(self.descr.physical_type()) + @property + def max_repetition_level(self): + return self.descr.max_repetition_level() - property logical_type: + @property + def physical_type(self): + return physical_type_name_from_enum(self.descr.physical_type()) - def __get__(self): - return logical_type_name_from_enum(self.descr.logical_type()) + @property + def logical_type(self): + return logical_type_name_from_enum(self.descr.logical_type()) # FIXED_LEN_BYTE_ARRAY attribute - property length: - - def __get__(self): - return self.descr.type_length() + @property + def length(self): + return self.descr.type_length() # Decimal attributes - property precision: - - def __get__(self): - return self.descr.type_precision() - - property scale: + @property + def precision(self): + return self.descr.type_precision() - def __get__(self): - return self.descr.type_scale() + @property + def scale(self): + return self.descr.type_scale() cdef physical_type_name_from_enum(ParquetType type_): @@ -609,19 +557,56 @@ cdef logical_type_name_from_enum(ParquetLogicalType type_): }.get(type_, 'UNKNOWN') -cdef encoding_name_from_enum (ParquetEncoding encoding_): +cdef encoding_name_from_enum(ParquetEncoding encoding_): return { - ParquetEncoding_PLAIN: "PLAIN", - ParquetEncoding_PLAIN_DICTIONARY: "PLAIN_DICTIONARY", - ParquetEncoding_RLE: "RLE", - ParquetEncoding_BIT_PACKED: "BIT_PACKED", - ParquetEncoding_DELTA_BINARY_PACKED: "DELTA_BINARY_PACKED", - ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: "DELTA_LENGTH_BYTE_ARRAY", - ParquetEncoding_DELTA_BYTE_ARRAY: "DELTA_BYTE_ARRAY", - ParquetEncoding_RLE_DICTIONARY: "RLE_DICTIONARY", + ParquetEncoding_PLAIN: 'PLAIN', + ParquetEncoding_PLAIN_DICTIONARY: 'PLAIN_DICTIONARY', + ParquetEncoding_RLE: 'RLE', + ParquetEncoding_BIT_PACKED: 'BIT_PACKED', + ParquetEncoding_DELTA_BINARY_PACKED: 'DELTA_BINARY_PACKED', + ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: 'DELTA_LENGTH_BYTE_ARRAY', + ParquetEncoding_DELTA_BYTE_ARRAY: 'DELTA_BYTE_ARRAY', + ParquetEncoding_RLE_DICTIONARY: 'RLE_DICTIONARY', }.get(encoding_, 'UNKNOWN') +cdef compression_name_from_enum(ParquetCompression compression_): + return { + ParquetCompression_UNCOMPRESSED: 'UNCOMPRESSED', + ParquetCompression_SNAPPY: 'SNAPPY', + ParquetCompression_GZIP: 'GZIP', + ParquetCompression_LZO: 'LZO', + ParquetCompression_BROTLI: 'BROTLI', + ParquetCompression_LZ4: 'LZ4', + ParquetCompression_ZSTD: 'ZSTD', + }.get(compression_, 'UNKNOWN') + + +cdef int check_compression_name(name) except -1: + if name.upper() not in {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', + 'ZSTD'}: + raise ArrowException("Unsupported compression: " + name) + return 0 + + +cdef ParquetCompression compression_from_name(str name): + name = name.upper() + if name == 'SNAPPY': + return ParquetCompression_SNAPPY + elif name == 'GZIP': + return ParquetCompression_GZIP + elif name == 'LZO': + return ParquetCompression_LZO + elif name == 'BROTLI': + return ParquetCompression_BROTLI + elif name == 'LZ4': + return ParquetCompression_LZ4 + elif name == 'ZSTD': + return ParquetCompression_ZSTD + else: + return ParquetCompression_UNCOMPRESSED + + cdef class ParquetReader: cdef: object source @@ -654,22 +639,21 @@ cdef class ParquetReader: check_status(OpenFile(rd_handle, self.allocator, properties, c_metadata, &self.reader)) - property column_paths: - - def __get__(self): - cdef: - FileMetaData container = self.metadata - const CFileMetaData* metadata = container._metadata - vector[c_string] path - int i = 0 + @property + def column_paths(self): + cdef: + FileMetaData container = self.metadata + const CFileMetaData* metadata = container._metadata + vector[c_string] path + int i = 0 - paths = [] - for i in range(0, metadata.num_columns()): - path = (metadata.schema().Column(i) - .path().get().ToDotVector()) - paths.append([frombytes(x) for x in path]) + paths = [] + for i in range(0, metadata.num_columns()): + path = (metadata.schema().Column(i) + .path().get().ToDotVector()) + paths.append([frombytes(x) for x in path]) - return paths + return paths @property def metadata(self): @@ -686,10 +670,9 @@ cdef class ParquetReader: result.init(metadata) return result - property num_row_groups: - - def __get__(self): - return self.reader.get().num_row_groups() + @property + def num_row_groups(self): + return self.reader.get().num_row_groups() def set_num_threads(self, int nthreads): self.reader.get().set_num_threads(nthreads) @@ -809,30 +792,6 @@ cdef class ParquetReader: array.init(carray) return array -cdef int check_compression_name(name) except -1: - if name.upper() not in ['NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', - 'ZSTD']: - raise ArrowException("Unsupported compression: " + name) - return 0 - - -cdef ParquetCompression compression_from_name(str name): - name = name.upper() - if name == "SNAPPY": - return ParquetCompression_SNAPPY - elif name == "GZIP": - return ParquetCompression_GZIP - elif name == "LZO": - return ParquetCompression_LZO - elif name == "BROTLI": - return ParquetCompression_BROTLI - elif name == "LZ4": - return ParquetCompression_LZ4 - elif name == "ZSTD": - return ParquetCompression_ZSTD - else: - return ParquetCompression_UNCOMPRESSED - cdef class ParquetWriter: cdef: diff --git a/python/pyarrow/formatting.py b/python/pyarrow/formatting.py index eea3e74d881f..5ef9482ed144 100644 --- a/python/pyarrow/formatting.py +++ b/python/pyarrow/formatting.py @@ -20,6 +20,12 @@ import pyarrow.lib as lib import warnings +try: + from textwrap import indent +except ImportError: + def indent(text, prefix): + return ''.join(prefix + line for line in text.splitlines(True)) + def array_format(arr, window=10): warnings.warn("array_format is deprecated, use Array.format() instead", @@ -32,13 +38,6 @@ def value_format(x, indent_level=0): FutureWarning) if isinstance(x, lib.ListValue): contents = ',\n'.join(value_format(item) for item in x) - return '[{0}]'.format(_indent(contents, 1).strip()) + return '[{0}]'.format(indent(contents, ' ').strip()) else: return repr(x) - - -def _indent(text, spaces): - if spaces == 0: - return text - block = ' ' * spaces - return '\n'.join(block + x for x in text.split('\n')) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 9c92737bf4b5..2c1aef0e88a3 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -33,9 +33,11 @@ import numpy as np from pyarrow.filesystem import FileSystem, LocalFileSystem, S3FSWrapper -from pyarrow._parquet import (ParquetReader, FileMetaData, # noqa - RowGroupMetaData, ParquetSchema) -import pyarrow._parquet as _parquet # noqa +from pyarrow._parquet import (ParquetReader, RowGroupStatistics, # noqa + FileMetaData, RowGroupMetaData, + ColumnChunkMetaData, + ParquetSchema, ColumnSchema) +import pyarrow._parquet as _parquet import pyarrow.lib as lib import pyarrow as pa diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index c9c1a96db471..cc86ef16e08a 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -499,10 +499,13 @@ def test_pandas_parquet_configuration_options(tmpdir): tm.assert_frame_equal(df, df_read) -def make_sample_file(df): +def make_sample_file(table_or_df): import pyarrow.parquet as pq - a_table = pa.Table.from_pandas(df) + if isinstance(table_or_df, pa.Table): + a_table = table_or_df + else: + a_table = pa.Table.from_pandas(table_or_df) buf = io.BytesIO() _write_table(a_table, buf, compression='SNAPPY', version='2.0', @@ -513,6 +516,8 @@ def make_sample_file(df): def test_parquet_metadata_api(): + import pyarrow.parquet as pq + df = alltypes_sample(size=10000) df = df.reindex(columns=sorted(df.columns)) @@ -527,6 +532,8 @@ def test_parquet_metadata_api(): assert meta.num_row_groups == 1 assert meta.format_version == '2.0' assert 'parquet-cpp' in meta.created_by + assert isinstance(meta.serialized_size, int) + assert isinstance(meta.metadata, dict) # Schema schema = fileh.schema @@ -553,45 +560,85 @@ def test_parquet_metadata_api(): # Row group for rg in range(meta.num_row_groups): rg_meta = meta.row_group(rg) + assert isinstance(rg_meta, pq.RowGroupMetaData) repr(rg_meta) for col in range(rg_meta.num_columns): col_meta = rg_meta.column(col) + assert isinstance(col_meta, pq.ColumnChunkMetaData) repr(col_meta) + rg_meta = meta.row_group(0) assert rg_meta.num_rows == len(df) assert rg_meta.num_columns == ncols + 1 # +1 for index + assert rg_meta.total_byte_size > 0 + + col_meta = rg_meta.column(0) + assert col_meta.file_offset > 0 + assert col_meta.file_path == '' # created from BytesIO + assert col_meta.physical_type == 'BOOLEAN' + assert col_meta.num_values == 10000 + assert col_meta.path_in_schema == 'bool' + assert col_meta.is_stats_set is True + assert isinstance(col_meta.statistics, pq.RowGroupStatistics) + assert col_meta.compression == 'SNAPPY' + assert col_meta.encodings == ('PLAIN', 'RLE') + assert col_meta.has_dictionary_page is False + assert col_meta.dictionary_page_offset is None + assert col_meta.data_page_offset > 0 + assert col_meta.total_compressed_size > 0 + assert col_meta.total_uncompressed_size > 0 + with pytest.raises(NotImplementedError): + col_meta.has_index_page + with pytest.raises(NotImplementedError): + col_meta.index_page_offset @pytest.mark.parametrize( - 'data, dtype, min_value, max_value, null_count, num_values', + ( + 'data', + 'type', + 'physical_type', + 'min_value', + 'max_value', + 'null_count', + 'num_values', + 'distinct_count' + ), [ - ([1, 2, 2, None, 4], np.uint8, 1, 4, 1, 4), - ([1, 2, 2, None, 4], np.uint16, 1, 4, 1, 4), - ([1, 2, 2, None, 4], np.uint32, 1, 4, 1, 4), - ([1, 2, 2, None, 4], np.uint64, 1, 4, 1, 4), - ([-1, 2, 2, None, 4], np.int16, -1, 4, 1, 4), - ([-1, 2, 2, None, 4], np.int32, -1, 4, 1, 4), - ([-1, 2, 2, None, 4], np.int64, -1, 4, 1, 4), - ([-1.1, 2.2, 2.3, None, 4.4], np.float32, -1.1, 4.4, 1, 4), - ([-1.1, 2.2, 2.3, None, 4.4], np.float64, -1.1, 4.4, 1, 4), + ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0), + ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0), + ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0), + ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0), + ( + [-1.1, 2.2, 2.3, None, 4.4], pa.float32(), + 'FLOAT', -1.1, 4.4, 1, 4, 0 + ), ( - [u'', u'b', unichar(1000), None, u'aaa'], - object, b'', unichar(1000).encode('utf-8'), 1, 4 + [-1.1, 2.2, 2.3, None, 4.4], pa.float64(), + 'DOUBLE', -1.1, 4.4, 1, 4, 0 + ), + ( + [u'', u'b', unichar(1000), None, u'aaa'], pa.binary(), + 'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, 0 + ), + ( + [True, False, False, True, True], pa.bool_(), + 'BOOLEAN', False, True, 0, 5, 0 ), - ([True, False, False, True, True], np.bool, False, True, 0, 5), ] ) -def test_parquet_column_statistics_api( - data, - dtype, - min_value, - max_value, - null_count, - num_values): - df = pd.DataFrame({'data': data}, dtype=dtype) - - fileh = make_sample_file(df) +def test_parquet_column_statistics_api(data, type, physical_type, min_value, + max_value, null_count, num_values, + distinct_count): + df = pd.DataFrame({'data': data}) + schema = pa.schema([pa.field('data', type)]) + table = pa.Table.from_pandas(df, schema=schema) + fileh = make_sample_file(table) meta = fileh.metadata @@ -599,26 +646,43 @@ def test_parquet_column_statistics_api( col_meta = rg_meta.column(0) stat = col_meta.statistics + assert stat.has_min_max assert stat.min == min_value assert stat.max == max_value assert stat.null_count == null_count assert stat.num_values == num_values + # TODO(kszucs) until parquet-cpp API doesn't expose HasDistinctCount + # method, missing distinct_count is represented as zero instead of None + assert stat.distinct_count == distinct_count + assert stat.physical_type == physical_type def test_compare_schemas(): + import pyarrow.parquet as pq + df = alltypes_sample(size=10000) fileh = make_sample_file(df) fileh2 = make_sample_file(df) fileh3 = make_sample_file(df[df.columns[::2]]) + # ParquetSchema + assert isinstance(fileh.schema, pq.ParquetSchema) assert fileh.schema.equals(fileh.schema) + assert fileh.schema == fileh.schema assert fileh.schema.equals(fileh2.schema) - + assert fileh.schema == fileh2.schema + assert fileh.schema != 'arbitrary object' assert not fileh.schema.equals(fileh3.schema) + assert fileh.schema != fileh3.schema + # ColumnSchema + assert isinstance(fileh.schema[0], pq.ColumnSchema) assert fileh.schema[0].equals(fileh.schema[0]) + assert fileh.schema[0] == fileh.schema[0] assert not fileh.schema[0].equals(fileh.schema[1]) + assert fileh.schema[0] != fileh.schema[1] + assert fileh.schema[0] != 'arbitrary object' def test_column_of_arrays(tmpdir):