From c0e2a08cad194990fcdf665184615446f6c08a9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 28 Jun 2018 17:05:47 +0200 Subject: [PATCH 01/16] replace old property syntax; move indent to formatting; test missing column stat properties --- python/pyarrow/_parquet.pxd | 2 +- python/pyarrow/_parquet.pyx | 408 ++++++++++++--------------- python/pyarrow/formatting.py | 15 +- python/pyarrow/tests/test_parquet.py | 52 ++-- 4 files changed, 221 insertions(+), 256 deletions(-) diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index ca20ce21b78c..7fdd66bb7ae8 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -41,7 +41,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil: cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: - enum ParquetType" parquet::Type::type": + cpdef enum ParquetType" parquet::Type::type": ParquetType_BOOLEAN" parquet::Type::BOOLEAN" ParquetType_INT32" parquet::Type::INT32" ParquetType_INT64" parquet::Type::INT64" diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 983ff8d8a9a4..65deb730a05a 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -31,16 +31,11 @@ from pyarrow.lib cimport (Array, Schema, NativeFile, get_reader, get_writer) from pyarrow.compat import tobytes, frombytes +from pyarrow.formatting import indent from pyarrow.lib import ArrowException, NativeFile, _stringify_path import six - -try: - from textwrap import indent -except ImportError: - def indent(text, prefix): - lines = [prefix + line for line in text.splitlines(True)] - return ''.join(lines) +import warnings cdef class RowGroupStatistics: @@ -95,49 +90,42 @@ cdef class RowGroupStatistics: else: raise ValueError('Unknown physical ParquetType') - property has_min_max: - - def __get__(self): - return self.statistics.get().HasMinMax() - - property min: - - def __get__(self): - raw_physical_type = 
self.statistics.get().physical_type() - encode_min = self.statistics.get().EncodeMin() - - min_value = FormatStatValue(raw_physical_type, encode_min.c_str()) - return self._cast_statistic(min_value) - - property max: - - def __get__(self): - raw_physical_type = self.statistics.get().physical_type() - encode_max = self.statistics.get().EncodeMax() + @property + def has_min_max(self): + return self.statistics.get().HasMinMax() - max_value = FormatStatValue(raw_physical_type, encode_max.c_str()) - return self._cast_statistic(max_value) + @property + def min(self): + raw_physical_type = self.statistics.get().physical_type() + encode_min = self.statistics.get().EncodeMin() - property null_count: + min_value = FormatStatValue(raw_physical_type, encode_min.c_str()) + return self._cast_statistic(min_value) - def __get__(self): - return self.statistics.get().null_count() + @property + def max(self): + raw_physical_type = self.statistics.get().physical_type() + encode_max = self.statistics.get().EncodeMax() - property distinct_count: + max_value = FormatStatValue(raw_physical_type, encode_max.c_str()) + return self._cast_statistic(max_value) - def __get__(self): - return self.statistics.get().distinct_count() - - property num_values: + @property + def null_count(self): + return self.statistics.get().null_count() - def __get__(self): - return self.statistics.get().num_values() + @property + def distinct_count(self): + return self.statistics.get().distinct_count() - property physical_type: + @property + def num_values(self): + return self.statistics.get().num_values() - def __get__(self): - physical_type = self.statistics.get().physical_type() - return physical_type_name_from_enum(physical_type) + @property + def physical_type(self): + raw_physical_type = self.statistics.get().physical_type() + return physical_type_name_from_enum(raw_physical_type) cdef class ColumnChunkMetaData: @@ -187,86 +175,72 @@ cdef class ColumnChunkMetaData: self.total_compressed_size, 
self.total_uncompressed_size) - property file_offset: - - def __get__(self): - return self.metadata.file_offset() - - property file_path: - - def __get__(self): - return frombytes(self.metadata.file_path()) - - property type: - - def __get__(self): - return physical_type_name_from_enum(self.metadata.type()) - - property num_values: - - def __get__(self): - return self.metadata.num_values() - - property path_in_schema: - - def __get__(self): - path = self.metadata.path_in_schema().get().ToDotString() - return frombytes(path) - - property is_stats_set: - - def __get__(self): - return self.metadata.is_stats_set() - - property statistics: - - def __get__(self): - if not self.metadata.is_stats_set(): - return None - statistics = RowGroupStatistics() - statistics.init(self.metadata.statistics()) - return statistics - - property compression: - - def __get__(self): - return self.metadata.compression() + @property + def file_offset(self): + return self.metadata.file_offset() - property encodings: + @property + def file_path(self): + return frombytes(self.metadata.file_path()) - def __get__(self): - return map(encoding_name_from_enum, - self.metadata.encodings()) + @property + def type(self): + # XXX: shouldn't this be called pshysical_type like in + # RowGroupStatistics? 
+ return physical_type_name_from_enum(self.metadata.type()) - property has_dictionary_page: + @property + def num_values(self): + return self.metadata.num_values() - def __get__(self): - return self.metadata.has_dictionary_page() + @property + def path_in_schema(self): + path = self.metadata.path_in_schema().get().ToDotString() + return frombytes(path) - property dictionary_page_offset: + @property + def is_stats_set(self): + return self.metadata.is_stats_set() - def __get__(self): - return self.metadata.dictionary_page_offset() + @property + def statistics(self): + if not self.metadata.is_stats_set(): + return None + statistics = RowGroupStatistics() + statistics.init(self.metadata.statistics()) + return statistics - property data_page_offset: + @property + def compression(self): + return self.metadata.compression() - def __get__(self): - return self.metadata.data_page_offset() + @property + def encodings(self): + return map(encoding_name_from_enum, self.metadata.encodings()) - property index_page_offset: + @property + def has_dictionary_page(self): + return self.metadata.has_dictionary_page() - def __get__(self): - return self.metadata.index_page_offset() + @property + def dictionary_page_offset(self): + return self.metadata.dictionary_page_offset() - property total_compressed_size: + @property + def data_page_offset(self): + return self.metadata.data_page_offset() - def __get__(self): - return self.metadata.total_compressed_size() + @property + def index_page_offset(self): + return self.metadata.index_page_offset() - property total_uncompressed_size: + @property + def total_compressed_size(self): + return self.metadata.total_compressed_size() - def __get__(self): - return self.metadata.total_uncompressed_size() + @property + def total_uncompressed_size(self): + return self.metadata.total_uncompressed_size() cdef class RowGroupMetaData: @@ -299,20 +273,17 @@ cdef class RowGroupMetaData: self.num_rows, self.total_byte_size) - property num_columns: - - def 
__get__(self): - return self.metadata.num_columns() - - property num_rows: - - def __get__(self): - return self.metadata.num_rows() + @property + def num_columns(self): + return self.metadata.num_columns() - property total_byte_size: + @property + def num_rows(self): + return self.metadata.num_rows() - def __get__(self): - return self.metadata.total_byte_size() + @property + def total_byte_size(self): + return self.metadata.total_byte_size() cdef class FileMetaData: @@ -351,43 +322,37 @@ cdef class FileMetaData: self._schema = schema return schema - property serialized_size: - - def __get__(self): - return self._metadata.size() - - property num_columns: - - def __get__(self): - return self._metadata.num_columns() - - property num_rows: - - def __get__(self): - return self._metadata.num_rows() - - property num_row_groups: + @property + def serialized_size(self): + return self._metadata.size() - def __get__(self): - return self._metadata.num_row_groups() + @property + def num_columns(self): + return self._metadata.num_columns() - property format_version: + @property + def num_rows(self): + return self._metadata.num_rows() - def __get__(self): - cdef ParquetVersion version = self._metadata.version() - if version == ParquetVersion_V1: - return '1.0' - if version == ParquetVersion_V2: - return '2.0' - else: - print('Unrecognized file version, assuming 1.0: {0}' - .format(version)) - return '1.0' + @property + def num_row_groups(self): + return self._metadata.num_row_groups() - property created_by: + @property + def format_version(self): + cdef ParquetVersion version = self._metadata.version() + if version == ParquetVersion_V1: + return '1.0' + if version == ParquetVersion_V2: + return '2.0' + else: + # TODO(kszucs) warn instead of print + print('Unrecognized file version, assuming 1.0: {}'.format(version)) + return '1.0' - def __get__(self): - return frombytes(self._metadata.created_by()) + @property + def created_by(self): + return 
frombytes(self._metadata.created_by()) def row_group(self, int i): """ @@ -444,10 +409,9 @@ cdef class ParquetSchema: def __getitem__(self, i): return self.column(i) - property names: - - def __get__(self): - return [self[i].name for i in range(len(self))] + @property + def names(self): + return [self[i].name for i in range(len(self))] def to_arrow_schema(self): """ @@ -457,8 +421,7 @@ cdef class ParquetSchema: ------- schema : pyarrow.Schema """ - cdef: - shared_ptr[CSchema] sp_arrow_schema + cdef shared_ptr[CSchema] sp_arrow_schema with nogil: check_status(FromParquetSchema( @@ -467,6 +430,8 @@ cdef class ParquetSchema: return pyarrow_wrap_schema(sp_arrow_schema) + # TODO(kszucs): impl __eq__ + def equals(self, ParquetSchema other): """ Returns True if the Parquet schemas are equal @@ -477,9 +442,7 @@ cdef class ParquetSchema: if i < 0 or i >= len(self): raise IndexError('{0} out of bounds'.format(i)) - cdef ColumnSchema col = ColumnSchema() - col.init_from_schema(self, i) - return col + return ColumnSchema(self, i) cdef class ColumnSchema: @@ -487,13 +450,12 @@ cdef class ColumnSchema: ParquetSchema parent const ColumnDescriptor* descr - def __cinit__(self): - self.descr = NULL - - cdef init_from_schema(self, ParquetSchema schema, int i): - self.parent = schema + def __cinit__(self, ParquetSchema schema, int i): + self.parent = schema # XXX why is it needed? ownership? 
self.descr = schema.schema.Column(i) + # TODO(kszucs) impl __eq__ + def equals(self, ColumnSchema other): """ Returns True if the column schemas are equal @@ -520,52 +482,43 @@ cdef class ColumnSchema: self.max_repetition_level, physical_type, logical_type) - property name: - - def __get__(self): - return frombytes(self.descr.name()) - - property path: - - def __get__(self): - return frombytes(self.descr.path().get().ToDotString()) - - property max_definition_level: - - def __get__(self): - return self.descr.max_definition_level() - - property max_repetition_level: + @property + def name(self): + return frombytes(self.descr.name()) - def __get__(self): - return self.descr.max_repetition_level() + @property + def path(self): + return frombytes(self.descr.path().get().ToDotString()) - property physical_type: + @property + def max_definition_level(self): + return self.descr.max_definition_level() - def __get__(self): - return physical_type_name_from_enum(self.descr.physical_type()) + @property + def max_repetition_level(self): + return self.descr.max_repetition_level() - property logical_type: + @property + def physical_type(self): + return physical_type_name_from_enum(self.descr.physical_type()) - def __get__(self): - return logical_type_name_from_enum(self.descr.logical_type()) + @property + def logical_type(self): + return logical_type_name_from_enum(self.descr.logical_type()) # FIXED_LEN_BYTE_ARRAY attribute - property length: - - def __get__(self): - return self.descr.type_length() + @property + def length(self): + return self.descr.type_length() # Decimal attributes - property precision: - - def __get__(self): - return self.descr.type_precision() - - property scale: + @property + def precision(self): + return self.descr.type_precision() - def __get__(self): - return self.descr.type_scale() + @property + def scale(self): + return self.descr.type_scale() cdef physical_type_name_from_enum(ParquetType type_): @@ -654,22 +607,21 @@ cdef class ParquetReader: 
check_status(OpenFile(rd_handle, self.allocator, properties, c_metadata, &self.reader)) - property column_paths: - - def __get__(self): - cdef: - FileMetaData container = self.metadata - const CFileMetaData* metadata = container._metadata - vector[c_string] path - int i = 0 + @property + def column_paths(self): + cdef: + FileMetaData container = self.metadata + const CFileMetaData* metadata = container._metadata + vector[c_string] path + int i = 0 - paths = [] - for i in range(0, metadata.num_columns()): - path = (metadata.schema().Column(i) - .path().get().ToDotVector()) - paths.append([frombytes(x) for x in path]) + paths = [] + for i in range(0, metadata.num_columns()): + path = (metadata.schema().Column(i) + .path().get().ToDotVector()) + paths.append([frombytes(x) for x in path]) - return paths + return paths @property def metadata(self): @@ -686,10 +638,9 @@ cdef class ParquetReader: result.init(metadata) return result - property num_row_groups: - - def __get__(self): - return self.reader.get().num_row_groups() + @property + def num_row_groups(self): + return self.reader.get().num_row_groups() def set_num_threads(self, int nthreads): self.reader.get().set_num_threads(nthreads) @@ -809,26 +760,27 @@ cdef class ParquetReader: array.init(carray) return array + cdef int check_compression_name(name) except -1: - if name.upper() not in ['NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', - 'ZSTD']: + if name.upper() not in {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', + 'ZSTD'}: raise ArrowException("Unsupported compression: " + name) return 0 cdef ParquetCompression compression_from_name(str name): name = name.upper() - if name == "SNAPPY": + if name == 'SNAPPY': return ParquetCompression_SNAPPY - elif name == "GZIP": + elif name == 'GZIP': return ParquetCompression_GZIP - elif name == "LZO": + elif name == 'LZO': return ParquetCompression_LZO - elif name == "BROTLI": + elif name == 'BROTLI': return ParquetCompression_BROTLI - elif name == "LZ4": + elif name == 
'LZ4': return ParquetCompression_LZ4 - elif name == "ZSTD": + elif name == 'ZSTD': return ParquetCompression_ZSTD else: return ParquetCompression_UNCOMPRESSED diff --git a/python/pyarrow/formatting.py b/python/pyarrow/formatting.py index eea3e74d881f..5ef9482ed144 100644 --- a/python/pyarrow/formatting.py +++ b/python/pyarrow/formatting.py @@ -20,6 +20,12 @@ import pyarrow.lib as lib import warnings +try: + from textwrap import indent +except ImportError: + def indent(text, prefix): + return ''.join(prefix + line for line in text.splitlines(True)) + def array_format(arr, window=10): warnings.warn("array_format is deprecated, use Array.format() instead", @@ -32,13 +38,6 @@ def value_format(x, indent_level=0): FutureWarning) if isinstance(x, lib.ListValue): contents = ',\n'.join(value_format(item) for item in x) - return '[{0}]'.format(_indent(contents, 1).strip()) + return '[{0}]'.format(indent(contents, ' ').strip()) else: return repr(x) - - -def _indent(text, spaces): - if spaces == 0: - return text - block = ' ' * spaces - return '\n'.join(block + x for x in text.split('\n')) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index c9c1a96db471..b77ad34ae0a8 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -564,31 +564,42 @@ def test_parquet_metadata_api(): @pytest.mark.parametrize( - 'data, dtype, min_value, max_value, null_count, num_values', + ( + 'data', + 'dtype', + 'physical_type', + 'min_value', + 'max_value', + 'null_count', + 'num_values', + 'distinct_count' + ), [ - ([1, 2, 2, None, 4], np.uint8, 1, 4, 1, 4), - ([1, 2, 2, None, 4], np.uint16, 1, 4, 1, 4), - ([1, 2, 2, None, 4], np.uint32, 1, 4, 1, 4), - ([1, 2, 2, None, 4], np.uint64, 1, 4, 1, 4), - ([-1, 2, 2, None, 4], np.int16, -1, 4, 1, 4), - ([-1, 2, 2, None, 4], np.int32, -1, 4, 1, 4), - ([-1, 2, 2, None, 4], np.int64, -1, 4, 1, 4), - ([-1.1, 2.2, 2.3, None, 4.4], np.float32, -1.1, 4.4, 1, 4), - ([-1.1, 2.2, 2.3, 
None, 4.4], np.float64, -1.1, 4.4, 1, 4), + ([1, 2, 2, None, 4], np.uint8, 'INT64', 1, 4, 1, 4, 0), + ([1, 2, 2, None, 4], np.uint16, 'INT64', 1, 4, 1, 4, 0), + ([1, 2, 2, None, 4], np.uint32, 'INT64', 1, 4, 1, 4, 0), + ([1, 2, 2, None, 4], np.uint64, 'INT64', 1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], np.int16, 'INT64', -1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], np.int32, 'INT64', -1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], np.int64, 'INT64', -1, 4, 1, 4, 0), + ([-1.1, 2.2, 2.3, None, 4.4], np.float32, 'FLOAT', -1.1, 4.4, 1, 4, 0), + ( + [-1.1, 2.2, 2.3, None, 4.4], + np.float64, 'DOUBLE', -1.1, 4.4, 1, 4, 0 + ), ( [u'', u'b', unichar(1000), None, u'aaa'], - object, b'', unichar(1000).encode('utf-8'), 1, 4 + object, 'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, 0 + ), + ( + [True, False, False, True, True], + np.bool, 'BOOLEAN', False, True, 0, 5, 0 ), - ([True, False, False, True, True], np.bool, False, True, 0, 5), ] ) -def test_parquet_column_statistics_api( - data, - dtype, - min_value, - max_value, - null_count, - num_values): +def test_parquet_column_statistics_api(data, dtype, physical_type, min_value, + max_value, null_count, num_values, + distinct_count): df = pd.DataFrame({'data': data}, dtype=dtype) fileh = make_sample_file(df) @@ -599,10 +610,13 @@ def test_parquet_column_statistics_api( col_meta = rg_meta.column(0) stat = col_meta.statistics + assert stat.has_min_max assert stat.min == min_value assert stat.max == max_value assert stat.null_count == null_count assert stat.num_values == num_values + assert stat.distinct_count == distinct_count + assert stat.physical_type == physical_type def test_compare_schemas(): From 06e4f8ed370ddf7db49dfac7d978e0927306d791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 28 Jun 2018 17:07:49 +0200 Subject: [PATCH 02/16] remove enum cpdef --- python/pyarrow/_parquet.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_parquet.pxd 
b/python/pyarrow/_parquet.pxd index 7fdd66bb7ae8..ca20ce21b78c 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -41,7 +41,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil: cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: - cpdef enum ParquetType" parquet::Type::type": + enum ParquetType" parquet::Type::type": ParquetType_BOOLEAN" parquet::Type::BOOLEAN" ParquetType_INT32" parquet::Type::INT32" ParquetType_INT64" parquet::Type::INT64" From c58441a45c9c146c4786f7ae7bed0d0259342d3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 28 Jun 2018 17:21:39 +0200 Subject: [PATCH 03/16] due to receiving extension class as argument it's possible to use directly __cinit__ constructors instead of custom init methods --- python/pyarrow/_parquet.pyx | 56 +++++++++++++------------------------ 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 65deb730a05a..773dd541d9c6 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -249,10 +249,7 @@ cdef class RowGroupMetaData: CRowGroupMetaData* metadata FileMetaData parent - def __cinit__(self): - pass - - cdef void init_from_file(self, FileMetaData parent, int i): + def __cinit__(self, FileMetaData parent, int i): if i < 0 or i >= parent.num_row_groups: raise IndexError('{0} out of bounds'.format(i)) self.up_metadata = parent._metadata.RowGroup(i) @@ -314,13 +311,9 @@ cdef class FileMetaData: @property def schema(self): - if self._schema is not None: - return self._schema - - cdef ParquetSchema schema = ParquetSchema() - schema.init_from_filemeta(self) - self._schema = schema - return schema + if self._schema is None: + self._schema = ParquetSchema(self) + return self._schema @property def serialized_size(self): @@ -354,26 +347,20 @@ cdef class FileMetaData: def created_by(self): return frombytes(self._metadata.created_by()) - def
row_group(self, int i): - """ - - """ - cdef RowGroupMetaData result = RowGroupMetaData() - result.init_from_file(self, i) - return result - - property metadata: + @property + def metadata(self): + cdef: + unordered_map[c_string, c_string] metadata + const CKeyValueMetadata* underlying_metadata + underlying_metadata = self._metadata.key_value_metadata().get() + if underlying_metadata != NULL: + underlying_metadata.ToUnorderedMap(&metadata) + return metadata + else: + return None - def __get__(self): - cdef: - unordered_map[c_string, c_string] metadata - const CKeyValueMetadata* underlying_metadata - underlying_metadata = self._metadata.key_value_metadata().get() - if underlying_metadata != NULL: - underlying_metadata.ToUnorderedMap(&metadata) - return metadata - else: - return None + def row_group(self, int i): + return RowGroupMetaData(self, i) cdef class ParquetSchema: @@ -381,8 +368,9 @@ cdef class ParquetSchema: FileMetaData parent # the FileMetaData owning the SchemaDescriptor const SchemaDescriptor* schema - def __cinit__(self): - self.schema = NULL + def __cinit__(self, FileMetaData container): + self.parent = container + self.schema = container._metadata.schema() def __repr__(self): cdef const ColumnDescriptor* descr @@ -399,10 +387,6 @@ cdef class ParquetSchema: {1} """.format(object.__repr__(self), '\n'.join(elements)) - cdef init_from_filemeta(self, FileMetaData container): - self.parent = container - self.schema = container._metadata.schema() - def __len__(self): return self.schema.num_columns() From 266a022b0f7f5b425edc1530112d6ce3e4a287b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 28 Jun 2018 18:20:27 +0200 Subject: [PATCH 04/16] implement equality operator on ParquetSchema and ColumnSchema --- python/pyarrow/_parquet.pyx | 12 ++++++++++-- python/pyarrow/tests/test_parquet.py | 10 +++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx 
index 773dd541d9c6..1f60014bd728 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -414,7 +414,11 @@ cdef class ParquetSchema: return pyarrow_wrap_schema(sp_arrow_schema) - # TODO(kszucs): impl __eq__ + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return False def equals(self, ParquetSchema other): """ @@ -438,7 +442,11 @@ cdef class ColumnSchema: self.parent = schema # XXX why is it needed? ownership? self.descr = schema.schema.Column(i) - # TODO(kszucs) impl __eq__ + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return False def equals(self, ColumnSchema other): """ diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index b77ad34ae0a8..8b2e11bc7536 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -527,6 +527,8 @@ def test_parquet_metadata_api(): assert meta.num_row_groups == 1 assert meta.format_version == '2.0' assert 'parquet-cpp' in meta.created_by + assert isinstance(meta.serialized_size, int) + assert isinstance(meta.metadata, dict) # Schema schema = fileh.schema @@ -627,12 +629,18 @@ def test_compare_schemas(): fileh3 = make_sample_file(df[df.columns[::2]]) assert fileh.schema.equals(fileh.schema) + assert fileh.schema == fileh.schema assert fileh.schema.equals(fileh2.schema) - + assert fileh.schema == fileh2.schema + assert fileh.schema != 'arbitrary object' assert not fileh.schema.equals(fileh3.schema) + assert fileh.schema != fileh3.schema assert fileh.schema[0].equals(fileh.schema[0]) + assert fileh.schema[0] == fileh.schema[0] assert not fileh.schema[0].equals(fileh.schema[1]) + assert fileh.schema[0] != fileh.schema[1] + assert fileh.schema[0] != 'arbitrary object' def test_column_of_arrays(tmpdir): From a43153e960288d67ba44d127fae1f25972f11534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 28 Jun 2018 19:28:32 +0200 Subject: [PATCH 
05/16] warn instead of print --- python/pyarrow/_parquet.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 1f60014bd728..369023cc46a0 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -339,8 +339,8 @@ cdef class FileMetaData: if version == ParquetVersion_V2: return '2.0' else: - # TODO(kszucs) warn instead of print - print('Unrecognized file version, assuming 1.0: {}'.format(version)) + warnings.warn('Unrecognized file version, assuming 1.0: {}' + .format(version)) return '1.0' @property From dd362295a80c0542be23c34c287051f036547dbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 29 Jun 2018 08:44:46 +0200 Subject: [PATCH 06/16] rename type property to physical_type --- python/pyarrow/_parquet.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 369023cc46a0..cebcd0d3bffe 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -184,9 +184,7 @@ cdef class ColumnChunkMetaData: return frombytes(self.metadata.file_path()) @property - def type(self): - # XXX: shouldn't this be called pshysical_type like in - # RowGroupStatistics? 
+ def physical_type(self): return physical_type_name_from_enum(self.metadata.type()) @property From d55b96a8131bc51b17525dac8a9364d0a9899ee1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 4 Jul 2018 12:29:29 +0200 Subject: [PATCH 07/16] fix types --- python/pyarrow/_parquet.pyx | 4 +-- python/pyarrow/tests/test_parquet.py | 50 ++++++++++++++++------------ 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index cebcd0d3bffe..530f3987066c 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -145,7 +145,7 @@ cdef class ColumnChunkMetaData: return """{0} file_offset: {1} file_path: {2} - type: {3} + physical_type: {3} num_values: {4} path_in_schema: {5} is_stats_set: {6} @@ -161,7 +161,7 @@ cdef class ColumnChunkMetaData: total_uncompressed_size: {15}""".format(object.__repr__(self), self.file_offset, self.file_path, - self.type, + self.physical_type, self.num_values, self.path_in_schema, self.is_stats_set, diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 8b2e11bc7536..f7ad30771eb6 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -499,10 +499,13 @@ def test_pandas_parquet_configuration_options(tmpdir): tm.assert_frame_equal(df, df_read) -def make_sample_file(df): +def make_sample_file(table_or_df): import pyarrow.parquet as pq - a_table = pa.Table.from_pandas(df) + if isinstance(table_or_df, pa.Table): + a_table = table_or_df + else: + a_table = pa.Table.from_pandas(table_or_df) buf = io.BytesIO() _write_table(a_table, buf, compression='SNAPPY', version='2.0', @@ -568,7 +571,7 @@ def test_parquet_metadata_api(): @pytest.mark.parametrize( ( 'data', - 'dtype', + 'type', 'physical_type', 'min_value', 'max_value', @@ -577,34 +580,39 @@ def test_parquet_metadata_api(): 'distinct_count' ), [ - ([1, 2, 2, None, 4], np.uint8, 'INT64', 1, 4, 1, 4, 0), - 
([1, 2, 2, None, 4], np.uint16, 'INT64', 1, 4, 1, 4, 0), - ([1, 2, 2, None, 4], np.uint32, 'INT64', 1, 4, 1, 4, 0), - ([1, 2, 2, None, 4], np.uint64, 'INT64', 1, 4, 1, 4, 0), - ([-1, 2, 2, None, 4], np.int16, 'INT64', -1, 4, 1, 4, 0), - ([-1, 2, 2, None, 4], np.int32, 'INT64', -1, 4, 1, 4, 0), - ([-1, 2, 2, None, 4], np.int64, 'INT64', -1, 4, 1, 4, 0), - ([-1.1, 2.2, 2.3, None, 4.4], np.float32, 'FLOAT', -1.1, 4.4, 1, 4, 0), + ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0), + ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0), + ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0), + ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0), + ( + [-1.1, 2.2, 2.3, None, 4.4], pa.float32(), + 'FLOAT', -1.1, 4.4, 1, 4, 0 + ), ( - [-1.1, 2.2, 2.3, None, 4.4], - np.float64, 'DOUBLE', -1.1, 4.4, 1, 4, 0 + [-1.1, 2.2, 2.3, None, 4.4], pa.float64(), + 'DOUBLE', -1.1, 4.4, 1, 4, 0 ), ( - [u'', u'b', unichar(1000), None, u'aaa'], - object, 'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, 0 + [u'', u'b', unichar(1000), None, u'aaa'], pa.binary(), + 'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, 0 ), ( - [True, False, False, True, True], - np.bool, 'BOOLEAN', False, True, 0, 5, 0 + [True, False, False, True, True], pa.bool_(), + 'BOOLEAN', False, True, 0, 5, 0 ), ] ) -def test_parquet_column_statistics_api(data, dtype, physical_type, min_value, +def test_parquet_column_statistics_api(data, type, physical_type, min_value, max_value, null_count, num_values, distinct_count): - df = pd.DataFrame({'data': data}, dtype=dtype) - - fileh = make_sample_file(df) + df = pd.DataFrame({'data': data}) + schema = pa.schema([pa.field('data', type)]) + table = pa.Table.from_pandas(df, schema=schema) + fileh = 
make_sample_file(table) meta = fileh.metadata From ffa104136cd845b54dce233f26869610be427a56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 4 Jul 2018 13:18:55 +0200 Subject: [PATCH 08/16] expected distinct_count value is None --- python/pyarrow/_parquet.pyx | 2 +- python/pyarrow/tests/test_parquet.py | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 530f3987066c..722d2e0226cb 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -437,7 +437,7 @@ cdef class ColumnSchema: const ColumnDescriptor* descr def __cinit__(self, ParquetSchema schema, int i): - self.parent = schema # XXX why is it needed? ownership? + self.parent = schema self.descr = schema.schema.Column(i) def __eq__(self, other): diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index f7ad30771eb6..cdc52cf48162 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -580,29 +580,29 @@ def test_parquet_metadata_api(): 'distinct_count' ), [ - ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0), - ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0), - ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0), - ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0), - ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0), - ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0), - ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0), - ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0), + ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, None), + ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, None), + ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, None), + ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, None), + ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, None), + ([-1, 2, 2, None, 4], 
pa.int16(), 'INT32', -1, 4, 1, 4, None), + ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, None), + ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, None), ( [-1.1, 2.2, 2.3, None, 4.4], pa.float32(), - 'FLOAT', -1.1, 4.4, 1, 4, 0 + 'FLOAT', -1.1, 4.4, 1, 4, None ), ( [-1.1, 2.2, 2.3, None, 4.4], pa.float64(), - 'DOUBLE', -1.1, 4.4, 1, 4, 0 + 'DOUBLE', -1.1, 4.4, 1, 4, None ), ( [u'', u'b', unichar(1000), None, u'aaa'], pa.binary(), - 'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, 0 + 'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, None ), ( [True, False, False, True, True], pa.bool_(), - 'BOOLEAN', False, True, 0, 5, 0 + 'BOOLEAN', False, True, 0, 5, None ), ] ) From 1e1c7cd60ede886ece7d1b200c23a05b6988f77e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 25 Jul 2018 11:12:21 +0200 Subject: [PATCH 09/16] test and fix column chunk metadata properties --- python/pyarrow/_parquet.pyx | 86 ++++++++++++++++------------ python/pyarrow/tests/test_parquet.py | 24 ++++++++ 2 files changed, 73 insertions(+), 37 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 722d2e0226cb..cb9f17df472f 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -210,15 +210,15 @@ cdef class ColumnChunkMetaData: @property def compression(self): - return self.metadata.compression() + return compression_name_from_enum(self.metadata.compression()) @property def encodings(self): - return map(encoding_name_from_enum, self.metadata.encodings()) + return tuple(map(encoding_name_from_enum, self.metadata.encodings())) @property def has_dictionary_page(self): - return self.metadata.has_dictionary_page() + return bool(self.metadata.has_dictionary_page()) @property def dictionary_page_offset(self): @@ -552,19 +552,56 @@ cdef logical_type_name_from_enum(ParquetLogicalType type_): }.get(type_, 'UNKNOWN') -cdef encoding_name_from_enum (ParquetEncoding encoding_): +cdef 
encoding_name_from_enum(ParquetEncoding encoding_): return { - ParquetEncoding_PLAIN: "PLAIN", - ParquetEncoding_PLAIN_DICTIONARY: "PLAIN_DICTIONARY", - ParquetEncoding_RLE: "RLE", - ParquetEncoding_BIT_PACKED: "BIT_PACKED", - ParquetEncoding_DELTA_BINARY_PACKED: "DELTA_BINARY_PACKED", - ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: "DELTA_LENGTH_BYTE_ARRAY", - ParquetEncoding_DELTA_BYTE_ARRAY: "DELTA_BYTE_ARRAY", - ParquetEncoding_RLE_DICTIONARY: "RLE_DICTIONARY", + ParquetEncoding_PLAIN: 'PLAIN', + ParquetEncoding_PLAIN_DICTIONARY: 'PLAIN_DICTIONARY', + ParquetEncoding_RLE: 'RLE', + ParquetEncoding_BIT_PACKED: 'BIT_PACKED', + ParquetEncoding_DELTA_BINARY_PACKED: 'DELTA_BINARY_PACKED', + ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: 'DELTA_LENGTH_BYTE_ARRAY', + ParquetEncoding_DELTA_BYTE_ARRAY: 'DELTA_BYTE_ARRAY', + ParquetEncoding_RLE_DICTIONARY: 'RLE_DICTIONARY', }.get(encoding_, 'UNKNOWN') +cdef compression_name_from_enum(ParquetCompression compression_): + return { + ParquetCompression_UNCOMPRESSED: 'UNCOMPRESSED', + ParquetCompression_SNAPPY: 'SNAPPY', + ParquetCompression_GZIP: 'GZIP', + ParquetCompression_LZO: 'LZO', + ParquetCompression_BROTLI: 'BROTLI', + ParquetCompression_LZ4: 'LZ4', + ParquetCompression_ZSTD: 'ZSTD', + }.get(compression_, 'UNKNOWN') + + +cdef int check_compression_name(name) except -1: + if name.upper() not in {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', + 'ZSTD'}: + raise ArrowException("Unsupported compression: " + name) + return 0 + + +cdef ParquetCompression compression_from_name(str name): + name = name.upper() + if name == 'SNAPPY': + return ParquetCompression_SNAPPY + elif name == 'GZIP': + return ParquetCompression_GZIP + elif name == 'LZO': + return ParquetCompression_LZO + elif name == 'BROTLI': + return ParquetCompression_BROTLI + elif name == 'LZ4': + return ParquetCompression_LZ4 + elif name == 'ZSTD': + return ParquetCompression_ZSTD + else: + return ParquetCompression_UNCOMPRESSED + + cdef class ParquetReader: cdef: object 
source @@ -751,31 +788,6 @@ cdef class ParquetReader: return array -cdef int check_compression_name(name) except -1: - if name.upper() not in {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', - 'ZSTD'}: - raise ArrowException("Unsupported compression: " + name) - return 0 - - -cdef ParquetCompression compression_from_name(str name): - name = name.upper() - if name == 'SNAPPY': - return ParquetCompression_SNAPPY - elif name == 'GZIP': - return ParquetCompression_GZIP - elif name == 'LZO': - return ParquetCompression_LZO - elif name == 'BROTLI': - return ParquetCompression_BROTLI - elif name == 'LZ4': - return ParquetCompression_LZ4 - elif name == 'ZSTD': - return ParquetCompression_ZSTD - else: - return ParquetCompression_UNCOMPRESSED - - cdef class ParquetWriter: cdef: unique_ptr[FileWriter] writer diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index cdc52cf48162..a154cac63299 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -516,6 +516,9 @@ def make_sample_file(table_or_df): def test_parquet_metadata_api(): + import pyarrow.parquet as pq + import pyarrow._parquet as _pq + df = alltypes_sample(size=10000) df = df.reindex(columns=sorted(df.columns)) @@ -558,14 +561,35 @@ def test_parquet_metadata_api(): # Row group for rg in range(meta.num_row_groups): rg_meta = meta.row_group(rg) + assert isinstance(rg_meta, _pq.RowGroupMetaData) repr(rg_meta) for col in range(rg_meta.num_columns): col_meta = rg_meta.column(col) + assert isinstance(col_meta, _pq.ColumnChunkMetaData) repr(col_meta) + rg_meta = meta.row_group(0) assert rg_meta.num_rows == len(df) assert rg_meta.num_columns == ncols + 1 # +1 for index + assert rg_meta.total_byte_size > 0 + + col_meta = rg_meta.column(0) + assert col_meta.file_offset > 0 + assert col_meta.file_path == '' # created from BytesIO + assert col_meta.physical_type == 'BOOLEAN' + assert col_meta.num_values == 10000 + assert col_meta.path_in_schema == 
'bool' + assert col_meta.is_stats_set is True + assert isinstance(col_meta.statistics, _pq.RowGroupStatistics) + assert col_meta.compression == 'SNAPPY' + assert col_meta.encodings == ('PLAIN', 'RLE') + assert col_meta.has_dictionary_page is False + assert col_meta.dictionary_page_offset == 0 + assert col_meta.data_page_offset > 0 + assert col_meta.index_page_offset == 0 + assert col_meta.total_compressed_size > 0 + assert col_meta.total_uncompressed_size > 0 @pytest.mark.parametrize( From 6942eba11f0837ed142142055400f6c840f7a447 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 25 Jul 2018 11:19:42 +0200 Subject: [PATCH 10/16] comments in compare schemas --- python/pyarrow/tests/test_parquet.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index a154cac63299..9d94ef551d0d 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -654,12 +654,17 @@ def test_parquet_column_statistics_api(data, type, physical_type, min_value, def test_compare_schemas(): + import pyarrow.parquet as pq + import pyarrow._parquet as _pq + df = alltypes_sample(size=10000) fileh = make_sample_file(df) fileh2 = make_sample_file(df) fileh3 = make_sample_file(df[df.columns[::2]]) + # ParquetSchema + assert isinstance(fileh.schema, pq.ParquetSchema) assert fileh.schema.equals(fileh.schema) assert fileh.schema == fileh.schema assert fileh.schema.equals(fileh2.schema) @@ -668,6 +673,8 @@ def test_compare_schemas(): assert not fileh.schema.equals(fileh3.schema) assert fileh.schema != fileh3.schema + # ColumnSchema + assert isinstance(fileh.schema[0], _pq.ColumnSchema) assert fileh.schema[0].equals(fileh.schema[0]) assert fileh.schema[0] == fileh.schema[0] assert not fileh.schema[0].equals(fileh.schema[1]) From 40ba9651ea63482e285ed0e2b8a0317974acacf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 26 Jul 2018 
12:52:44 +0200 Subject: [PATCH 11/16] expose missing parquet classes to pq namespace --- python/pyarrow/parquet.py | 7 ++++--- python/pyarrow/tests/test_parquet.py | 10 ++++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 9c92737bf4b5..90ecdac0101a 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -33,9 +33,10 @@ import numpy as np from pyarrow.filesystem import FileSystem, LocalFileSystem, S3FSWrapper -from pyarrow._parquet import (ParquetReader, FileMetaData, # noqa - RowGroupMetaData, ParquetSchema) -import pyarrow._parquet as _parquet # noqa +from pyarrow._parquet import (ParquetReader, FileMetaData, RowGroupMetaData, + ColumnChunkMetaData, ParquetSchema, ColumnSchema, + RowGroupStatistics) # noqa +import pyarrow._parquet as _parquet import pyarrow.lib as lib import pyarrow as pa diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 9d94ef551d0d..19bec268609e 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -517,7 +517,6 @@ def make_sample_file(table_or_df): def test_parquet_metadata_api(): import pyarrow.parquet as pq - import pyarrow._parquet as _pq df = alltypes_sample(size=10000) df = df.reindex(columns=sorted(df.columns)) @@ -561,12 +560,12 @@ def test_parquet_metadata_api(): # Row group for rg in range(meta.num_row_groups): rg_meta = meta.row_group(rg) - assert isinstance(rg_meta, _pq.RowGroupMetaData) + assert isinstance(rg_meta, pq.RowGroupMetaData) repr(rg_meta) for col in range(rg_meta.num_columns): col_meta = rg_meta.column(col) - assert isinstance(col_meta, _pq.ColumnChunkMetaData) + assert isinstance(col_meta, pq.ColumnChunkMetaData) repr(col_meta) rg_meta = meta.row_group(0) @@ -581,7 +580,7 @@ def test_parquet_metadata_api(): assert col_meta.num_values == 10000 assert col_meta.path_in_schema == 'bool' assert col_meta.is_stats_set is True - assert 
isinstance(col_meta.statistics, _pq.RowGroupStatistics) + assert isinstance(col_meta.statistics, pq.RowGroupStatistics) assert col_meta.compression == 'SNAPPY' assert col_meta.encodings == ('PLAIN', 'RLE') assert col_meta.has_dictionary_page is False @@ -655,7 +654,6 @@ def test_parquet_column_statistics_api(data, type, physical_type, min_value, def test_compare_schemas(): import pyarrow.parquet as pq - import pyarrow._parquet as _pq df = alltypes_sample(size=10000) @@ -674,7 +672,7 @@ def test_compare_schemas(): assert fileh.schema != fileh3.schema # ColumnSchema - assert isinstance(fileh.schema[0], _pq.ColumnSchema) + assert isinstance(fileh.schema[0], pq.ColumnSchema) assert fileh.schema[0].equals(fileh.schema[0]) assert fileh.schema[0] == fileh.schema[0] assert not fileh.schema[0].equals(fileh.schema[1]) From 74d53bb9fafbb26ddba25ac31fab30e596885bd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 26 Jul 2018 14:14:57 +0200 Subject: [PATCH 12/16] missing distinct_count equals to zero --- python/pyarrow/_parquet.pyx | 5 ++++- python/pyarrow/tests/test_parquet.py | 28 +++++++++++++++------------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index cb9f17df472f..a31752066376 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -222,7 +222,10 @@ cdef class ColumnChunkMetaData: @property def dictionary_page_offset(self): - return self.metadata.dictionary_page_offset() + if self.has_dictionary_page: + return self.metadata.dictionary_page_offset() + else: + return None @property def data_page_offset(self): diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 19bec268609e..64254789c595 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -584,7 +584,7 @@ def test_parquet_metadata_api(): assert col_meta.compression == 'SNAPPY' assert col_meta.encodings == 
('PLAIN', 'RLE') assert col_meta.has_dictionary_page is False - assert col_meta.dictionary_page_offset == 0 + assert col_meta.dictionary_page_offset is None assert col_meta.data_page_offset > 0 assert col_meta.index_page_offset == 0 assert col_meta.total_compressed_size > 0 @@ -603,29 +603,29 @@ def test_parquet_metadata_api(): 'distinct_count' ), [ - ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, None), - ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, None), - ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, None), - ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, None), - ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, None), - ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, None), - ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, None), - ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, None), + ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0), + ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0), + ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0), + ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0), + ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0), ( [-1.1, 2.2, 2.3, None, 4.4], pa.float32(), - 'FLOAT', -1.1, 4.4, 1, 4, None + 'FLOAT', -1.1, 4.4, 1, 4, 0 ), ( [-1.1, 2.2, 2.3, None, 4.4], pa.float64(), - 'DOUBLE', -1.1, 4.4, 1, 4, None + 'DOUBLE', -1.1, 4.4, 1, 4, 0 ), ( [u'', u'b', unichar(1000), None, u'aaa'], pa.binary(), - 'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, None + 'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, 0 ), ( [True, False, False, True, True], pa.bool_(), - 'BOOLEAN', False, True, 0, 5, None + 'BOOLEAN', False, True, 0, 5, 0 ), ] ) @@ -648,6 +648,8 @@ def test_parquet_column_statistics_api(data, type, physical_type, min_value, assert 
stat.max == max_value assert stat.null_count == null_count assert stat.num_values == num_values + # TODO(kszucs) until parquet-cpp API doesn't expose HasDistinctCount + # method, missing distinct_count is represented as zero instead of None assert stat.distinct_count == distinct_count assert stat.physical_type == physical_type From 71f5edee85885b037ef3b2ab0615ea5e4890e49f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 26 Jul 2018 14:47:55 +0200 Subject: [PATCH 13/16] flake8 --- python/pyarrow/parquet.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 90ecdac0101a..2c1aef0e88a3 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -33,9 +33,10 @@ import numpy as np from pyarrow.filesystem import FileSystem, LocalFileSystem, S3FSWrapper -from pyarrow._parquet import (ParquetReader, FileMetaData, RowGroupMetaData, - ColumnChunkMetaData, ParquetSchema, ColumnSchema, - RowGroupStatistics) # noqa +from pyarrow._parquet import (ParquetReader, RowGroupStatistics, # noqa + FileMetaData, RowGroupMetaData, + ColumnChunkMetaData, + ParquetSchema, ColumnSchema) import pyarrow._parquet as _parquet import pyarrow.lib as lib import pyarrow as pa From b1e7bede03038b4bc64ae120e530d5e3c749b2d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 26 Jul 2018 16:48:45 +0200 Subject: [PATCH 14/16] raise NotImplementedError for index_page_offset --- python/pyarrow/_parquet.pyx | 12 +++-- python/pyarrow/tests/test_parquet.py | 5 +- python/run_test.sh | 70 ++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 6 deletions(-) create mode 100755 python/run_test.sh diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index a31752066376..c7d0908836f8 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -156,9 +156,8 @@ cdef class ColumnChunkMetaData: has_dictionary_page: {10} 
dictionary_page_offset: {11} data_page_offset: {12} - index_page_offset: {13} - total_compressed_size: {14} - total_uncompressed_size: {15}""".format(object.__repr__(self), + total_compressed_size: {13} + total_uncompressed_size: {14}""".format(object.__repr__(self), self.file_offset, self.file_path, self.physical_type, @@ -171,7 +170,6 @@ cdef class ColumnChunkMetaData: self.has_dictionary_page, self.dictionary_page_offset, self.data_page_offset, - self.index_page_offset, self.total_compressed_size, self.total_uncompressed_size) @@ -231,9 +229,13 @@ cdef class ColumnChunkMetaData: def data_page_offset(self): return self.metadata.data_page_offset() + @property + def has_index_page(self): + raise NotImplementedError('not supported in parquet-cpp') + @property def index_page_offset(self): - return self.metadata.index_page_offset() + raise NotImplementedError("parquet-cpp doesn't return valid values") @property def total_compressed_size(self): diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 64254789c595..cc86ef16e08a 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -586,9 +586,12 @@ def test_parquet_metadata_api(): assert col_meta.has_dictionary_page is False assert col_meta.dictionary_page_offset is None assert col_meta.data_page_offset > 0 - assert col_meta.index_page_offset == 0 assert col_meta.total_compressed_size > 0 assert col_meta.total_uncompressed_size > 0 + with pytest.raises(NotImplementedError): + col_meta.has_index_page + with pytest.raises(NotImplementedError): + col_meta.index_page_offset @pytest.mark.parametrize( diff --git a/python/run_test.sh b/python/run_test.sh new file mode 100755 index 000000000000..49113f87683e --- /dev/null +++ b/python/run_test.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +export ARROW_BUILD_TYPE=debug +export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX +export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX +export ARROW_HOME=$CONDA_PREFIX +export PARQUET_HOME=$CONDA_PREFIX +# export PYARROW_CMAKE_GENERATOR=Ninja + +pushd ../cpp/build + +cmake -GNinja \ + -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ + -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ + -DARROW_ORC=off \ + -DARROW_PYTHON=ON \ + -DARROW_PLASMA=ON \ + -DARROW_BUILD_TESTS=ON \ + -DARROW_EXTRA_ERROR_CONTEXT=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \ + .. +ninja +ninja install + +popd + +pushd ../../parquet-cpp/build + +cmake -GNinja \ + -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ + -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \ + -DPARQUET_BUILD_BENCHMARKS=OFF \ + -DPARQUET_BUILD_EXECUTABLES=ON \ + -DPARQUET_BUILD_TESTS=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \ + .. 
+ +ninja +ninja install + + +popd + + +export PYARROW_BUILD_TYPE=$ARROW_BUILD_TYPE +export PYARROW_WITH_PARQUET=1 +export PYARROW_WITH_PLASMA=1 +export PYARROW_WITH_ORC=0 + +python setup.py build_ext -q --inplace + +py.test -sv "$@" From 77c59d660c34b04cc83deeb963571c1234464c09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 26 Jul 2018 16:52:22 +0200 Subject: [PATCH 15/16] return NotImplemented from equality check --- python/pyarrow/_parquet.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index c7d0908836f8..1aa21244114e 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -421,7 +421,7 @@ cdef class ParquetSchema: try: return self.equals(other) except TypeError: - return False + return NotImplemented def equals(self, ParquetSchema other): """ @@ -449,7 +449,7 @@ cdef class ColumnSchema: try: return self.equals(other) except TypeError: - return False + return NotImplemented def equals(self, ColumnSchema other): """ From d6a7f7794dbe5f179cb0e19a10bf66e465c79193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 26 Jul 2018 16:53:03 +0200 Subject: [PATCH 16/16] remove accidentally committed test helper script --- python/run_test.sh | 70 ---------------------------------------------- 1 file changed, 70 deletions(-) delete mode 100755 python/run_test.sh diff --git a/python/run_test.sh b/python/run_test.sh deleted file mode 100755 index 49113f87683e..000000000000 --- a/python/run_test.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e - -export ARROW_BUILD_TYPE=debug -export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX -export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX -export ARROW_HOME=$CONDA_PREFIX -export PARQUET_HOME=$CONDA_PREFIX -# export PYARROW_CMAKE_GENERATOR=Ninja - -pushd ../cpp/build - -cmake -GNinja \ - -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ - -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ - -DARROW_ORC=off \ - -DARROW_PYTHON=ON \ - -DARROW_PLASMA=ON \ - -DARROW_BUILD_TESTS=ON \ - -DARROW_EXTRA_ERROR_CONTEXT=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \ - .. -ninja -ninja install - -popd - -pushd ../../parquet-cpp/build - -cmake -GNinja \ - -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ - -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \ - -DPARQUET_BUILD_BENCHMARKS=OFF \ - -DPARQUET_BUILD_EXECUTABLES=ON \ - -DPARQUET_BUILD_TESTS=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \ - .. - -ninja -ninja install - - -popd - - -export PYARROW_BUILD_TYPE=$ARROW_BUILD_TYPE -export PYARROW_WITH_PARQUET=1 -export PYARROW_WITH_PLASMA=1 -export PYARROW_WITH_ORC=0 - -python setup.py build_ext -q --inplace - -py.test -sv "$@"