From c0e2a08cad194990fcdf665184615446f6c08a9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Thu, 28 Jun 2018 17:05:47 +0200
Subject: [PATCH 01/16] replace old property syntax; move indent to formatting;
 test missing column stat properties

---
 python/pyarrow/_parquet.pxd          |   2 +-
 python/pyarrow/_parquet.pyx          | 408 ++++++++++++---------------
 python/pyarrow/formatting.py         |  15 +-
 python/pyarrow/tests/test_parquet.py |  52 ++--
 4 files changed, 221 insertions(+), 256 deletions(-)

diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index ca20ce21b78c..7fdd66bb7ae8 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -41,7 +41,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil:
 
 
 cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
-    enum ParquetType" parquet::Type::type":
+    cpdef enum ParquetType" parquet::Type::type":
         ParquetType_BOOLEAN" parquet::Type::BOOLEAN"
         ParquetType_INT32" parquet::Type::INT32"
         ParquetType_INT64" parquet::Type::INT64"
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 983ff8d8a9a4..65deb730a05a 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -31,16 +31,11 @@ from pyarrow.lib cimport (Array, Schema,
                           NativeFile, get_reader, get_writer)
 
 from pyarrow.compat import tobytes, frombytes
+from pyarrow.formatting import indent
 from pyarrow.lib import ArrowException, NativeFile, _stringify_path
 
 import six
-
-try:
-    from textwrap import indent
-except ImportError:
-    def indent(text, prefix):
-        lines = [prefix + line for line in text.splitlines(True)]
-        return ''.join(lines)
+import warnings
 
 
 cdef class RowGroupStatistics:
@@ -95,49 +90,42 @@ cdef class RowGroupStatistics:
         else:
             raise ValueError('Unknown physical ParquetType')
 
-    property has_min_max:
-
-        def __get__(self):
-            return self.statistics.get().HasMinMax()
-
-    property min:
-
-        def __get__(self):
-            raw_physical_type = self.statistics.get().physical_type()
-            encode_min = self.statistics.get().EncodeMin()
-
-            min_value = FormatStatValue(raw_physical_type, encode_min.c_str())
-            return self._cast_statistic(min_value)
-
-    property max:
-
-        def __get__(self):
-            raw_physical_type = self.statistics.get().physical_type()
-            encode_max = self.statistics.get().EncodeMax()
+    @property
+    def has_min_max(self):
+        return self.statistics.get().HasMinMax()
 
-            max_value = FormatStatValue(raw_physical_type, encode_max.c_str())
-            return self._cast_statistic(max_value)
+    @property
+    def min(self):
+        raw_physical_type = self.statistics.get().physical_type()
+        encode_min = self.statistics.get().EncodeMin()
 
-    property null_count:
+        min_value = FormatStatValue(raw_physical_type, encode_min.c_str())
+        return self._cast_statistic(min_value)
 
-        def __get__(self):
-            return self.statistics.get().null_count()
+    @property
+    def max(self):
+        raw_physical_type = self.statistics.get().physical_type()
+        encode_max = self.statistics.get().EncodeMax()
 
-    property distinct_count:
+        max_value = FormatStatValue(raw_physical_type, encode_max.c_str())
+        return self._cast_statistic(max_value)
 
-        def __get__(self):
-            return self.statistics.get().distinct_count()
-
-    property num_values:
+    @property
+    def null_count(self):
+        return self.statistics.get().null_count()
 
-        def __get__(self):
-            return self.statistics.get().num_values()
+    @property
+    def distinct_count(self):
+        return self.statistics.get().distinct_count()
 
-    property physical_type:
+    @property
+    def num_values(self):
+        return self.statistics.get().num_values()
 
-        def __get__(self):
-            physical_type = self.statistics.get().physical_type()
-            return physical_type_name_from_enum(physical_type)
+    @property
+    def physical_type(self):
+        raw_physical_type = self.statistics.get().physical_type()
+        return physical_type_name_from_enum(raw_physical_type)
 
 
 cdef class ColumnChunkMetaData:
@@ -187,86 +175,72 @@ cdef class ColumnChunkMetaData:
                                           self.total_compressed_size,
                                           self.total_uncompressed_size)
 
-    property file_offset:
-
-        def __get__(self):
-            return self.metadata.file_offset()
-
-    property file_path:
-
-        def __get__(self):
-            return frombytes(self.metadata.file_path())
-
-    property type:
-
-        def __get__(self):
-            return physical_type_name_from_enum(self.metadata.type())
-
-    property num_values:
-
-        def __get__(self):
-            return self.metadata.num_values()
-
-    property path_in_schema:
-
-        def __get__(self):
-            path = self.metadata.path_in_schema().get().ToDotString()
-            return frombytes(path)
-
-    property is_stats_set:
-
-        def __get__(self):
-            return self.metadata.is_stats_set()
-
-    property statistics:
-
-        def __get__(self):
-            if not self.metadata.is_stats_set():
-                return None
-            statistics = RowGroupStatistics()
-            statistics.init(self.metadata.statistics())
-            return statistics
-
-    property compression:
-
-        def __get__(self):
-            return self.metadata.compression()
+    @property
+    def file_offset(self):
+        return self.metadata.file_offset()
 
-    property encodings:
+    @property
+    def file_path(self):
+        return frombytes(self.metadata.file_path())
 
-        def __get__(self):
-            return map(encoding_name_from_enum,
-                       self.metadata.encodings())
+    @property
+    def type(self):
+        # XXX: shouldn't this be called pshysical_type like in
+        # RowGroupStatistics?
+        return physical_type_name_from_enum(self.metadata.type())
 
-    property has_dictionary_page:
+    @property
+    def num_values(self):
+        return self.metadata.num_values()
 
-        def __get__(self):
-            return self.metadata.has_dictionary_page()
+    @property
+    def path_in_schema(self):
+        path = self.metadata.path_in_schema().get().ToDotString()
+        return frombytes(path)
 
-    property dictionary_page_offset:
+    @property
+    def is_stats_set(self):
+        return self.metadata.is_stats_set()
 
-        def __get__(self):
-            return self.metadata.dictionary_page_offset()
+    @property
+    def statistics(self):
+        if not self.metadata.is_stats_set():
+            return None
+        statistics = RowGroupStatistics()
+        statistics.init(self.metadata.statistics())
+        return statistics
 
-    property data_page_offset:
+    @property
+    def compression(self):
+        return self.metadata.compression()
 
-        def __get__(self):
-            return self.metadata.data_page_offset()
+    @property
+    def encodings(self):
+        return map(encoding_name_from_enum, self.metadata.encodings())
 
-    property index_page_offset:
+    @property
+    def has_dictionary_page(self):
+        return self.metadata.has_dictionary_page()
 
-        def __get__(self):
-            return self.metadata.index_page_offset()
+    @property
+    def dictionary_page_offset(self):
+        return self.metadata.dictionary_page_offset()
 
-    property total_compressed_size:
+    @property
+    def data_page_offset(self):
+        return self.metadata.data_page_offset()
 
-        def __get__(self):
-            return self.metadata.total_compressed_size()
+    @property
+    def index_page_offset(self):
+        return self.metadata.index_page_offset()
 
-    property total_uncompressed_size:
+    @property
+    def total_compressed_size(self):
+        return self.metadata.total_compressed_size()
 
-        def __get__(self):
-            return self.metadata.total_uncompressed_size()
+    @property
+    def total_uncompressed_size(self):
+        return self.metadata.total_uncompressed_size()
 
 
 cdef class RowGroupMetaData:
@@ -299,20 +273,17 @@ cdef class RowGroupMetaData:
                                  self.num_rows,
                                  self.total_byte_size)
 
-    property num_columns:
-
-        def __get__(self):
-            return self.metadata.num_columns()
-
-    property num_rows:
-
-        def __get__(self):
-            return self.metadata.num_rows()
+    @property
+    def num_columns(self):
+        return self.metadata.num_columns()
 
-    property total_byte_size:
+    @property
+    def num_rows(self):
+        return self.metadata.num_rows()
 
-        def __get__(self):
-            return self.metadata.total_byte_size()
+    @property
+    def total_byte_size(self):
+        return self.metadata.total_byte_size()
 
 
 cdef class FileMetaData:
@@ -351,43 +322,37 @@ cdef class FileMetaData:
         self._schema = schema
         return schema
 
-    property serialized_size:
-
-        def __get__(self):
-            return self._metadata.size()
-
-    property num_columns:
-
-        def __get__(self):
-            return self._metadata.num_columns()
-
-    property num_rows:
-
-        def __get__(self):
-            return self._metadata.num_rows()
-
-    property num_row_groups:
+    @property
+    def serialized_size(self):
+        return self._metadata.size()
 
-        def __get__(self):
-            return self._metadata.num_row_groups()
+    @property
+    def num_columns(self):
+        return self._metadata.num_columns()
 
-    property format_version:
+    @property
+    def num_rows(self):
+        return self._metadata.num_rows()
 
-        def __get__(self):
-            cdef ParquetVersion version = self._metadata.version()
-            if version == ParquetVersion_V1:
-                return '1.0'
-            if version == ParquetVersion_V2:
-                return '2.0'
-            else:
-                print('Unrecognized file version, assuming 1.0: {0}'
-                      .format(version))
-                return '1.0'
+    @property
+    def num_row_groups(self):
+        return self._metadata.num_row_groups()
 
-    property created_by:
+    @property
+    def format_version(self):
+        cdef ParquetVersion version = self._metadata.version()
+        if version == ParquetVersion_V1:
+            return '1.0'
+        if version == ParquetVersion_V2:
+            return '2.0'
+        else:
+            # TODO(kszucs) warn instead of print
+            print('Unrecognized file version, assuming 1.0: {}'.format(version))
+            return '1.0'
 
-        def __get__(self):
-            return frombytes(self._metadata.created_by())
+    @property
+    def created_by(self):
+        return frombytes(self._metadata.created_by())
 
     def row_group(self, int i):
         """
@@ -444,10 +409,9 @@ cdef class ParquetSchema:
     def __getitem__(self, i):
         return self.column(i)
 
-    property names:
-
-        def __get__(self):
-            return [self[i].name for i in range(len(self))]
+    @property
+    def names(self):
+        return [self[i].name for i in range(len(self))]
 
     def to_arrow_schema(self):
         """
@@ -457,8 +421,7 @@ cdef class ParquetSchema:
         -------
         schema : pyarrow.Schema
         """
-        cdef:
-            shared_ptr[CSchema] sp_arrow_schema
+        cdef shared_ptr[CSchema] sp_arrow_schema
 
         with nogil:
             check_status(FromParquetSchema(
@@ -467,6 +430,8 @@ cdef class ParquetSchema:
 
         return pyarrow_wrap_schema(sp_arrow_schema)
 
+    # TODO(kszucs): impl __eq__
+
     def equals(self, ParquetSchema other):
         """
         Returns True if the Parquet schemas are equal
@@ -477,9 +442,7 @@ cdef class ParquetSchema:
         if i < 0 or i >= len(self):
             raise IndexError('{0} out of bounds'.format(i))
 
-        cdef ColumnSchema col = ColumnSchema()
-        col.init_from_schema(self, i)
-        return col
+        return ColumnSchema(self, i)
 
 
 cdef class ColumnSchema:
@@ -487,13 +450,12 @@ cdef class ColumnSchema:
         ParquetSchema parent
         const ColumnDescriptor* descr
 
-    def __cinit__(self):
-        self.descr = NULL
-
-    cdef init_from_schema(self, ParquetSchema schema, int i):
-        self.parent = schema
+    def __cinit__(self, ParquetSchema schema, int i):
+        self.parent = schema  # XXX why is it needed? ownership?
         self.descr = schema.schema.Column(i)
 
+    # TODO(kszucs) impl __eq__
+
     def equals(self, ColumnSchema other):
         """
         Returns True if the column schemas are equal
@@ -520,52 +482,43 @@ cdef class ColumnSchema:
                               self.max_repetition_level, physical_type,
                               logical_type)
 
-    property name:
-
-        def __get__(self):
-            return frombytes(self.descr.name())
-
-    property path:
-
-        def __get__(self):
-            return frombytes(self.descr.path().get().ToDotString())
-
-    property max_definition_level:
-
-        def __get__(self):
-            return self.descr.max_definition_level()
-
-    property max_repetition_level:
+    @property
+    def name(self):
+        return frombytes(self.descr.name())
 
-        def __get__(self):
-            return self.descr.max_repetition_level()
+    @property
+    def path(self):
+        return frombytes(self.descr.path().get().ToDotString())
 
-    property physical_type:
+    @property
+    def max_definition_level(self):
+        return self.descr.max_definition_level()
 
-        def __get__(self):
-            return physical_type_name_from_enum(self.descr.physical_type())
+    @property
+    def max_repetition_level(self):
+        return self.descr.max_repetition_level()
 
-    property logical_type:
+    @property
+    def physical_type(self):
+        return physical_type_name_from_enum(self.descr.physical_type())
 
-        def __get__(self):
-            return logical_type_name_from_enum(self.descr.logical_type())
+    @property
+    def logical_type(self):
+        return logical_type_name_from_enum(self.descr.logical_type())
 
     # FIXED_LEN_BYTE_ARRAY attribute
-    property length:
-
-        def __get__(self):
-            return self.descr.type_length()
+    @property
+    def length(self):
+        return self.descr.type_length()
 
     # Decimal attributes
-    property precision:
-
-        def __get__(self):
-            return self.descr.type_precision()
-
-    property scale:
+    @property
+    def precision(self):
+        return self.descr.type_precision()
 
-        def __get__(self):
-            return self.descr.type_scale()
+    @property
+    def scale(self):
+        return self.descr.type_scale()
 
 
 cdef physical_type_name_from_enum(ParquetType type_):
@@ -654,22 +607,21 @@ cdef class ParquetReader:
             check_status(OpenFile(rd_handle, self.allocator, properties,
                                   c_metadata, &self.reader))
 
-    property column_paths:
-
-        def __get__(self):
-            cdef:
-                FileMetaData container = self.metadata
-                const CFileMetaData* metadata = container._metadata
-                vector[c_string] path
-                int i = 0
+    @property
+    def column_paths(self):
+        cdef:
+            FileMetaData container = self.metadata
+            const CFileMetaData* metadata = container._metadata
+            vector[c_string] path
+            int i = 0
 
-            paths = []
-            for i in range(0, metadata.num_columns()):
-                path = (metadata.schema().Column(i)
-                        .path().get().ToDotVector())
-                paths.append([frombytes(x) for x in path])
+        paths = []
+        for i in range(0, metadata.num_columns()):
+            path = (metadata.schema().Column(i)
+                    .path().get().ToDotVector())
+            paths.append([frombytes(x) for x in path])
 
-            return paths
+        return paths
 
     @property
     def metadata(self):
@@ -686,10 +638,9 @@ cdef class ParquetReader:
         result.init(metadata)
         return result
 
-    property num_row_groups:
-
-        def __get__(self):
-            return self.reader.get().num_row_groups()
+    @property
+    def num_row_groups(self):
+        return self.reader.get().num_row_groups()
 
     def set_num_threads(self, int nthreads):
         self.reader.get().set_num_threads(nthreads)
@@ -809,26 +760,27 @@ cdef class ParquetReader:
         array.init(carray)
         return array
 
+
 cdef int check_compression_name(name) except -1:
-    if name.upper() not in ['NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4',
-                            'ZSTD']:
+    if name.upper() not in {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4',
+                            'ZSTD'}:
         raise ArrowException("Unsupported compression: " + name)
     return 0
 
 
 cdef ParquetCompression compression_from_name(str name):
     name = name.upper()
-    if name == "SNAPPY":
+    if name == 'SNAPPY':
         return ParquetCompression_SNAPPY
-    elif name == "GZIP":
+    elif name == 'GZIP':
         return ParquetCompression_GZIP
-    elif name == "LZO":
+    elif name == 'LZO':
         return ParquetCompression_LZO
-    elif name == "BROTLI":
+    elif name == 'BROTLI':
         return ParquetCompression_BROTLI
-    elif name == "LZ4":
+    elif name == 'LZ4':
         return ParquetCompression_LZ4
-    elif name == "ZSTD":
+    elif name == 'ZSTD':
         return ParquetCompression_ZSTD
     else:
         return ParquetCompression_UNCOMPRESSED
diff --git a/python/pyarrow/formatting.py b/python/pyarrow/formatting.py
index eea3e74d881f..5ef9482ed144 100644
--- a/python/pyarrow/formatting.py
+++ b/python/pyarrow/formatting.py
@@ -20,6 +20,12 @@
 import pyarrow.lib as lib
 import warnings
 
+try:
+    from textwrap import indent
+except ImportError:
+    def indent(text, prefix):
+        return ''.join(prefix + line for line in text.splitlines(True))
+
 
 def array_format(arr, window=10):
     warnings.warn("array_format is deprecated, use Array.format() instead",
@@ -32,13 +38,6 @@ def value_format(x, indent_level=0):
                   FutureWarning)
     if isinstance(x, lib.ListValue):
         contents = ',\n'.join(value_format(item) for item in x)
-        return '[{0}]'.format(_indent(contents, 1).strip())
+        return '[{0}]'.format(indent(contents, ' ').strip())
     else:
         return repr(x)
-
-
-def _indent(text, spaces):
-    if spaces == 0:
-        return text
-    block = ' ' * spaces
-    return '\n'.join(block + x for x in text.split('\n'))
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index c9c1a96db471..b77ad34ae0a8 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -564,31 +564,42 @@ def test_parquet_metadata_api():
 
 
 @pytest.mark.parametrize(
-    'data, dtype, min_value, max_value, null_count, num_values',
+    (
+        'data',
+        'dtype',
+        'physical_type',
+        'min_value',
+        'max_value',
+        'null_count',
+        'num_values',
+        'distinct_count'
+    ),
     [
-        ([1, 2, 2, None, 4], np.uint8, 1, 4, 1, 4),
-        ([1, 2, 2, None, 4], np.uint16, 1, 4, 1, 4),
-        ([1, 2, 2, None, 4], np.uint32, 1, 4, 1, 4),
-        ([1, 2, 2, None, 4], np.uint64, 1, 4, 1, 4),
-        ([-1, 2, 2, None, 4], np.int16, -1, 4, 1, 4),
-        ([-1, 2, 2, None, 4], np.int32, -1, 4, 1, 4),
-        ([-1, 2, 2, None, 4], np.int64, -1, 4, 1, 4),
-        ([-1.1, 2.2, 2.3, None, 4.4], np.float32, -1.1, 4.4, 1, 4),
-        ([-1.1, 2.2, 2.3, None, 4.4], np.float64, -1.1, 4.4, 1, 4),
+        ([1, 2, 2, None, 4], np.uint8, 'INT64', 1, 4, 1, 4, 0),
+        ([1, 2, 2, None, 4], np.uint16, 'INT64', 1, 4, 1, 4, 0),
+        ([1, 2, 2, None, 4], np.uint32, 'INT64', 1, 4, 1, 4, 0),
+        ([1, 2, 2, None, 4], np.uint64, 'INT64', 1, 4, 1, 4, 0),
+        ([-1, 2, 2, None, 4], np.int16, 'INT64', -1, 4, 1, 4, 0),
+        ([-1, 2, 2, None, 4], np.int32, 'INT64', -1, 4, 1, 4, 0),
+        ([-1, 2, 2, None, 4], np.int64, 'INT64', -1, 4, 1, 4, 0),
+        ([-1.1, 2.2, 2.3, None, 4.4], np.float32, 'FLOAT', -1.1, 4.4, 1, 4, 0),
+        (
+            [-1.1, 2.2, 2.3, None, 4.4],
+            np.float64, 'DOUBLE', -1.1, 4.4, 1, 4, 0
+        ),
         (
             [u'', u'b', unichar(1000), None, u'aaa'],
-            object, b'', unichar(1000).encode('utf-8'), 1, 4
+            object, 'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, 0
+        ),
+        (
+            [True, False, False, True, True],
+            np.bool, 'BOOLEAN', False, True, 0, 5, 0
         ),
-        ([True, False, False, True, True], np.bool, False, True, 0, 5),
     ]
 )
-def test_parquet_column_statistics_api(
-        data,
-        dtype,
-        min_value,
-        max_value,
-        null_count,
-        num_values):
+def test_parquet_column_statistics_api(data, dtype, physical_type, min_value,
+                                       max_value, null_count, num_values,
+                                       distinct_count):
     df = pd.DataFrame({'data': data}, dtype=dtype)
 
     fileh = make_sample_file(df)
@@ -599,10 +610,13 @@ def test_parquet_column_statistics_api(
     col_meta = rg_meta.column(0)
 
     stat = col_meta.statistics
+    assert stat.has_min_max
     assert stat.min == min_value
     assert stat.max == max_value
     assert stat.null_count == null_count
     assert stat.num_values == num_values
+    assert stat.distinct_count == distinct_count
+    assert stat.physical_type == physical_type
 
 
 def test_compare_schemas():

From 06e4f8ed370ddf7db49dfac7d978e0927306d791 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Thu, 28 Jun 2018 17:07:49 +0200
Subject: [PATCH 02/16] remove enum cpdef

---
 python/pyarrow/_parquet.pxd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 7fdd66bb7ae8..ca20ce21b78c 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -41,7 +41,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil:
 
 
 cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
-    cpdef enum ParquetType" parquet::Type::type":
+    enum ParquetType" parquet::Type::type":
         ParquetType_BOOLEAN" parquet::Type::BOOLEAN"
         ParquetType_INT32" parquet::Type::INT32"
         ParquetType_INT64" parquet::Type::INT64"

From c58441a45c9c146c4786f7ae7bed0d0259342d3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Thu, 28 Jun 2018 17:21:39 +0200
Subject: [PATCH 03/16] due to receiving extension class as argument it's
 possible to use __cinit__ constructors directly instead of custom init
 methods

---
 python/pyarrow/_parquet.pyx | 56 +++++++++++++------------------------
 1 file changed, 20 insertions(+), 36 deletions(-)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 65deb730a05a..773dd541d9c6 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -249,10 +249,7 @@ cdef class RowGroupMetaData:
         CRowGroupMetaData* metadata
         FileMetaData parent
 
-    def __cinit__(self):
-        pass
-
-    cdef void init_from_file(self, FileMetaData parent, int i):
+    def __cinit__(self, FileMetaData parent, int i):
         if i < 0 or i >= parent.num_row_groups:
             raise IndexError('{0} out of bounds'.format(i))
         self.up_metadata = parent._metadata.RowGroup(i)
@@ -314,13 +311,9 @@ cdef class FileMetaData:
 
     @property
     def schema(self):
-        if self._schema is not None:
-            return self._schema
-
-        cdef ParquetSchema schema = ParquetSchema()
-        schema.init_from_filemeta(self)
-        self._schema = schema
-        return schema
+        if self._schema is None:
+            self._schema = ParquetSchema(self)
+        return self._schema
 
     @property
     def serialized_size(self):
@@ -354,26 +347,20 @@ cdef class FileMetaData:
     def created_by(self):
         return frombytes(self._metadata.created_by())
 
-    def row_group(self, int i):
-        """
-
-        """
-        cdef RowGroupMetaData result = RowGroupMetaData()
-        result.init_from_file(self, i)
-        return result
-
-    property metadata:
+    @property
+    def metadata(self):
+        cdef:
+            unordered_map[c_string, c_string] metadata
+            const CKeyValueMetadata* underlying_metadata
+        underlying_metadata = self._metadata.key_value_metadata().get()
+        if underlying_metadata != NULL:
+            underlying_metadata.ToUnorderedMap(&metadata)
+            return metadata
+        else:
+            return None
 
-        def __get__(self):
-            cdef:
-                unordered_map[c_string, c_string] metadata
-                const CKeyValueMetadata* underlying_metadata
-            underlying_metadata = self._metadata.key_value_metadata().get()
-            if underlying_metadata != NULL:
-                underlying_metadata.ToUnorderedMap(&metadata)
-                return metadata
-            else:
-                return None
+    def row_group(self, int i):
+        return RowGroupMetaData(self, i)
 
 
 cdef class ParquetSchema:
@@ -381,8 +368,9 @@ cdef class ParquetSchema:
         FileMetaData parent  # the FileMetaData owning the SchemaDescriptor
         const SchemaDescriptor* schema
 
-    def __cinit__(self):
-        self.schema = NULL
+    def __cinit__(self, FileMetaData container):
+        self.parent = container
+        self.schema = container._metadata.schema()
 
     def __repr__(self):
         cdef const ColumnDescriptor* descr
@@ -399,10 +387,6 @@ cdef class ParquetSchema:
 {1}
  """.format(object.__repr__(self), '\n'.join(elements))
 
-    cdef init_from_filemeta(self, FileMetaData container):
-        self.parent = container
-        self.schema = container._metadata.schema()
-
     def __len__(self):
         return self.schema.num_columns()
 

From 266a022b0f7f5b425edc1530112d6ce3e4a287b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Thu, 28 Jun 2018 18:20:27 +0200
Subject: [PATCH 04/16] implement equality operator on ParquetSchema and
 ColumnSchema

---
 python/pyarrow/_parquet.pyx          | 12 ++++++++++--
 python/pyarrow/tests/test_parquet.py | 10 +++++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 773dd541d9c6..1f60014bd728 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -414,7 +414,11 @@ cdef class ParquetSchema:
 
         return pyarrow_wrap_schema(sp_arrow_schema)
 
-    # TODO(kszucs): impl __eq__
+    def __eq__(self, other):
+        try:
+            return self.equals(other)
+        except TypeError:
+            return False
 
     def equals(self, ParquetSchema other):
         """
@@ -438,7 +442,11 @@ cdef class ColumnSchema:
         self.parent = schema  # XXX why is it needed? ownership?
         self.descr = schema.schema.Column(i)
 
-    # TODO(kszucs) impl __eq__
+    def __eq__(self, other):
+        try:
+            return self.equals(other)
+        except TypeError:
+            return False
 
     def equals(self, ColumnSchema other):
         """
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index b77ad34ae0a8..8b2e11bc7536 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -527,6 +527,8 @@ def test_parquet_metadata_api():
     assert meta.num_row_groups == 1
     assert meta.format_version == '2.0'
     assert 'parquet-cpp' in meta.created_by
+    assert isinstance(meta.serialized_size, int)
+    assert isinstance(meta.metadata, dict)
 
     # Schema
     schema = fileh.schema
@@ -627,12 +629,18 @@ def test_compare_schemas():
     fileh3 = make_sample_file(df[df.columns[::2]])
 
     assert fileh.schema.equals(fileh.schema)
+    assert fileh.schema == fileh.schema
     assert fileh.schema.equals(fileh2.schema)
-
+    assert fileh.schema == fileh2.schema
+    assert fileh.schema != 'arbitrary object'
     assert not fileh.schema.equals(fileh3.schema)
+    assert fileh.schema != fileh3.schema
 
     assert fileh.schema[0].equals(fileh.schema[0])
+    assert fileh.schema[0] == fileh.schema[0]
     assert not fileh.schema[0].equals(fileh.schema[1])
+    assert fileh.schema[0] != fileh.schema[1]
+    assert fileh.schema[0] != 'arbitrary object'
 
 
 def test_column_of_arrays(tmpdir):

From a43153e960288d67ba44d127fae1f25972f11534 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Thu, 28 Jun 2018 19:28:32 +0200
Subject: [PATCH 05/16] warn instead of print

---
 python/pyarrow/_parquet.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 1f60014bd728..369023cc46a0 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -339,8 +339,8 @@ cdef class FileMetaData:
         if version == ParquetVersion_V2:
             return '2.0'
         else:
-            # TODO(kszucs) warn instead of print
-            print('Unrecognized file version, assuming 1.0: {}'.format(version))
+            warnings.warn('Unrecognized file version, assuming 1.0: {}'
+                          .format(version))
             return '1.0'
 
     @property

From dd362295a80c0542be23c34c287051f036547dbf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Fri, 29 Jun 2018 08:44:46 +0200
Subject: [PATCH 06/16] rename type property to physical_type

---
 python/pyarrow/_parquet.pyx | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 369023cc46a0..cebcd0d3bffe 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -184,9 +184,7 @@ cdef class ColumnChunkMetaData:
         return frombytes(self.metadata.file_path())
 
     @property
-    def type(self):
-        # XXX: shouldn't this be called pshysical_type like in
-        # RowGroupStatistics?
+    def physical_type(self):
         return physical_type_name_from_enum(self.metadata.type())
 
     @property

From d55b96a8131bc51b17525dac8a9364d0a9899ee1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Wed, 4 Jul 2018 12:29:29 +0200
Subject: [PATCH 07/16] fix types

---
 python/pyarrow/_parquet.pyx          |  4 +--
 python/pyarrow/tests/test_parquet.py | 50 ++++++++++++++++------------
 2 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index cebcd0d3bffe..530f3987066c 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -145,7 +145,7 @@ cdef class ColumnChunkMetaData:
         return """{0}
   file_offset: {1}
   file_path: {2}
-  type: {3}
+  physical_type: {3}
   num_values: {4}
   path_in_schema: {5}
   is_stats_set: {6}
@@ -161,7 +161,7 @@ cdef class ColumnChunkMetaData:
   total_uncompressed_size: {15}""".format(object.__repr__(self),
                                           self.file_offset,
                                           self.file_path,
-                                          self.type,
+                                          self.physical_type,
                                           self.num_values,
                                           self.path_in_schema,
                                           self.is_stats_set,
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 8b2e11bc7536..f7ad30771eb6 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -499,10 +499,13 @@ def test_pandas_parquet_configuration_options(tmpdir):
         tm.assert_frame_equal(df, df_read)
 
 
-def make_sample_file(df):
+def make_sample_file(table_or_df):
     import pyarrow.parquet as pq
 
-    a_table = pa.Table.from_pandas(df)
+    if isinstance(table_or_df, pa.Table):
+        a_table = table_or_df
+    else:
+        a_table = pa.Table.from_pandas(table_or_df)
 
     buf = io.BytesIO()
     _write_table(a_table, buf, compression='SNAPPY', version='2.0',
@@ -568,7 +571,7 @@ def test_parquet_metadata_api():
 @pytest.mark.parametrize(
     (
         'data',
-        'dtype',
+        'type',
         'physical_type',
         'min_value',
         'max_value',
@@ -577,34 +580,39 @@ def test_parquet_metadata_api():
         'distinct_count'
     ),
     [
-        ([1, 2, 2, None, 4], np.uint8, 'INT64', 1, 4, 1, 4, 0),
-        ([1, 2, 2, None, 4], np.uint16, 'INT64', 1, 4, 1, 4, 0),
-        ([1, 2, 2, None, 4], np.uint32, 'INT64', 1, 4, 1, 4, 0),
-        ([1, 2, 2, None, 4], np.uint64, 'INT64', 1, 4, 1, 4, 0),
-        ([-1, 2, 2, None, 4], np.int16, 'INT64', -1, 4, 1, 4, 0),
-        ([-1, 2, 2, None, 4], np.int32, 'INT64', -1, 4, 1, 4, 0),
-        ([-1, 2, 2, None, 4], np.int64, 'INT64', -1, 4, 1, 4, 0),
-        ([-1.1, 2.2, 2.3, None, 4.4], np.float32, 'FLOAT', -1.1, 4.4, 1, 4, 0),
+        ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
+        ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
+        ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
+        ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
+        ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
+        ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
+        ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
+        ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
+        (
+            [-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
+            'FLOAT', -1.1, 4.4, 1, 4, 0
+        ),
         (
-            [-1.1, 2.2, 2.3, None, 4.4],
-            np.float64, 'DOUBLE', -1.1, 4.4, 1, 4, 0
+            [-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
+            'DOUBLE', -1.1, 4.4, 1, 4, 0
         ),
         (
-            [u'', u'b', unichar(1000), None, u'aaa'],
-            object, 'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, 0
+            [u'', u'b', unichar(1000), None, u'aaa'], pa.binary(),
+            'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, 0
         ),
         (
-            [True, False, False, True, True],
-            np.bool, 'BOOLEAN', False, True, 0, 5, 0
+            [True, False, False, True, True], pa.bool_(),
+            'BOOLEAN', False, True, 0, 5, 0
         ),
     ]
 )
-def test_parquet_column_statistics_api(data, dtype, physical_type, min_value,
+def test_parquet_column_statistics_api(data, type, physical_type, min_value,
                                        max_value, null_count, num_values,
                                        distinct_count):
-    df = pd.DataFrame({'data': data}, dtype=dtype)
-
-    fileh = make_sample_file(df)
+    df = pd.DataFrame({'data': data})
+    schema = pa.schema([pa.field('data', type)])
+    table = pa.Table.from_pandas(df, schema=schema)
+    fileh = make_sample_file(table)
 
     meta = fileh.metadata
 

From ffa104136cd845b54dce233f26869610be427a56 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Wed, 4 Jul 2018 13:18:55 +0200
Subject: [PATCH 08/16] expected distinct_count value is None

---
 python/pyarrow/_parquet.pyx          |  2 +-
 python/pyarrow/tests/test_parquet.py | 24 ++++++++++++------------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 530f3987066c..722d2e0226cb 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -437,7 +437,7 @@ cdef class ColumnSchema:
         const ColumnDescriptor* descr
 
     def __cinit__(self, ParquetSchema schema, int i):
-        self.parent = schema  # XXX why is it needed? ownership?
+        self.parent = schema
         self.descr = schema.schema.Column(i)
 
     def __eq__(self, other):
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index f7ad30771eb6..cdc52cf48162 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -580,29 +580,29 @@ def test_parquet_metadata_api():
         'distinct_count'
     ),
     [
-        ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
-        ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
-        ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
-        ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
-        ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
-        ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
-        ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
-        ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
+        ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, None),
+        ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, None),
+        ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, None),
+        ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, None),
+        ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, None),
+        ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, None),
+        ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, None),
+        ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, None),
         (
             [-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
-            'FLOAT', -1.1, 4.4, 1, 4, 0
+            'FLOAT', -1.1, 4.4, 1, 4, None
         ),
         (
             [-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
-            'DOUBLE', -1.1, 4.4, 1, 4, 0
+            'DOUBLE', -1.1, 4.4, 1, 4, None
         ),
         (
             [u'', u'b', unichar(1000), None, u'aaa'], pa.binary(),
-            'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, 0
+            'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, None
         ),
         (
             [True, False, False, True, True], pa.bool_(),
-            'BOOLEAN', False, True, 0, 5, 0
+            'BOOLEAN', False, True, 0, 5, None
         ),
     ]
 )

From 1e1c7cd60ede886ece7d1b200c23a05b6988f77e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Wed, 25 Jul 2018 11:12:21 +0200
Subject: [PATCH 09/16] test and fix column chunk metadata properties

---
 python/pyarrow/_parquet.pyx          | 86 ++++++++++++++++------------
 python/pyarrow/tests/test_parquet.py | 24 ++++++++
 2 files changed, 73 insertions(+), 37 deletions(-)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 722d2e0226cb..cb9f17df472f 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -210,15 +210,15 @@ cdef class ColumnChunkMetaData:
 
     @property
     def compression(self):
-        return self.metadata.compression()
+        return compression_name_from_enum(self.metadata.compression())
 
     @property
     def encodings(self):
-        return map(encoding_name_from_enum, self.metadata.encodings())
+        return tuple(map(encoding_name_from_enum, self.metadata.encodings()))
 
     @property
     def has_dictionary_page(self):
-        return self.metadata.has_dictionary_page()
+        return bool(self.metadata.has_dictionary_page())
 
     @property
     def dictionary_page_offset(self):
@@ -552,19 +552,56 @@ cdef logical_type_name_from_enum(ParquetLogicalType type_):
     }.get(type_, 'UNKNOWN')
 
 
-cdef encoding_name_from_enum (ParquetEncoding encoding_):
+cdef encoding_name_from_enum(ParquetEncoding encoding_):
     return {
-        ParquetEncoding_PLAIN: "PLAIN",
-        ParquetEncoding_PLAIN_DICTIONARY: "PLAIN_DICTIONARY",
-        ParquetEncoding_RLE: "RLE",
-        ParquetEncoding_BIT_PACKED: "BIT_PACKED",
-        ParquetEncoding_DELTA_BINARY_PACKED: "DELTA_BINARY_PACKED",
-        ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: "DELTA_LENGTH_BYTE_ARRAY",
-        ParquetEncoding_DELTA_BYTE_ARRAY: "DELTA_BYTE_ARRAY",
-        ParquetEncoding_RLE_DICTIONARY: "RLE_DICTIONARY",
+        ParquetEncoding_PLAIN: 'PLAIN',
+        ParquetEncoding_PLAIN_DICTIONARY: 'PLAIN_DICTIONARY',
+        ParquetEncoding_RLE: 'RLE',
+        ParquetEncoding_BIT_PACKED: 'BIT_PACKED',
+        ParquetEncoding_DELTA_BINARY_PACKED: 'DELTA_BINARY_PACKED',
+        ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: 'DELTA_LENGTH_BYTE_ARRAY',
+        ParquetEncoding_DELTA_BYTE_ARRAY: 'DELTA_BYTE_ARRAY',
+        ParquetEncoding_RLE_DICTIONARY: 'RLE_DICTIONARY',
     }.get(encoding_, 'UNKNOWN')
 
 
+cdef compression_name_from_enum(ParquetCompression compression_):
+    return {
+        ParquetCompression_UNCOMPRESSED: 'UNCOMPRESSED',
+        ParquetCompression_SNAPPY: 'SNAPPY',
+        ParquetCompression_GZIP: 'GZIP',
+        ParquetCompression_LZO: 'LZO',
+        ParquetCompression_BROTLI: 'BROTLI',
+        ParquetCompression_LZ4: 'LZ4',
+        ParquetCompression_ZSTD: 'ZSTD',
+    }.get(compression_, 'UNKNOWN')
+
+
+cdef int check_compression_name(name) except -1:
+    if name.upper() not in {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4',
+                            'ZSTD'}:
+        raise ArrowException("Unsupported compression: " + name)
+    return 0
+
+
+cdef ParquetCompression compression_from_name(str name):
+    name = name.upper()
+    if name == 'SNAPPY':
+        return ParquetCompression_SNAPPY
+    elif name == 'GZIP':
+        return ParquetCompression_GZIP
+    elif name == 'LZO':
+        return ParquetCompression_LZO
+    elif name == 'BROTLI':
+        return ParquetCompression_BROTLI
+    elif name == 'LZ4':
+        return ParquetCompression_LZ4
+    elif name == 'ZSTD':
+        return ParquetCompression_ZSTD
+    else:
+        return ParquetCompression_UNCOMPRESSED
+
+
 cdef class ParquetReader:
     cdef:
         object source
@@ -751,31 +788,6 @@ cdef class ParquetReader:
         return array
 
 
-cdef int check_compression_name(name) except -1:
-    if name.upper() not in {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4',
-                            'ZSTD'}:
-        raise ArrowException("Unsupported compression: " + name)
-    return 0
-
-
-cdef ParquetCompression compression_from_name(str name):
-    name = name.upper()
-    if name == 'SNAPPY':
-        return ParquetCompression_SNAPPY
-    elif name == 'GZIP':
-        return ParquetCompression_GZIP
-    elif name == 'LZO':
-        return ParquetCompression_LZO
-    elif name == 'BROTLI':
-        return ParquetCompression_BROTLI
-    elif name == 'LZ4':
-        return ParquetCompression_LZ4
-    elif name == 'ZSTD':
-        return ParquetCompression_ZSTD
-    else:
-        return ParquetCompression_UNCOMPRESSED
-
-
 cdef class ParquetWriter:
     cdef:
         unique_ptr[FileWriter] writer
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index cdc52cf48162..a154cac63299 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -516,6 +516,9 @@ def make_sample_file(table_or_df):
 
 
 def test_parquet_metadata_api():
+    import pyarrow.parquet as pq
+    import pyarrow._parquet as _pq
+
     df = alltypes_sample(size=10000)
     df = df.reindex(columns=sorted(df.columns))
 
@@ -558,14 +561,35 @@ def test_parquet_metadata_api():
     # Row group
     for rg in range(meta.num_row_groups):
         rg_meta = meta.row_group(rg)
+        assert isinstance(rg_meta, _pq.RowGroupMetaData)
         repr(rg_meta)
 
         for col in range(rg_meta.num_columns):
             col_meta = rg_meta.column(col)
+            assert isinstance(col_meta, _pq.ColumnChunkMetaData)
             repr(col_meta)
 
+    rg_meta = meta.row_group(0)
     assert rg_meta.num_rows == len(df)
     assert rg_meta.num_columns == ncols + 1  # +1 for index
+    assert rg_meta.total_byte_size > 0
+
+    col_meta = rg_meta.column(0)
+    assert col_meta.file_offset > 0
+    assert col_meta.file_path == ''  # created from BytesIO
+    assert col_meta.physical_type == 'BOOLEAN'
+    assert col_meta.num_values == 10000
+    assert col_meta.path_in_schema == 'bool'
+    assert col_meta.is_stats_set is True
+    assert isinstance(col_meta.statistics, _pq.RowGroupStatistics)
+    assert col_meta.compression == 'SNAPPY'
+    assert col_meta.encodings == ('PLAIN', 'RLE')
+    assert col_meta.has_dictionary_page is False
+    assert col_meta.dictionary_page_offset == 0
+    assert col_meta.data_page_offset > 0
+    assert col_meta.index_page_offset == 0
+    assert col_meta.total_compressed_size > 0
+    assert col_meta.total_uncompressed_size > 0
 
 
 @pytest.mark.parametrize(

From 6942eba11f0837ed142142055400f6c840f7a447 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Wed, 25 Jul 2018 11:19:42 +0200
Subject: [PATCH 10/16] comments in compare schemas

---
 python/pyarrow/tests/test_parquet.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index a154cac63299..9d94ef551d0d 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -654,12 +654,17 @@ def test_parquet_column_statistics_api(data, type, physical_type, min_value,
 
 
 def test_compare_schemas():
+    import pyarrow.parquet as pq
+    import pyarrow._parquet as _pq
+
     df = alltypes_sample(size=10000)
 
     fileh = make_sample_file(df)
     fileh2 = make_sample_file(df)
     fileh3 = make_sample_file(df[df.columns[::2]])
 
+    # ParquetSchema
+    assert isinstance(fileh.schema, pq.ParquetSchema)
     assert fileh.schema.equals(fileh.schema)
     assert fileh.schema == fileh.schema
     assert fileh.schema.equals(fileh2.schema)
@@ -668,6 +673,8 @@ def test_compare_schemas():
     assert not fileh.schema.equals(fileh3.schema)
     assert fileh.schema != fileh3.schema
 
+    # ColumnSchema
+    assert isinstance(fileh.schema[0], _pq.ColumnSchema)
     assert fileh.schema[0].equals(fileh.schema[0])
     assert fileh.schema[0] == fileh.schema[0]
     assert not fileh.schema[0].equals(fileh.schema[1])

From 40ba9651ea63482e285ed0e2b8a0317974acacf0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Thu, 26 Jul 2018 12:52:44 +0200
Subject: [PATCH 11/16] expose missing parquet classes to pq namespace

---
 python/pyarrow/parquet.py            |  7 ++++---
 python/pyarrow/tests/test_parquet.py | 10 ++++------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 9c92737bf4b5..90ecdac0101a 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -33,9 +33,10 @@
 import numpy as np
 
 from pyarrow.filesystem import FileSystem, LocalFileSystem, S3FSWrapper
-from pyarrow._parquet import (ParquetReader, FileMetaData,  # noqa
-                              RowGroupMetaData, ParquetSchema)
-import pyarrow._parquet as _parquet  # noqa
+from pyarrow._parquet import (ParquetReader, FileMetaData, RowGroupMetaData,
+                              ColumnChunkMetaData, ParquetSchema, ColumnSchema,
+                              RowGroupStatistics)  # noqa
+import pyarrow._parquet as _parquet
 import pyarrow.lib as lib
 import pyarrow as pa
 
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 9d94ef551d0d..19bec268609e 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -517,7 +517,6 @@ def make_sample_file(table_or_df):
 
 def test_parquet_metadata_api():
     import pyarrow.parquet as pq
-    import pyarrow._parquet as _pq
 
     df = alltypes_sample(size=10000)
     df = df.reindex(columns=sorted(df.columns))
@@ -561,12 +560,12 @@ def test_parquet_metadata_api():
     # Row group
     for rg in range(meta.num_row_groups):
         rg_meta = meta.row_group(rg)
-        assert isinstance(rg_meta, _pq.RowGroupMetaData)
+        assert isinstance(rg_meta, pq.RowGroupMetaData)
         repr(rg_meta)
 
         for col in range(rg_meta.num_columns):
             col_meta = rg_meta.column(col)
-            assert isinstance(col_meta, _pq.ColumnChunkMetaData)
+            assert isinstance(col_meta, pq.ColumnChunkMetaData)
             repr(col_meta)
 
     rg_meta = meta.row_group(0)
@@ -581,7 +580,7 @@ def test_parquet_metadata_api():
     assert col_meta.num_values == 10000
     assert col_meta.path_in_schema == 'bool'
     assert col_meta.is_stats_set is True
-    assert isinstance(col_meta.statistics, _pq.RowGroupStatistics)
+    assert isinstance(col_meta.statistics, pq.RowGroupStatistics)
     assert col_meta.compression == 'SNAPPY'
     assert col_meta.encodings == ('PLAIN', 'RLE')
     assert col_meta.has_dictionary_page is False
@@ -655,7 +654,6 @@ def test_parquet_column_statistics_api(data, type, physical_type, min_value,
 
 def test_compare_schemas():
     import pyarrow.parquet as pq
-    import pyarrow._parquet as _pq
 
     df = alltypes_sample(size=10000)
 
@@ -674,7 +672,7 @@ def test_compare_schemas():
     assert fileh.schema != fileh3.schema
 
     # ColumnSchema
-    assert isinstance(fileh.schema[0], _pq.ColumnSchema)
+    assert isinstance(fileh.schema[0], pq.ColumnSchema)
     assert fileh.schema[0].equals(fileh.schema[0])
     assert fileh.schema[0] == fileh.schema[0]
     assert not fileh.schema[0].equals(fileh.schema[1])

From 74d53bb9fafbb26ddba25ac31fab30e596885bd4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Thu, 26 Jul 2018 14:14:57 +0200
Subject: [PATCH 12/16] missing distinct_count equals to zero

---
 python/pyarrow/_parquet.pyx          |  5 ++++-
 python/pyarrow/tests/test_parquet.py | 28 +++++++++++++++-------------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index cb9f17df472f..a31752066376 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -222,7 +222,10 @@ cdef class ColumnChunkMetaData:
 
     @property
     def dictionary_page_offset(self):
-        return self.metadata.dictionary_page_offset()
+        if self.has_dictionary_page:
+            return self.metadata.dictionary_page_offset()
+        else:
+            return None
 
     @property
     def data_page_offset(self):
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 19bec268609e..64254789c595 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -584,7 +584,7 @@ def test_parquet_metadata_api():
     assert col_meta.compression == 'SNAPPY'
     assert col_meta.encodings == ('PLAIN', 'RLE')
     assert col_meta.has_dictionary_page is False
-    assert col_meta.dictionary_page_offset == 0
+    assert col_meta.dictionary_page_offset is None
     assert col_meta.data_page_offset > 0
     assert col_meta.index_page_offset == 0
     assert col_meta.total_compressed_size > 0
@@ -603,29 +603,29 @@ def test_parquet_metadata_api():
         'distinct_count'
     ),
     [
-        ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, None),
-        ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, None),
-        ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, None),
-        ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, None),
-        ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, None),
-        ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, None),
-        ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, None),
-        ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, None),
+        ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
+        ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
+        ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
+        ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
+        ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
+        ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
+        ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
+        ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
         (
             [-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
-            'FLOAT', -1.1, 4.4, 1, 4, None
+            'FLOAT', -1.1, 4.4, 1, 4, 0
         ),
         (
             [-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
-            'DOUBLE', -1.1, 4.4, 1, 4, None
+            'DOUBLE', -1.1, 4.4, 1, 4, 0
         ),
         (
             [u'', u'b', unichar(1000), None, u'aaa'], pa.binary(),
-            'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, None
+            'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, 0
         ),
         (
             [True, False, False, True, True], pa.bool_(),
-            'BOOLEAN', False, True, 0, 5, None
+            'BOOLEAN', False, True, 0, 5, 0
         ),
     ]
 )
@@ -648,6 +648,8 @@ def test_parquet_column_statistics_api(data, type, physical_type, min_value,
     assert stat.max == max_value
     assert stat.null_count == null_count
     assert stat.num_values == num_values
+    # TODO(kszucs) until the parquet-cpp API exposes a HasDistinctCount
+    # method, a missing distinct_count is represented as zero instead of None
     assert stat.distinct_count == distinct_count
     assert stat.physical_type == physical_type
 

From 71f5edee85885b037ef3b2ab0615ea5e4890e49f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Thu, 26 Jul 2018 14:47:55 +0200
Subject: [PATCH 13/16] flake8

---
 python/pyarrow/parquet.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 90ecdac0101a..2c1aef0e88a3 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -33,9 +33,10 @@
 import numpy as np
 
 from pyarrow.filesystem import FileSystem, LocalFileSystem, S3FSWrapper
-from pyarrow._parquet import (ParquetReader, FileMetaData, RowGroupMetaData,
-                              ColumnChunkMetaData, ParquetSchema, ColumnSchema,
-                              RowGroupStatistics)  # noqa
+from pyarrow._parquet import (ParquetReader, RowGroupStatistics,  # noqa
+                              FileMetaData, RowGroupMetaData,
+                              ColumnChunkMetaData,
+                              ParquetSchema, ColumnSchema)
 import pyarrow._parquet as _parquet
 import pyarrow.lib as lib
 import pyarrow as pa

From b1e7bede03038b4bc64ae120e530d5e3c749b2d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Thu, 26 Jul 2018 16:48:45 +0200
Subject: [PATCH 14/16] raise NotImplementedError for index_page_offset

---
 python/pyarrow/_parquet.pyx          | 12 +++--
 python/pyarrow/tests/test_parquet.py |  5 +-
 python/run_test.sh                   | 70 ++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 6 deletions(-)
 create mode 100755 python/run_test.sh

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index a31752066376..c7d0908836f8 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -156,9 +156,8 @@ cdef class ColumnChunkMetaData:
   has_dictionary_page: {10}
   dictionary_page_offset: {11}
   data_page_offset: {12}
-  index_page_offset: {13}
-  total_compressed_size: {14}
-  total_uncompressed_size: {15}""".format(object.__repr__(self),
+  total_compressed_size: {13}
+  total_uncompressed_size: {14}""".format(object.__repr__(self),
                                           self.file_offset,
                                           self.file_path,
                                           self.physical_type,
@@ -171,7 +170,6 @@ cdef class ColumnChunkMetaData:
                                           self.has_dictionary_page,
                                           self.dictionary_page_offset,
                                           self.data_page_offset,
-                                          self.index_page_offset,
                                           self.total_compressed_size,
                                           self.total_uncompressed_size)
 
@@ -231,9 +229,13 @@ cdef class ColumnChunkMetaData:
     def data_page_offset(self):
         return self.metadata.data_page_offset()
 
+    @property
+    def has_index_page(self):
+        raise NotImplementedError('not supported in parquet-cpp')
+
     @property
     def index_page_offset(self):
-        return self.metadata.index_page_offset()
+        raise NotImplementedError("parquet-cpp doesn't return valid values")
 
     @property
     def total_compressed_size(self):
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 64254789c595..cc86ef16e08a 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -586,9 +586,12 @@ def test_parquet_metadata_api():
     assert col_meta.has_dictionary_page is False
     assert col_meta.dictionary_page_offset is None
     assert col_meta.data_page_offset > 0
-    assert col_meta.index_page_offset == 0
     assert col_meta.total_compressed_size > 0
     assert col_meta.total_uncompressed_size > 0
+    with pytest.raises(NotImplementedError):
+        col_meta.has_index_page
+    with pytest.raises(NotImplementedError):
+        col_meta.index_page_offset
 
 
 @pytest.mark.parametrize(
diff --git a/python/run_test.sh b/python/run_test.sh
new file mode 100755
index 000000000000..49113f87683e
--- /dev/null
+++ b/python/run_test.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+export ARROW_BUILD_TYPE=debug
+export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX
+export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX
+export ARROW_HOME=$CONDA_PREFIX
+export PARQUET_HOME=$CONDA_PREFIX
+# export PYARROW_CMAKE_GENERATOR=Ninja
+
+pushd ../cpp/build
+
+cmake -GNinja \
+      -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \
+      -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
+      -DARROW_ORC=off \
+      -DARROW_PYTHON=ON \
+      -DARROW_PLASMA=ON \
+      -DARROW_BUILD_TESTS=ON \
+      -DARROW_EXTRA_ERROR_CONTEXT=ON \
+      -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \
+      ..
+ninja
+ninja install
+
+popd
+
+pushd ../../parquet-cpp/build
+
+cmake -GNinja \
+      -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \
+      -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \
+      -DPARQUET_BUILD_BENCHMARKS=OFF \
+      -DPARQUET_BUILD_EXECUTABLES=ON \
+      -DPARQUET_BUILD_TESTS=ON \
+      -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \
+      ..
+
+ninja
+ninja install
+
+
+popd
+
+
+export PYARROW_BUILD_TYPE=$ARROW_BUILD_TYPE
+export PYARROW_WITH_PARQUET=1
+export PYARROW_WITH_PLASMA=1
+export PYARROW_WITH_ORC=0
+
+python setup.py build_ext -q --inplace
+
+py.test -sv "$@"

From 77c59d660c34b04cc83deeb963571c1234464c09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Thu, 26 Jul 2018 16:52:22 +0200
Subject: [PATCH 15/16] return NotImplemented from equality check

---
 python/pyarrow/_parquet.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index c7d0908836f8..1aa21244114e 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -421,7 +421,7 @@ cdef class ParquetSchema:
         try:
             return self.equals(other)
         except TypeError:
-            return False
+            return NotImplemented
 
     def equals(self, ParquetSchema other):
         """
@@ -449,7 +449,7 @@ cdef class ColumnSchema:
         try:
             return self.equals(other)
         except TypeError:
-            return False
+            return NotImplemented
 
     def equals(self, ColumnSchema other):
         """

From d6a7f7794dbe5f179cb0e19a10bf66e465c79193 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= <szucs.krisztian@gmail.com>
Date: Thu, 26 Jul 2018 16:53:03 +0200
Subject: [PATCH 16/16] remove accidentally committed test helper script

---
 python/run_test.sh | 70 ----------------------------------------------
 1 file changed, 70 deletions(-)
 delete mode 100755 python/run_test.sh

diff --git a/python/run_test.sh b/python/run_test.sh
deleted file mode 100755
index 49113f87683e..000000000000
--- a/python/run_test.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-
-export ARROW_BUILD_TYPE=debug
-export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX
-export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX
-export ARROW_HOME=$CONDA_PREFIX
-export PARQUET_HOME=$CONDA_PREFIX
-# export PYARROW_CMAKE_GENERATOR=Ninja
-
-pushd ../cpp/build
-
-cmake -GNinja \
-      -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \
-      -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
-      -DARROW_ORC=off \
-      -DARROW_PYTHON=ON \
-      -DARROW_PLASMA=ON \
-      -DARROW_BUILD_TESTS=ON \
-      -DARROW_EXTRA_ERROR_CONTEXT=ON \
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \
-      ..
-ninja
-ninja install
-
-popd
-
-pushd ../../parquet-cpp/build
-
-cmake -GNinja \
-      -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \
-      -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \
-      -DPARQUET_BUILD_BENCHMARKS=OFF \
-      -DPARQUET_BUILD_EXECUTABLES=ON \
-      -DPARQUET_BUILD_TESTS=ON \
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \
-      ..
-
-ninja
-ninja install
-
-
-popd
-
-
-export PYARROW_BUILD_TYPE=$ARROW_BUILD_TYPE
-export PYARROW_WITH_PARQUET=1
-export PYARROW_WITH_PLASMA=1
-export PYARROW_WITH_ORC=0
-
-python setup.py build_ext -q --inplace
-
-py.test -sv "$@"