ARROW-1120: Support for writing timestamp(ns) to Int96
cc @c-nichols

Author: Uwe L. Korn <uwelk@xhochy.com>
Author: Colin Nichols <nichols01@gmail.com>

Closes #865 from xhochy/ARROW-1120 and squashes the following commits:

ff70832 [Uwe L. Korn] Use integer division
99f825d [Uwe L. Korn] Add flag for timestamp[ns] roundtrips
7c28835 [Colin Nichols] ARROW-1120 Support for writing timestamp(ns) to Int96
xhochy authored and wesm committed Jul 18, 2017
1 parent 362e754 commit c5a89b7
Showing 4 changed files with 60 additions and 10 deletions.
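In short, the commit threads a new use_deprecated_int96_timestamps flag from the public pyarrow.parquet API down to parquet-cpp, so that Arrow timestamp[ns] data can be written with the legacy INT96 physical type rather than being downcast on write. A minimal usage sketch; the DataFrame and output path below are illustrative, not taken from the commit:

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    # pandas datetime64[ns] columns arrive in Arrow as timestamp[ns].
    df = pd.DataFrame({'ts': pd.to_datetime(['2001-01-01 00:00:00.000000001',
                                             '2001-01-01 00:00:00.000000002'])})
    table = pa.Table.from_pandas(df)

    # Opt in to the deprecated INT96 encoding for nanosecond timestamps.
    pq.write_table(table, '/tmp/int96_example.parquet',
                   use_deprecated_int96_timestamps=True)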
9 changes: 9 additions & 0 deletions python/pyarrow/_parquet.pxd
@@ -247,8 +247,17 @@ cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil:
         CStatus Open(const CSchema& schema, CMemoryPool* pool,
                      const shared_ptr[OutputStream]& sink,
                      const shared_ptr[WriterProperties]& properties,
+                     const shared_ptr[ArrowWriterProperties]& arrow_properties,
                      unique_ptr[FileWriter]* writer)
 
         CStatus WriteTable(const CTable& table, int64_t chunk_size)
         CStatus NewRowGroup(int64_t chunk_size)
         CStatus Close()
+
+    cdef cppclass ArrowWriterProperties:
+        cppclass Builder:
+            Builder()
+            Builder* disable_deprecated_int96_timestamps()
+            Builder* enable_deprecated_int96_timestamps()
+            shared_ptr[ArrowWriterProperties] build()
+        c_bool support_deprecated_int96_timestamps()
17 changes: 15 additions & 2 deletions python/pyarrow/_parquet.pyx
@@ -545,13 +545,14 @@ cdef class ParquetWriter:
 
     cdef readonly:
         object use_dictionary
+        object use_deprecated_int96_timestamps
         object compression
         object version
         int row_group_size
 
     def __cinit__(self, where, Schema schema, use_dictionary=None,
                   compression=None, version=None,
-                  MemoryPool memory_pool=None):
+                  MemoryPool memory_pool=None, use_deprecated_int96_timestamps=False):
         cdef:
             shared_ptr[FileOutputStream] filestream
             shared_ptr[OutputStream] sink
@@ -566,17 +567,29 @@ cdef class ParquetWriter:
         self.use_dictionary = use_dictionary
         self.compression = compression
         self.version = version
+        self.use_deprecated_int96_timestamps = use_deprecated_int96_timestamps
 
         cdef WriterProperties.Builder properties_builder
         self._set_version(&properties_builder)
         self._set_compression_props(&properties_builder)
         self._set_dictionary_props(&properties_builder)
         properties = properties_builder.build()
 
+        cdef ArrowWriterProperties.Builder arrow_properties_builder
+        self._set_int96_support(&arrow_properties_builder)
+        arrow_properties = arrow_properties_builder.build()
+
         check_status(
             FileWriter.Open(deref(schema.schema),
                             maybe_unbox_memory_pool(memory_pool),
-                            sink, properties, &self.writer))
+                            sink, properties, arrow_properties,
+                            &self.writer))
 
+    cdef void _set_int96_support(self, ArrowWriterProperties.Builder* props):
+        if self.use_deprecated_int96_timestamps:
+            props.enable_deprecated_int96_timestamps()
+        else:
+            props.disable_deprecated_int96_timestamps()
+
     cdef void _set_version(self, WriterProperties.Builder* props):
         if self.version is not None:
11 changes: 7 additions & 4 deletions python/pyarrow/parquet.py
@@ -743,7 +743,8 @@ def read_pandas(source, columns=None, nthreads=1, metadata=None):
 
 
 def write_table(table, where, row_group_size=None, version='1.0',
-                use_dictionary=True, compression='snappy', **kwargs):
+                use_dictionary=True, compression='snappy',
+                use_deprecated_int96_timestamps=False, **kwargs):
     """
     Write a Table to Parquet format
 
@@ -766,12 +767,13 @@ def write_table(table, where, row_group_size=None, version='1.0',
     writer = ParquetWriter(where, table.schema,
                            use_dictionary=use_dictionary,
                            compression=compression,
-                           version=version)
+                           version=version,
+                           use_deprecated_int96_timestamps=use_deprecated_int96_timestamps)
     writer.write_table(table, row_group_size=row_group_size)
     writer.close()
 
 
-def write_metadata(schema, where, version='1.0'):
+def write_metadata(schema, where, version='1.0', use_deprecated_int96_timestamps=False):
     """
     Write metadata-only Parquet file from schema
 
@@ -782,5 +784,6 @@
     version : {"1.0", "2.0"}, default "1.0"
         The Parquet format version, defaults to 1.0
     """
-    writer = ParquetWriter(where, schema, version=version)
+    writer = ParquetWriter(where, schema, version=version,
+                           use_deprecated_int96_timestamps=use_deprecated_int96_timestamps)
    writer.close()
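write_metadata now accepts the same flag and simply forwards it to the ParquetWriter it constructs. A brief sketch, assuming the pa.schema/pa.field helpers and a hypothetical output path:

    import pyarrow as pa
    import pyarrow.parquet as pq

    schema = pa.schema([pa.field('ts', pa.timestamp('ns'))])

    # Writes a schema-only file; the flag is passed straight through to ParquetWriter.
    pq.write_metadata(schema, '/tmp/_common_metadata',
                      use_deprecated_int96_timestamps=True)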
33 changes: 29 additions & 4 deletions python/pyarrow/tests/test_parquet.py
@@ -456,20 +456,45 @@ def test_date_time_types():
     ex_t6 = pa.time32('ms')
     ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)
 
-    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6],
+    t7 = pa.timestamp('ns')
+    start = pd.Timestamp('2001-01-01').value
+    data7 = np.array([start, start + 1, start + 2], dtype='int64')
+    a7 = pa.Array.from_pandas(data7, type=t7)
+
+    t7_us = pa.timestamp('us')
+    start = pd.Timestamp('2001-01-01').value
+    data7_us = np.array([start, start + 1, start + 2], dtype='int64') // 1000
+    a7_us = pa.Array.from_pandas(data7_us, type=t7_us)
+
+    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                  ['date32', 'date64', 'timestamp[us]',
                                   'time32[s]', 'time64[us]',
-                                  'time32_from64[s]'])
+                                  'time32_from64[s]',
+                                  'timestamp[ns]'])
 
     # date64 as date32
     # time32[s] to time32[ms]
-    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6],
+    # 'timestamp[ns]' to 'timestamp[us]'
+    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us],
                                     ['date32', 'date64', 'timestamp[us]',
                                      'time32[s]', 'time64[us]',
-                                     'time32_from64[s]'])
+                                     'time32_from64[s]',
+                                     'timestamp[ns]'])
 
     _check_roundtrip(table, expected=expected, version='2.0')
 
+    # date64 as date32
+    # time32[s] to time32[ms]
+    # 'timestamp[ns]' is saved as INT96 timestamp
+    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
+                                    ['date32', 'date64', 'timestamp[us]',
+                                     'time32[s]', 'time64[us]',
+                                     'time32_from64[s]',
+                                     'timestamp[ns]'])
+
+    _check_roundtrip(table, expected=expected, version='2.0',
+                     use_deprecated_int96_timestamps=True)
+
     # Unsupported stuff
     def _assert_unsupported(array):
         table = pa.Table.from_arrays([array], ['unsupported'])
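The expectations encoded in the test can also be observed through the public API rather than the _check_roundtrip helper. A rough sketch mirroring the assertions above; the file paths are illustrative:

    import numpy as np
    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    start = pd.Timestamp('2001-01-01').value
    arr = pa.Array.from_pandas(np.array([start, start + 1, start + 2],
                                        dtype='int64'),
                               type=pa.timestamp('ns'))
    table = pa.Table.from_arrays([arr], ['ts'])

    # Without the flag the ns column is stored, and read back, as timestamp[us].
    pq.write_table(table, '/tmp/no_int96.parquet', version='2.0')
    print(pq.read_table('/tmp/no_int96.parquet').schema)  # ts: timestamp[us]

    # With the flag the values go through INT96 and keep nanosecond resolution.
    pq.write_table(table, '/tmp/int96.parquet', version='2.0',
                   use_deprecated_int96_timestamps=True)
    print(pq.read_table('/tmp/int96.parquet').schema)     # ts: timestamp[ns]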