Skip to content

Commit

Permalink
ARROW-1971: [Python] Add pandas serialization to the default serialization context
Browse files Browse the repository at this point in the history
Move the registration of the pandas handlers into the default handler registration (`register_default_serialization_handlers`).

Author: devin-petersohn <devin.petersohn@gmail.com>

Closes #1462 from devin-petersohn/jira/1971_pandas_serialization and squashes the following commits:

b3dfd5b [devin-petersohn] Removing slower codepath
2ed3137 [devin-petersohn] Moving pandas register into default register
  • Loading branch information
devin-petersohn authored and wesm committed Jan 10, 2018
1 parent f82b7e4 commit b49e8f3
Showing 1 changed file with 44 additions and 80 deletions.
124 changes: 44 additions & 80 deletions python/pyarrow/serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

import numpy as np

from pyarrow import serialize_pandas, deserialize_pandas
from pyarrow.compat import builtin_pickle
from pyarrow.lib import _default_serialization_context, frombuffer

Expand Down Expand Up @@ -61,6 +60,48 @@ def _load_pickle_from_buffer(data):
_deserialize_numpy_array_pickle = _load_pickle_from_buffer


# ----------------------------------------------------------------------
# pandas-specific serialization matters

def _register_custom_pandas_handlers(context):
    """Register the dict-based pandas handlers on ``context``.

    This is the faster path referenced by ARROW-1784. ``pd.Series`` and
    ``pd.DataFrame`` go through the ``pyarrow.pandas_compat`` serialized-dict
    helpers; ``pd.Index`` is handled with the module's pickle-to-buffer
    helpers. Nothing is registered when pandas cannot be imported.

    Parameters
    ----------
    context : object exposing ``register_type``
        The serialization context that receives the handlers.
    """
    try:
        import pandas as pd
    except ImportError:
        # pandas is optional; without it there is nothing to register.
        return

    import pyarrow.pandas_compat as pdcompat

    def _frame_to_dict(frame):
        return pdcompat.dataframe_to_serialized_dict(frame)

    def _dict_to_frame(payload):
        return pdcompat.serialized_dict_to_dataframe(payload)

    def _series_to_dict(series):
        # A Series travels as a one-column DataFrame keyed by its name.
        wrapped = pd.DataFrame({series.name: series})
        return _frame_to_dict(wrapped)

    def _dict_to_series(payload):
        # Undo the wrapping above: pull the single column back out.
        frame = _dict_to_frame(payload)
        first_column = frame.columns[0]
        return frame[first_column]

    # Registered in the order Series, Index, DataFrame.
    handlers = (
        (pd.Series, 'pd.Series', _series_to_dict, _dict_to_series),
        (pd.Index, 'pd.Index', _pickle_to_buffer, _load_pickle_from_buffer),
        (pd.DataFrame, 'pd.DataFrame', _frame_to_dict, _dict_to_frame),
    )
    for pandas_type, type_id, serializer, deserializer in handlers:
        context.register_type(
            pandas_type, type_id,
            custom_serializer=serializer,
            custom_deserializer=deserializer)


def register_default_serialization_handlers(serialization_context):

# ----------------------------------------------------------------------
Expand Down Expand Up @@ -136,90 +177,13 @@ def _deserialize_torch_tensor(data):
# no torch
pass


register_default_serialization_handlers(_default_serialization_context)
_register_custom_pandas_handlers(serialization_context)


# ----------------------------------------------------------------------
# pandas-specific serialization matters

# Install the default handlers on the module-level default context.
register_default_serialization_handlers(_default_serialization_context)

# Separate context derived from the default one via clone(); the custom
# pandas handlers are registered on it further down.
# NOTE(review): presumably the clone carries over the handlers registered
# above -- confirm against the context's clone() implementation.
pandas_serialization_context = _default_serialization_context.clone()


def _register_pandas_arrow_handlers(context):
    """Register ``serialize_pandas``-based pandas handlers on ``context``.

    ``pd.Series`` and ``pd.DataFrame`` are serialized with
    ``serialize_pandas`` and restored with ``deserialize_pandas``. Nothing
    is registered when pandas cannot be imported.

    Parameters
    ----------
    context : object exposing ``register_type``
        The serialization context that receives the handlers.
    """
    try:
        import pandas as pd
    except ImportError:
        # pandas is optional; without it there is nothing to register.
        return

    def _frame_to_payload(frame):
        return serialize_pandas(frame)

    def _payload_to_frame(payload):
        return deserialize_pandas(payload)

    def _series_to_payload(series):
        # A Series travels as a one-column DataFrame keyed by its name.
        wrapped = pd.DataFrame({series.name: series})
        return serialize_pandas(wrapped)

    def _payload_to_series(payload):
        # Undo the wrapping above: pull the single column back out.
        frame = deserialize_pandas(payload)
        first_column = frame.columns[0]
        return frame[first_column]

    # Registered in the order Series, DataFrame.
    handlers = (
        (pd.Series, 'pd.Series', _series_to_payload, _payload_to_series),
        (pd.DataFrame, 'pd.DataFrame', _frame_to_payload, _payload_to_frame),
    )
    for pandas_type, type_id, serializer, deserializer in handlers:
        context.register_type(
            pandas_type, type_id,
            custom_serializer=serializer,
            custom_deserializer=deserializer)


def _register_custom_pandas_handlers(context):
    """Register the dict-based pandas handlers on ``context``.

    This is the faster path referenced by ARROW-1784. ``pd.Series`` and
    ``pd.DataFrame`` go through the ``pyarrow.pandas_compat`` serialized-dict
    helpers; ``pd.Index`` is handled with the module's pickle-to-buffer
    helpers. Nothing is registered when pandas cannot be imported.

    Parameters
    ----------
    context : object exposing ``register_type``
        The serialization context that receives the handlers.
    """
    try:
        import pandas as pd
    except ImportError:
        # pandas is optional; without it there is nothing to register.
        return

    import pyarrow.pandas_compat as pdcompat

    def _frame_to_dict(frame):
        return pdcompat.dataframe_to_serialized_dict(frame)

    def _dict_to_frame(payload):
        return pdcompat.serialized_dict_to_dataframe(payload)

    def _series_to_dict(series):
        # A Series travels as a one-column DataFrame keyed by its name.
        wrapped = pd.DataFrame({series.name: series})
        return _frame_to_dict(wrapped)

    def _dict_to_series(payload):
        # Undo the wrapping above: pull the single column back out.
        frame = _dict_to_frame(payload)
        first_column = frame.columns[0]
        return frame[first_column]

    # Registered in the order Series, Index, DataFrame.
    handlers = (
        (pd.Series, 'pd.Series', _series_to_dict, _dict_to_series),
        (pd.Index, 'pd.Index', _pickle_to_buffer, _load_pickle_from_buffer),
        (pd.DataFrame, 'pd.DataFrame', _frame_to_dict, _dict_to_frame),
    )
    for pandas_type, type_id, serializer, deserializer in handlers:
        context.register_type(
            pandas_type, type_id,
            custom_serializer=serializer,
            custom_deserializer=deserializer)


# The serialize_pandas-based handlers go on the default context, while the
# pandas_compat dict-based handlers go on the pandas-only context.
_register_pandas_arrow_handlers(_default_serialization_context)
_register_custom_pandas_handlers(pandas_serialization_context)


pandas_serialization_context.register_type(
np.ndarray, 'np.array',
custom_serializer=_serialize_numpy_array_pickle,
Expand Down

0 comments on commit b49e8f3

Please sign in to comment.