Skip to content

Commit

Permalink
[SPARK-33983][PYTHON] Update cloudpickle to v1.6.0
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

This PR proposes to upgrade cloudpickle from 1.5.0 to 1.6.0.
It virtually contains one fix:

cloudpipe/cloudpickle@4510be8

From a cursory look, this isn't a regression, and not even properly supported in Python:

```python
>>> import pickle
>>> pickle.dumps({}.keys())
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: cannot pickle 'dict_keys' object
```

So it seems fine not to backport.

### Why are the changes needed?

To leverage bug fixes from the cloudpickle upstream.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Jenkins build and GitHub actions build will test it out.

Closes #31007 from HyukjinKwon/cloudpickle-upgrade.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
  • Loading branch information
HyukjinKwon authored and dongjoon-hyun committed Jan 4, 2021
1 parent 414d323 commit d6322bf
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 13 deletions.
6 changes: 5 additions & 1 deletion python/pyspark/cloudpickle/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,8 @@
from pyspark.cloudpickle.cloudpickle import * # noqa
from pyspark.cloudpickle.cloudpickle_fast import CloudPickler, dumps, dump # noqa

__version__ = '1.5.0'
# Conform to the convention used by python serialization libraries, which
# expose their Pickler subclass at top-level under the "Pickler" name.
Pickler = CloudPickler

__version__ = '1.6.0'
22 changes: 17 additions & 5 deletions python/pyspark/cloudpickle/cloudpickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def g():
DEFAULT_PROTOCOL = pickle.HIGHEST_PROTOCOL

# Track the provenance of reconstructed dynamic classes to make it possible to
# reconstruct instances from the matching singleton class definition when
# recontruct instances from the matching singleton class definition when
# appropriate and preserve the usual "isinstance" semantics of Python objects.
_DYNAMIC_CLASS_TRACKER_BY_CLASS = weakref.WeakKeyDictionary()
_DYNAMIC_CLASS_TRACKER_BY_ID = weakref.WeakValueDictionary()
Expand Down Expand Up @@ -236,7 +236,7 @@ def _extract_code_globals(co):
out_names = {names[oparg] for _, oparg in _walk_global_ops(co)}

# Declaring a function inside another one using the "def ..."
# syntax generates a constant code object corresponding to the one
# syntax generates a constant code object corresonding to the one
# of the nested function's As the nested function may itself need
# global variables, we need to introspect its code, extract its
# globals, (look for code object in it's co_consts attribute..) and
Expand Down Expand Up @@ -457,7 +457,7 @@ def _is_parametrized_type_hint(obj):
is_typing = getattr(obj, '__origin__', None) is not None

# typing_extensions.Literal
is_literal = getattr(obj, '__values__', None) is not None
is_litteral = getattr(obj, '__values__', None) is not None

# typing_extensions.Final
is_final = getattr(obj, '__type__', None) is not None
Expand All @@ -469,7 +469,7 @@ def _is_parametrized_type_hint(obj):
getattr(obj, '__result__', None) is not None and
getattr(obj, '__args__', None) is not None
)
return any((is_typing, is_literal, is_final, is_union, is_tuple,
return any((is_typing, is_litteral, is_final, is_union, is_tuple,
is_callable))

def _create_parametrized_type_hint(origin, args):
Expand Down Expand Up @@ -699,7 +699,7 @@ def _make_skel_func(code, cell_count, base_globals=None):
"""
# This function is deprecated and should be removed in cloudpickle 1.7
warnings.warn(
"A pickle file created using an old (<=1.4.1) version of cloudpickle "
"A pickle file created using an old (<=1.4.1) version of cloudpicke "
"is currently being loaded. This is not supported by cloudpickle and "
"will break in cloudpickle 1.7", category=UserWarning
)
Expand Down Expand Up @@ -828,3 +828,15 @@ def _get_bases(typ):
# For regular class objects
bases_attr = '__bases__'
return getattr(typ, bases_attr)


def _make_dict_keys(obj):
return dict.fromkeys(obj).keys()


def _make_dict_values(obj):
return {i: _ for i, _ in enumerate(obj)}.values()


def _make_dict_items(obj):
return obj.items()
37 changes: 30 additions & 7 deletions python/pyspark/cloudpickle/cloudpickle_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
is only available for Python versions 3.8+, a lot of backward-compatibility
code is also removed.
Note that the C Pickler subclassing API is CPython-specific. Therefore, some
Note that the C Pickler sublassing API is CPython-specific. Therefore, some
guards present in cloudpickle.py that were written to handle PyPy specificities
are not present in cloudpickle_fast.py
"""
import _collections_abc
import abc
import copyreg
import io
Expand All @@ -33,8 +34,8 @@
_typevar_reduce, _get_bases, _make_cell, _make_empty_cell, CellType,
_is_parametrized_type_hint, PYPY, cell_set,
parametrized_type_hint_getinitargs, _create_parametrized_type_hint,
builtin_code_type

builtin_code_type,
_make_dict_keys, _make_dict_values, _make_dict_items,
)


Expand Down Expand Up @@ -179,7 +180,7 @@ def _class_getstate(obj):
clsdict.pop('__weakref__', None)

if issubclass(type(obj), abc.ABCMeta):
# If obj is an instance of an ABCMeta subclass, don't pickle the
# If obj is an instance of an ABCMeta subclass, dont pickle the
# cache/negative caches populated during isinstance/issubclass
# checks, but pickle the list of registered subclasses of obj.
clsdict.pop('_abc_cache', None)
Expand Down Expand Up @@ -400,14 +401,32 @@ def _class_reduce(obj):
return NotImplemented


def _dict_keys_reduce(obj):
# Safer not to ship the full dict as sending the rest might
# be unintended and could potentially cause leaking of
# sensitive information
return _make_dict_keys, (list(obj), )


def _dict_values_reduce(obj):
# Safer not to ship the full dict as sending the rest might
# be unintended and could potentially cause leaking of
# sensitive information
return _make_dict_values, (list(obj), )


def _dict_items_reduce(obj):
return _make_dict_items, (dict(obj), )


# COLLECTIONS OF OBJECTS STATE SETTERS
# ------------------------------------
# state setters are called at unpickling time, once the object is created and
# it has to be updated to how it was at unpickling time.


def _function_setstate(obj, state):
"""Update the state of a dynamic function.
"""Update the state of a dynaamic function.
As __closure__ and __globals__ are readonly attributes of a function, we
cannot rely on the native setstate routine of pickle.load_build, that calls
Expand Down Expand Up @@ -473,6 +492,10 @@ class CloudPickler(Pickler):
_dispatch_table[types.MappingProxyType] = _mappingproxy_reduce
_dispatch_table[weakref.WeakSet] = _weakset_reduce
_dispatch_table[typing.TypeVar] = _typevar_reduce
_dispatch_table[_collections_abc.dict_keys] = _dict_keys_reduce
_dispatch_table[_collections_abc.dict_values] = _dict_values_reduce
_dispatch_table[_collections_abc.dict_items] = _dict_items_reduce


dispatch_table = ChainMap(_dispatch_table, copyreg.dispatch_table)

Expand Down Expand Up @@ -556,7 +579,7 @@ def dump(self, obj):
# `dispatch` attribute. Earlier versions of the protocol 5 CloudPickler
# used `CloudPickler.dispatch` as a class-level attribute storing all
# reducers implemented by cloudpickle, but the attribute name was not a
# great choice given the meaning of `CloudPickler.dispatch` when
# great choice given the meaning of `Cloudpickler.dispatch` when
# `CloudPickler` extends the pure-python pickler.
dispatch = dispatch_table

Expand Down Expand Up @@ -630,7 +653,7 @@ def reducer_override(self, obj):
return self._function_reduce(obj)
else:
# fallback to save_global, including the Pickler's
# dispatch_table
# distpatch_table
return NotImplemented

else:
Expand Down

0 comments on commit d6322bf

Please sign in to comment.