Skip to content

Commit

Permalink
ENH: support 'infer' compression in _get_handle() (pandas-dev#17900)
Browse files Browse the repository at this point in the history
  • Loading branch information
Dobatymo authored and alimcmaster1 committed Aug 11, 2018
1 parent 1122004 commit 41dcedf
Show file tree
Hide file tree
Showing 7 changed files with 89 additions and 16 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Expand Up @@ -83,6 +83,7 @@ Other Enhancements
- :func:`read_html` copies cell data across ``colspan``s and ``rowspan``s, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`)
- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`)
- :func:`~DataFrame.to_csv` and :func:`~DataFrame.to_json` now support ``compression='infer'`` to infer compression based on filename (:issue:`15008`)
-

.. _whatsnew_0240.api_breaking:
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/frame.py
Expand Up @@ -1695,10 +1695,10 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
encoding : string, optional
A string representing the encoding to use in the output file,
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
compression : string, optional
A string representing the compression to use in the output file.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
used when the first argument is a filename.
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default None
If 'infer' and `path_or_buf` is path-like, then detect compression
from the following extensions: '.gz', '.bz2', or '.xz'
(otherwise no compression).
line_terminator : string, default ``'\n'``
The newline character or character sequence to use in the output
file
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Expand Up @@ -1906,7 +1906,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,
.. versionadded:: 0.19.0
compression : {None, 'gzip', 'bz2', 'zip', 'xz'}
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default None
A string representing the compression to use in the output file,
only used when the first argument is a filename. If 'infer' and
`path_or_buf` is path-like, then detect compression from the
following extensions: '.gz', '.bz2', or '.xz'
(otherwise no compression).
Expand Down
17 changes: 12 additions & 5 deletions pandas/io/common.py
Expand Up @@ -267,10 +267,12 @@ def _infer_compression(filepath_or_buffer, compression):
Parameters
----------
filepath_or_buf :
filepath_or_buffer :
a path (str) or buffer
compression : str or None
the compression method including None for no compression and 'infer'
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
If 'infer' and `filepath_or_buffer` is path-like, then detect
compression from the following extensions: '.gz', '.bz2', '.zip',
or '.xz' (otherwise no compression).
Returns
-------
Expand Down Expand Up @@ -322,8 +324,10 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
mode : str
mode to open path_or_buf with
encoding : str or None
compression : str or None
Supported compression protocols are gzip, bz2, zip, and xz
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
If 'infer' and `filepath_or_buffer` is path-like, then detect
compression from the following extensions: '.gz', '.bz2', '.zip',
or '.xz' (otherwise no compression).
memory_map : boolean, default False
See parsers._parser_params for more information.
is_text : boolean, default True
Expand All @@ -350,6 +354,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
path_or_buf = _stringify_path(path_or_buf)
is_path = isinstance(path_or_buf, compat.string_types)

if is_path:
compression = _infer_compression(path_or_buf, compression)

if compression:

if compat.PY2 and not is_path and encoding:
Expand Down
8 changes: 3 additions & 5 deletions pandas/io/pickle.py
Expand Up @@ -5,7 +5,7 @@
from numpy.lib.format import read_array, write_array
from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
from pandas.core.dtypes.common import is_datetime64_dtype, _NS_DTYPE
from pandas.io.common import _get_handle, _infer_compression, _stringify_path
from pandas.io.common import _get_handle, _stringify_path


def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
Expand Down Expand Up @@ -67,9 +67,8 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
>>> os.remove("./dummy.pkl")
"""
path = _stringify_path(path)
inferred_compression = _infer_compression(path, compression)
f, fh = _get_handle(path, 'wb',
compression=inferred_compression,
compression=compression,
is_text=False)
if protocol < 0:
protocol = pkl.HIGHEST_PROTOCOL
Expand Down Expand Up @@ -138,12 +137,11 @@ def read_pickle(path, compression='infer'):
>>> os.remove("./dummy.pkl")
"""
path = _stringify_path(path)
inferred_compression = _infer_compression(path, compression)

def read_wrapper(func):
# wrapper file handle open/close operation
f, fh = _get_handle(path, 'rb',
compression=inferred_compression,
compression=compression,
is_text=False)
try:
return func(f)
Expand Down
37 changes: 36 additions & 1 deletion pandas/tests/io/formats/test_to_csv.py
@@ -1,9 +1,12 @@
# -*- coding: utf-8 -*-

import sys

import pytest

import numpy as np
import pandas as pd
import pytest

from pandas import DataFrame
from pandas.util import testing as tm

Expand Down Expand Up @@ -316,3 +319,35 @@ def test_to_csv_write_to_open_file(self):
df.to_csv(f, header=None, index=None)
with open(path, 'r') as f:
assert f.read() == expected

@pytest.mark.parametrize("to_infer", [True, False])
@pytest.mark.parametrize("read_infer", [True, False])
def test_to_csv_compression(self, compression_only,
read_infer, to_infer):
# see gh-15008
compression = compression_only

if compression == "zip":
pytest.skip("{compression} is not supported "
"for to_csv".format(compression=compression))

# We'll complete file extension subsequently.
filename = "test."

if compression == "gzip":
filename += "gz"
else:
# xz --> .xz
# bz2 --> .bz2
filename += compression

df = DataFrame({"A": [1]})

to_compression = "infer" if to_infer else compression
read_compression = "infer" if read_infer else compression

with tm.ensure_clean(filename) as path:
df.to_csv(path, compression=to_compression)
result = pd.read_csv(path, index_col=0,
compression=read_compression)
tm.assert_frame_equal(result, df)
32 changes: 32 additions & 0 deletions pandas/tests/io/json/test_compression.py
Expand Up @@ -88,3 +88,35 @@ def test_read_unsupported_compression_type():
msg = "Unrecognized compression type: unsupported"
assert_raises_regex(ValueError, msg, pd.read_json,
path, compression="unsupported")


@pytest.mark.parametrize("to_infer", [True, False])
@pytest.mark.parametrize("read_infer", [True, False])
def test_to_json_compression(compression_only,
                             read_infer, to_infer):
    # Round-trip a frame through to_json/read_json for every supported
    # compression method, crossing 'infer' with the explicit method on
    # both the write and the read side (see gh-15008).
    compression = compression_only

    if compression == "zip":
        # BUG FIX: the skip message previously said "for to_csv",
        # copy-pasted from the csv test; this is the to_json test.
        pytest.skip("{compression} is not supported "
                    "for to_json".format(compression=compression))

    # We'll complete file extension subsequently.
    filename = "test."

    if compression == "gzip":
        filename += "gz"
    else:
        # xz --> .xz
        # bz2 --> .bz2
        filename += compression

    df = pd.DataFrame({"A": [1]})

    to_compression = "infer" if to_infer else compression
    read_compression = "infer" if read_infer else compression

    with tm.ensure_clean(filename) as path:
        df.to_json(path, compression=to_compression)
        result = pd.read_json(path, compression=read_compression)
        tm.assert_frame_equal(result, df)

0 comments on commit 41dcedf

Please sign in to comment.