From 300084df2caae888642481a6c3c4de1906dc03dd Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Mon, 3 Oct 2022 18:25:46 +0300 Subject: [PATCH] REGR: be able to read Stata files without reading them fully into memory Fixes #48700 Refs pandas-dev/pandas#9245 Refs pandas-dev/pandas#37639 Regressed in 6d1541e1782a7b94797d5432922e64a97934cfa4 --- doc/source/user_guide/io.rst | 7 +++++++ doc/source/whatsnew/v1.6.0.rst | 1 + pandas/io/stata.py | 18 +++++++++++++----- pandas/tests/io/test_stata.py | 23 +++++++++++++++++++++++ 4 files changed, 44 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 1552f2a8d257ba..98b9efb59b57a9 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -6276,6 +6276,13 @@ values will have ``object`` data type. ``int64`` for all integer types and ``float64`` for floating point data. By default, the Stata data types are preserved when importing. +.. note:: + + All :class:`~pandas.io.stata.StataReader` objects, whether created by :func:`~pandas.read_stata` + (when using ``iterator=True`` or ``chunksize``) or instantiated by hand, must be closed by + calling :meth:`~pandas.io.stata.StataReader.close` (or by using the ``with`` statement, as + in the examples above) to avoid leaking file handles. + .. 
ipython:: python :suppress: diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 0cad6f3caaf919..146aa9d8d0053d 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -156,6 +156,7 @@ Performance improvements - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`) - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`) +- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`) .. --------------------------------------------------------------------------- .. _whatsnew_160.bug_fixes: diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 6baf5f0da86124..640c0d5cd5c63f 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1164,15 +1164,23 @@ def __init__( self._lines_read = 0 self._native_byteorder = _set_endianness(sys.byteorder) - with get_handle( + + handles = get_handle( path_or_buf, "rb", storage_options=storage_options, is_text=False, compression=compression, - ) as handles: - # Copy to BytesIO, and ensure no encoding - self.path_or_buf = BytesIO(handles.handle.read()) + ) + if hasattr(handles.handle, "seekable") and handles.handle.seekable(): + # If the handle is directly seekable, use it without an extra copy. + self.path_or_buf = handles.handle + self._close_file = handles.close + else: + # Copy to memory, and ensure no encoding. 
+ with handles: + self.path_or_buf = BytesIO(handles.handle.read()) + self._close_file = self.path_or_buf.close self._read_header() self._setup_dtype() @@ -1192,7 +1200,7 @@ def __exit__( def close(self) -> None: """close the handle if its open""" - self.path_or_buf.close() + self._close_file() def _set_encoding(self) -> None: """ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index a4e4751d753474..2d98ee2d481a90 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1842,6 +1842,29 @@ def test_backward_compat(version, datapath): tm.assert_frame_equal(old_dta, expected, check_dtype=False) +def test_direct_read(datapath, monkeypatch): + file_path = datapath("io", "data", "stata", "stata-compat-118.dta") + + # Test that opening a file path doesn't buffer the file. + with StataReader(file_path) as reader: + # Must not have been buffered to memory + assert not isinstance(reader.path_or_buf, io.BytesIO) + assert not reader.read().empty + + # Test that we use a given fp exactly, if possible. + with open(file_path, "rb") as fp: + with StataReader(fp) as reader: + assert reader.path_or_buf is fp + assert not reader.read().empty + + # Test that we use a given BytesIO exactly, if possible. + with open(file_path, "rb") as fp: + with io.BytesIO(fp.read()) as bio: + with StataReader(bio) as reader: + assert reader.path_or_buf is bio + assert not reader.read().empty + + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @pytest.mark.parametrize("use_dict", [True, False]) @pytest.mark.parametrize("infer", [True, False])