REGR: be able to read Stata files without reading them fully into memory
akx committed Oct 11, 2022
1 parent 28da588 commit 300084d
Showing 4 changed files with 44 additions and 5 deletions.
7 changes: 7 additions & 0 deletions doc/source/user_guide/io.rst
@@ -6276,6 +6276,13 @@
 values will have ``object`` data type.
 ``int64`` for all integer types and ``float64`` for floating point data. By default,
 the Stata data types are preserved when importing.
 
+.. note::
+
+   All :class:`~pandas.io.stata.StataReader` objects, whether created by :func:`~pandas.read_stata`
+   (when using ``iterator=True`` or ``chunksize``) or instantiated by hand, must be closed by
+   calling :meth:`~pandas.io.stata.StataReader.close` (or by using the ``with`` statement, as
+   in the examples above) to avoid leaking file handles.
+
 .. ipython:: python
    :suppress:
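A minimal sketch of the two closing patterns the note describes (the file name and the per-chunk handling are hypothetical; both forms release the underlying handle):

import pandas as pd

# Preferred: the with statement closes the reader automatically.
with pd.read_stata("example.dta", chunksize=1_000) as reader:
    for chunk in reader:
        pass  # process each chunk here

# Equivalent manual form: close() must be called explicitly,
# e.g. in a finally block, to avoid leaking the file handle.
reader = pd.read_stata("example.dta", iterator=True)
try:
    first_rows = reader.read(5)
finally:
    reader.close()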
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.6.0.rst
@@ -156,6 +156,7 @@ Performance improvements
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
 - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
 - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
+- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_160.bug_fixes:
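For context, "seekable" is the standard io predicate; a quick illustration of sources that take the new zero-copy path (the file name is hypothetical):

import io

with open("example.dta", "rb") as f:
    print(f.seekable())  # True: plain files are now read in place, without copying

buf = io.BytesIO(b"example payload")
print(buf.seekable())    # True: in-memory buffers are likewise used directly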
18 changes: 13 additions & 5 deletions pandas/io/stata.py
@@ -1164,15 +1164,23 @@ def __init__(
         self._lines_read = 0
 
         self._native_byteorder = _set_endianness(sys.byteorder)
-        with get_handle(
+
+        handles = get_handle(
             path_or_buf,
             "rb",
             storage_options=storage_options,
             is_text=False,
             compression=compression,
-        ) as handles:
-            # Copy to BytesIO, and ensure no encoding
-            self.path_or_buf = BytesIO(handles.handle.read())
+        )
+        if hasattr(handles.handle, "seekable") and handles.handle.seekable():
+            # If the handle is directly seekable, use it without an extra copy.
+            self.path_or_buf = handles.handle
+            self._close_file = handles.close
+        else:
+            # Copy to memory, and ensure no encoding.
+            with handles:
+                self.path_or_buf = BytesIO(handles.handle.read())
+            self._close_file = self.path_or_buf.close
 
         self._read_header()
         self._setup_dtype()
@@ -1192,7 +1200,7 @@ def __exit__(
 
     def close(self) -> None:
         """close the handle if it's open"""
-        self.path_or_buf.close()
+        self._close_file()
 
     def _set_encoding(self) -> None:
         """
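The heart of the change is the branch added above: use the handle in place when it supports seeking, and fall back to a one-time copy into memory otherwise. A standalone sketch of that pattern (the function name is illustrative, not part of the pandas API):

import io
from typing import BinaryIO

def as_random_access(handle: BinaryIO) -> BinaryIO:
    """Return a seekable view of handle, copying only when necessary."""
    if hasattr(handle, "seekable") and handle.seekable():
        # Already seekable: use it directly, with no extra memory cost.
        return handle
    # Non-seekable (e.g. a pipe or network stream): buffer it once.
    return io.BytesIO(handle.read())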
23 changes: 23 additions & 0 deletions pandas/tests/io/test_stata.py
@@ -1842,6 +1842,29 @@ def test_backward_compat(version, datapath):
     tm.assert_frame_equal(old_dta, expected, check_dtype=False)
 
 
+def test_direct_read(datapath, monkeypatch):
+    file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
+
+    # Test that opening a file path doesn't buffer the file.
+    with StataReader(file_path) as reader:
+        # Must not have been buffered to memory
+        assert not isinstance(reader.path_or_buf, io.BytesIO)
+        assert not reader.read().empty
+
+    # Test that we use a given fp exactly, if possible.
+    with open(file_path, "rb") as fp:
+        with StataReader(fp) as reader:
+            assert reader.path_or_buf is fp
+            assert not reader.read().empty
+
+    # Test that we use a given BytesIO exactly, if possible.
+    with open(file_path, "rb") as fp:
+        with io.BytesIO(fp.read()) as bio:
+            with StataReader(bio) as reader:
+                assert reader.path_or_buf is bio
+                assert not reader.read().empty
+
+
 @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
 @pytest.mark.parametrize("use_dict", [True, False])
 @pytest.mark.parametrize("infer", [True, False])
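The new test exercises the direct (seekable) paths; a hypothetical companion test for the fallback branch could wrap the bytes in a deliberately non-seekable reader (this assumes get_handle hands such objects to StataReader unchanged):

import io

from pandas.io.stata import StataReader

class NonSeekableReader(io.RawIOBase):
    """Serves bytes but reports itself as non-seekable."""

    def __init__(self, data: bytes) -> None:
        self._buf = io.BytesIO(data)

    def readable(self) -> bool:
        return True

    def seekable(self) -> bool:
        return False

    def readinto(self, b) -> int:
        return self._buf.readinto(b)

def test_non_seekable_fallback(datapath):  # hypothetical test
    file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
    with open(file_path, "rb") as fp:
        raw = fp.read()
    with StataReader(NonSeekableReader(raw)) as reader:
        # A non-seekable input should have been buffered into memory.
        assert isinstance(reader.path_or_buf, io.BytesIO)
        assert not reader.read().empty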
