Skip to content

Commit

Permalink
REGR: be able to read Stata files without reading them fully into memory
Browse files Browse the repository at this point in the history
Fixes pandas-dev#48700
Regressed in pandas-dev#9245
Regressed in 2f0ada3
  • Loading branch information
akx committed Oct 5, 2022
1 parent e25aa9d commit 2312ed1
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 5 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.6.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ MultiIndex
I/O
^^^
- Bug in :func:`read_sas` caused fragmentation of :class:`DataFrame` and raised :class:`.errors.PerformanceWarning` (:issue:`48595`)
- Regression in :class:`StataReader` caused all files to needlessly be buffered in memory (:issue:`48922`)
-

Period
Expand Down
18 changes: 13 additions & 5 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1164,15 +1164,23 @@ def __init__(
self._lines_read = 0

self._native_byteorder = _set_endianness(sys.byteorder)
with get_handle(

handles = get_handle(
path_or_buf,
"rb",
storage_options=storage_options,
is_text=False,
compression=compression,
) as handles:
# Copy to BytesIO, and ensure no encoding
self.path_or_buf = BytesIO(handles.handle.read())
)
if hasattr(handles.handle, "seekable") and handles.handle.seekable():
# If the handle is directly seekable, use it without an extra copy.
self.path_or_buf = handles.handle
self._close_file = handles.close
else:
# Copy to memory, and ensure no encoding.
with handles:
self.path_or_buf = BytesIO(handles.handle.read())
self._close_file = self.path_or_buf.close

self._read_header()
self._setup_dtype()
Expand All @@ -1192,7 +1200,7 @@ def __exit__(

def close(self) -> None:
"""close the handle if it's open"""
self.path_or_buf.close()
self._close_file()

def _set_encoding(self) -> None:
"""
Expand Down

0 comments on commit 2312ed1

Please sign in to comment.