From 300084df2caae888642481a6c3c4de1906dc03dd Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Mon, 3 Oct 2022 18:25:46 +0300
Subject: [PATCH] REGR: be able to read Stata files without reading them fully
 into memory

Fixes #48700
Refs pandas-dev/pandas#9245
Refs pandas-dev/pandas#37639
Regressed in 6d1541e1782a7b94797d5432922e64a97934cfa4
---
 doc/source/user_guide/io.rst   |  7 +++++++
 doc/source/whatsnew/v1.6.0.rst |  1 +
 pandas/io/stata.py             | 18 +++++++++++++-----
 pandas/tests/io/test_stata.py  | 23 +++++++++++++++++++++++
 4 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 1552f2a8d257ba..98b9efb59b57a9 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -6276,6 +6276,13 @@ values will have ``object`` data type.
    ``int64`` for all integer types and ``float64`` for floating point data.  By default,
    the Stata data types are preserved when importing.
 
+.. note::
+
+   Every :class:`~pandas.io.stata.StataReader` object, whether created by
+   :func:`~pandas.read_stata` (when using ``iterator=True`` or ``chunksize``) or instantiated
+   by hand, must be closed by calling :meth:`~pandas.io.stata.StataReader.close` (or by using
+   the ``with`` statement, as in the examples above) to avoid leaking file handles.
+
 .. ipython:: python
    :suppress:
 
diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
index 0cad6f3caaf919..146aa9d8d0053d 100644
--- a/doc/source/whatsnew/v1.6.0.rst
+++ b/doc/source/whatsnew/v1.6.0.rst
@@ -156,6 +156,7 @@ Performance improvements
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
 - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
 - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
+- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_160.bug_fixes:
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 6baf5f0da86124..640c0d5cd5c63f 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1164,15 +1164,23 @@ def __init__(
         self._lines_read = 0
 
         self._native_byteorder = _set_endianness(sys.byteorder)
-        with get_handle(
+
+        handles = get_handle(
             path_or_buf,
             "rb",
             storage_options=storage_options,
             is_text=False,
             compression=compression,
-        ) as handles:
-            # Copy to BytesIO, and ensure no encoding
-            self.path_or_buf = BytesIO(handles.handle.read())
+        )
+        if hasattr(handles.handle, "seekable") and handles.handle.seekable():
+            # If the handle is directly seekable, use it without an extra copy.
+            self.path_or_buf = handles.handle
+            self._close_file = handles.close
+        else:
+            # Copy to memory, and ensure no encoding.
+            with handles:
+                self.path_or_buf = BytesIO(handles.handle.read())
+            self._close_file = self.path_or_buf.close
 
         self._read_header()
         self._setup_dtype()
@@ -1192,7 +1200,7 @@ def __exit__(
 
     def close(self) -> None:
         """close the handle if its open"""
-        self.path_or_buf.close()
+        self._close_file()  # set in __init__: handles.close or BytesIO.close
 
     def _set_encoding(self) -> None:
         """
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index a4e4751d753474..2d98ee2d481a90 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1842,6 +1842,29 @@ def test_backward_compat(version, datapath):
     tm.assert_frame_equal(old_dta, expected, check_dtype=False)
 
 
+def test_direct_read(datapath):
+    # Seekable inputs must be read in place rather than copied into a BytesIO.
+    file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
+
+    # Opening by path must not buffer the whole file into memory.
+    with StataReader(file_path) as reader:
+        assert not isinstance(reader.path_or_buf, io.BytesIO)
+        assert not reader.read().empty
+
+    # A caller-supplied seekable file object must be used as-is.
+    with open(file_path, "rb") as fp:
+        with StataReader(fp) as reader:
+            assert reader.path_or_buf is fp
+            assert not reader.read().empty
+
+    # A caller-supplied BytesIO is already seekable and must not be re-copied.
+    with open(file_path, "rb") as fp:
+        with io.BytesIO(fp.read()) as bio:
+            with StataReader(bio) as reader:
+                assert reader.path_or_buf is bio
+                assert not reader.read().empty
+
+
 @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
 @pytest.mark.parametrize("use_dict", [True, False])
 @pytest.mark.parametrize("infer", [True, False])