BUG: Correct behavior when reading empty dta files
Correct column selection when reading empty dta files
Preserve dtype information that was previously lost when writing and reading empty dta files

closes pandas-dev#46240
bashtage committed May 15, 2023
1 parent 3cfd868 commit d79f2d7
Showing 3 changed files with 79 additions and 13 deletions.
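For context, a minimal sketch of the fixed behaviour described in the commit message; the file name, column names and dtypes below are illustrative rather than taken from the commit:

import numpy as np
import pandas as pd

# Write a zero-row frame with explicit dtypes to a Stata file.
df = pd.DataFrame(
    {
        "a": np.array([], dtype=np.int32),
        "b": np.array([], dtype=np.float64),
    }
)
df.to_stata("empty.dta", write_index=False)

# dtype information is now preserved when the empty file is read back ...
full = pd.read_stata("empty.dta")
print(full.dtypes)  # a: int32, b: float64

# ... and column selection also works on the empty file.
sub = pd.read_stata("empty.dta", columns=["a"])
print(list(sub.columns))  # ['a']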
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.1.0.rst
@@ -377,6 +377,7 @@ Missing
MultiIndex
^^^^^^^^^^
- Bug in :meth:`MultiIndex.set_levels` not preserving dtypes for :class:`Categorical` (:issue:`52125`)
- Bug in displaying a :class:`MultiIndex` with a long element (:issue:`52960`)

I/O
^^^
@@ -386,7 +387,7 @@ I/O
- Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`)
- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
- Bug in displaying a :class:`MultiIndex` with a long element (:issue:`52960`)
- Bug when writing and reading empty Stata dta files where dtype information was lost (:issue:`46240`)

Period
^^^^^^
36 changes: 24 additions & 12 deletions pandas/io/stata.py
@@ -608,9 +608,10 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
# Replace with NumPy-compatible column
data[col] = data[col].astype(data[col].dtype.numpy_dtype)
dtype = data[col].dtype
empty_df = data.shape[0] == 0
for c_data in conversion_data:
if dtype == c_data[0]:
if data[col].max() <= np.iinfo(c_data[1]).max:
if empty_df or data[col].max() <= np.iinfo(c_data[1]).max:
dtype = c_data[1]
else:
dtype = c_data[2]
@@ -621,14 +622,17 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
data[col] = data[col].astype(dtype)

# Check values and upcast if necessary
if dtype == np.int8:

if dtype == np.int8 and not empty_df:
if data[col].max() > 100 or data[col].min() < -127:
data[col] = data[col].astype(np.int16)
elif dtype == np.int16:
elif dtype == np.int16 and not empty_df:
if data[col].max() > 32740 or data[col].min() < -32767:
data[col] = data[col].astype(np.int32)
elif dtype == np.int64:
if data[col].max() <= 2147483620 and data[col].min() >= -2147483647:
if empty_df or (
data[col].max() <= 2147483620 and data[col].min() >= -2147483647
):
data[col] = data[col].astype(np.int32)
else:
data[col] = data[col].astype(np.float64)
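The empty_df guard above matters because reductions on a zero-row column return NaN, so every range check evaluates to False and the column previously fell into the widest branch (float64 for int64 columns), losing its dtype. A small illustration, not part of the commit:

import numpy as np
import pandas as pd

empty = pd.Series([], dtype=np.int64)
print(empty.max())                # nan
print(empty.max() <= 2147483620)  # False, so the old code cast int64 to float64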
@@ -1700,13 +1704,6 @@ def read(
order_categoricals: bool | None = None,
) -> DataFrame:
self._ensure_open()
# Handle empty file or chunk. If reading incrementally raise
# StopIteration. If reading the whole thing return an empty
# data frame.
if (self._nobs == 0) and (nrows is None):
self._can_read_value_labels = True
self._data_read = True
return DataFrame(columns=self._varlist)

# Handle options
if convert_dates is None:
@@ -1723,10 +1720,25 @@
order_categoricals = self._order_categoricals
if index_col is None:
index_col = self._index_col

if nrows is None:
nrows = self._nobs

# Handle empty file or chunk. If reading incrementally raise
# StopIteration. If reading the whole thing return an empty
# data frame.
if (self._nobs == 0) and nrows == 0:
self._can_read_value_labels = True
self._data_read = True
data = DataFrame(columns=self._varlist)
# Apply dtypes correctly
for i, col in enumerate(data.columns):
if isinstance(self._dtyplist[i], np.dtype):
if self._dtyplist[i].char != "S":
data[col] = data[col].astype(self._dtyplist[i])
if columns is not None:
data = self._do_select_columns(data, columns)
return data

if (self._format_version >= 117) and (not self._value_labels_read):
self._can_read_value_labels = True
self._read_strls()
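To show the new empty-read path in isolation, here is a self-contained sketch that mirrors the dtype-restoring loop added above; varlist and dtyplist are stand-ins for the reader's _varlist and _dtyplist attributes, and their values are made up:

import numpy as np
import pandas as pd

# Stand-ins for StataReader._varlist / StataReader._dtyplist (illustrative).
varlist = ["x", "y", "s"]
dtyplist = [np.dtype(np.int16), np.dtype(np.float64), np.dtype("S5")]

data = pd.DataFrame(columns=varlist)
for i, col in enumerate(data.columns):
    # Fixed-width string columns (kind "S") stay as object; numeric columns
    # get their recorded dtype back even though there are zero rows.
    if isinstance(dtyplist[i], np.dtype) and dtyplist[i].char != "S":
        data[col] = data[col].astype(dtyplist[i])

print(data.dtypes)  # x: int16, y: float64, s: object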
53 changes: 53 additions & 0 deletions pandas/tests/io/test_stata.py
@@ -71,6 +71,41 @@ def test_read_empty_dta(self, version):
empty_ds2 = read_stata(path)
tm.assert_frame_equal(empty_ds, empty_ds2)

@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
def test_read_empty_dta_with_dtypes(self, version):
# GH 46240
# Fixing above bug revealed that types are not correctly preserved when
# writing empty DataFrames
empty_df_typed = DataFrame(
{
"i8": np.array([0], dtype=np.int8),
"i16": np.array([0], dtype=np.int16),
"i32": np.array([0], dtype=np.int32),
"i64": np.array([0], dtype=np.int64),
"u8": np.array([0], dtype=np.uint8),
"u16": np.array([0], dtype=np.uint16),
"u32": np.array([0], dtype=np.uint32),
"u64": np.array([0], dtype=np.uint64),
"f32": np.array([0], dtype=np.float32),
"f64": np.array([0], dtype=np.float64),
}
)
expected = empty_df_typed.copy()
# No uint# support. Downcast since values in range for int#
expected["u8"] = expected["u8"].astype(np.int8)
expected["u16"] = expected["u16"].astype(np.int16)
expected["u32"] = expected["u32"].astype(np.int32)
# No int64 supported at all. Downcast since values in range for int32
expected["u64"] = expected["u64"].astype(np.int32)
expected["i64"] = expected["i64"].astype(np.int32)

# GH 7369, make sure we can read a 0-obs dta file
with tm.ensure_clean() as path:
empty_df_typed.to_stata(path, write_index=False, version=version)
empty_reread = read_stata(path)
tm.assert_frame_equal(expected, empty_reread)
tm.assert_series_equal(expected.dtypes, empty_reread.dtypes)

@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
def test_read_index_col_none(self, version):
df = DataFrame({"a": range(5), "b": ["b1", "b2", "b3", "b4", "b5"]})
@@ -2274,3 +2309,21 @@ def test_nullable_support(dtype, version):
tm.assert_series_equal(df.a, reread.a)
tm.assert_series_equal(reread.b, expected_b)
tm.assert_series_equal(reread.c, expected_c)


def test_empty_frame():
# GH 46240
# create an empty DataFrame with int64 and float64 dtypes
df = DataFrame(data={"a": range(3), "b": [1.0, 2.0, 3.0]}).head(0)
with tm.ensure_clean() as path:
df.to_stata(path, write_index=False, version=117)
# Read entire dataframe
df2 = read_stata(path)
assert "b" in df2
# Dtypes don't match since there is no int64 support in Stata
dtypes = Series({"a": np.dtype("int32"), "b": np.dtype("float64")})
tm.assert_series_equal(df2.dtypes, dtypes)
# read one column of empty .dta file
df3 = read_stata(path, columns=["a"])
assert "b" not in df3
tm.assert_series_equal(df3.dtypes, dtypes.loc[["a"]])
