diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 33b7e128ef8bf..38727c2526bb5 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -293,6 +293,7 @@ Other API Changes - :func:`Series.argmin` and :func:`Series.argmax` will now raise a ``TypeError`` when used with ``object`` dtypes, instead of a ``ValueError`` (:issue:`13595`) - :class:`Period` is now immutable, and will now raise an ``AttributeError`` when a user tries to assign a new value to the ``ordinal`` or ``freq`` attributes (:issue:`17116`). - :func:`to_datetime` when passed a tz-aware ``origin=`` kwarg will now raise a more informative ``ValueError`` rather than a ``TypeError`` (:issue:`16842`) +- Renamed non-functional ``index`` to ``index_col`` in :func:`read_stata` to improve API consistency (:issue:`16342`) .. _whatsnew_0210.deprecations: @@ -370,6 +371,7 @@ I/O - Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`). - Bug in :func:`read_csv` when called with a single-element list ``header`` would return a ``DataFrame`` of all NaN values (:issue:`7757`) - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) +- Bug in :func:`read_stata` where the index was not set (:issue:`16342`) - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`) Plotting diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 253ed03c25db9..b303b1000ff4d 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -9,31 +9,30 @@ You can find more information on http://presbrey.mit.edu/PyDTA and http://www.statsmodels.org/devel/ """ -import numpy as np -import sys +import datetime import struct -from dateutil.relativedelta import relativedelta +import sys -from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64_dtype, - _ensure_object) +import numpy as np +from dateutil.relativedelta import relativedelta +from pandas._libs.lib import max_len_string_array, infer_dtype +from pandas._libs.tslib import NaT, Timestamp +import pandas as pd +from pandas import compat, to_timedelta, to_datetime, isna, DatetimeIndex +from pandas.compat import (lrange, lmap, lzip, text_type, string_types, range, + zip, BytesIO) from pandas.core.base import StringMixin from pandas.core.categorical import Categorical +from pandas.core.dtypes.common import (is_categorical_dtype, _ensure_object, + is_datetime64_dtype) from pandas.core.frame import DataFrame from pandas.core.series import Series -import datetime -from pandas import compat, to_timedelta, to_datetime, isna, DatetimeIndex -from pandas.compat import lrange, lmap, lzip, text_type, string_types, range, \ - zip, BytesIO -from pandas.util._decorators import Appender -import pandas as pd - from pandas.io.common import (get_filepath_or_buffer, BaseIterator, _stringify_path) -from pandas._libs.lib import max_len_string_array, infer_dtype -from pandas._libs.tslib import NaT, Timestamp +from pandas.util._decorators import Appender +from pandas.util._decorators import deprecate_kwarg VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1', 'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1') @@ -53,8 +52,8 @@ Encoding used to parse the files. None defaults to latin-1.""" _statafile_processing_params2 = """\ -index : identifier of index column - identifier of column that should be used as index of the DataFrame +index_col : string, optional, default: None + Column to set as index convert_missing : boolean, defaults to False Flag indicating whether to convert missing values to their Stata representations. If False, missing values are replaced with nans. @@ -159,15 +158,16 @@ @Appender(_read_stata_doc) +@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col') def read_stata(filepath_or_buffer, convert_dates=True, - convert_categoricals=True, encoding=None, index=None, + convert_categoricals=True, encoding=None, index_col=None, convert_missing=False, preserve_dtypes=True, columns=None, order_categoricals=True, chunksize=None, iterator=False): reader = StataReader(filepath_or_buffer, convert_dates=convert_dates, convert_categoricals=convert_categoricals, - index=index, convert_missing=convert_missing, + index_col=index_col, convert_missing=convert_missing, preserve_dtypes=preserve_dtypes, columns=columns, order_categoricals=order_categoricals, @@ -944,8 +944,9 @@ def __init__(self, encoding): class StataReader(StataParser, BaseIterator): __doc__ = _stata_reader_doc + @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col') def __init__(self, path_or_buf, convert_dates=True, - convert_categoricals=True, index=None, + convert_categoricals=True, index_col=None, convert_missing=False, preserve_dtypes=True, columns=None, order_categoricals=True, encoding='latin-1', chunksize=None): @@ -956,7 +957,7 @@ def __init__(self, path_or_buf, convert_dates=True, # calls to read). self._convert_dates = convert_dates self._convert_categoricals = convert_categoricals - self._index = index + self._index_col = index_col self._convert_missing = convert_missing self._preserve_dtypes = preserve_dtypes self._columns = columns @@ -1460,8 +1461,9 @@ def get_chunk(self, size=None): return self.read(nrows=size) @Appender(_read_method_doc) + @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col') def read(self, nrows=None, convert_dates=None, - convert_categoricals=None, index=None, + convert_categoricals=None, index_col=None, convert_missing=None, preserve_dtypes=None, columns=None, order_categoricals=None): # Handle empty file or chunk. If reading incrementally raise @@ -1486,6 +1488,8 @@ def read(self, nrows=None, convert_dates=None, columns = self._columns if order_categoricals is None: order_categoricals = self._order_categoricals + if index_col is None: + index_col = self._index_col if nrows is None: nrows = self.nobs @@ -1524,14 +1528,14 @@ def read(self, nrows=None, convert_dates=None, self._read_value_labels() if len(data) == 0: - data = DataFrame(columns=self.varlist, index=index) + data = DataFrame(columns=self.varlist) else: - data = DataFrame.from_records(data, index=index) + data = DataFrame.from_records(data) data.columns = self.varlist # If index is not specified, use actual row number rather than # restarting at 0 for each chunk. - if index is None: + if index_col is None: ix = np.arange(self._lines_read - read_lines, self._lines_read) data = data.set_index(ix) @@ -1553,7 +1557,7 @@ def read(self, nrows=None, convert_dates=None, cols_ = np.where(self.dtyplist)[0] # Convert columns (if needed) to match input type - index = data.index + ix = data.index requires_type_conversion = False data_formatted = [] for i in cols_: @@ -1563,7 +1567,7 @@ def read(self, nrows=None, convert_dates=None, if dtype != np.dtype(object) and dtype != self.dtyplist[i]: requires_type_conversion = True data_formatted.append( - (col, Series(data[col], index, self.dtyplist[i]))) + (col, Series(data[col], ix, self.dtyplist[i]))) else: data_formatted.append((col, data[col])) if requires_type_conversion: @@ -1606,6 +1610,9 @@ def read(self, nrows=None, convert_dates=None, if convert: data = DataFrame.from_items(retyped_data) + if index_col is not None: + data = data.set_index(data.pop(index_col)) + return data def _do_convert_missing(self, data, convert_missing): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index a414928d318c4..94a0ac31e093e 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -476,7 +476,7 @@ def test_read_write_reread_dta15(self): tm.assert_frame_equal(parsed_114, parsed_117) def test_timestamp_and_label(self): - original = DataFrame([(1,)], columns=['var']) + original = DataFrame([(1,)], columns=['variable']) time_stamp = datetime(2000, 2, 29, 14, 21) data_label = 'This is a data file.' with tm.ensure_clean() as path: @@ -1309,3 +1309,12 @@ def test_value_labels_iterator(self, write_index): dta_iter = pd.read_stata(path, iterator=True) value_labels = dta_iter.value_labels() assert value_labels == {'A': {0: 'A', 1: 'B', 2: 'C', 3: 'E'}} + + def test_set_index(self): + # GH 17328 + df = tm.makeDataFrame() + df.index.name = 'index' + with tm.ensure_clean() as path: + df.to_stata(path) + reread = pd.read_stata(path, index_col='index') + tm.assert_frame_equal(df, reread)