From 3896e5eb2f554cb7dcae6ba785c187d9f6ae3fd3 Mon Sep 17 00:00:00 2001 From: Mortada Mehyar Date: Sat, 16 May 2015 16:42:20 -0700 Subject: [PATCH 01/26] BUG: mean overflows for integer dtypes (fixes #10155) --- doc/source/whatsnew/v0.17.0.txt | 1 + pandas/core/nanops.py | 23 ++++++++++++++++------- pandas/tests/test_nanops.py | 28 +++++++++++++++++++++++++++- 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 1cff74d41f686..6f04b0358394f 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -63,6 +63,7 @@ Bug Fixes - Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`) +- Bug in ``mean()`` where integer dtypes can overflow (:issue:`10172`) - Bug where Panel.from_dict does not set dtype when specified (:issue:`10058`) - Bug in ``Timestamp``'s' ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties return ``np.int`` type, not built-in ``int``. (:issue:`10050`) - Bug in ``NaT`` raises ``AttributeError`` when accessing to ``daysinmonth``, ``dayofweek`` properties. 
(:issue:`10096`) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e921a9d562bc1..0df160618b7c3 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -20,7 +20,7 @@ is_complex_dtype, is_integer_dtype, is_bool_dtype, is_object_dtype, is_datetime64_dtype, is_timedelta64_dtype, - is_datetime_or_timedelta_dtype, + is_datetime_or_timedelta_dtype, _get_dtype, is_int_or_datetime_dtype, is_any_int_dtype) @@ -254,8 +254,16 @@ def nansum(values, axis=None, skipna=True): @bottleneck_switch() def nanmean(values, axis=None, skipna=True): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) - the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_max)) - count = _get_counts(mask, axis) + + dtype_sum = dtype_max + dtype_count = np.float64 + if is_integer_dtype(dtype): + dtype_sum = np.float64 + elif is_float_dtype(dtype): + dtype_sum = dtype + dtype_count = dtype + count = _get_counts(mask, axis, dtype=dtype_count) + the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) if axis is not None and getattr(the_sum, 'ndim', False): the_mean = the_sum / count @@ -557,15 +565,16 @@ def _maybe_arg_null_out(result, axis, mask, skipna): return result -def _get_counts(mask, axis): +def _get_counts(mask, axis, dtype=float): + dtype = _get_dtype(dtype) if axis is None: - return float(mask.size - mask.sum()) + return dtype.type(mask.size - mask.sum()) count = mask.shape[axis] - mask.sum(axis) try: - return count.astype(float) + return count.astype(dtype) except AttributeError: - return np.array(count, dtype=float) + return np.array(count, dtype=dtype) def _maybe_null_out(result, axis, mask): diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 2a605cba8a6c0..1adb8a5d9217c 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -5,7 +5,7 @@ import numpy as np -from pandas.core.common import isnull +from pandas.core.common import isnull, is_integer_dtype import pandas.core.nanops as nanops import 
pandas.util.testing as tm @@ -323,6 +323,32 @@ def test_nanmean(self): allow_complex=False, allow_obj=False, allow_str=False, allow_date=False, allow_tdelta=True) + def test_nanmean_overflow(self): + # GH 10155 + # In the previous implementation mean can overflow for int dtypes, it + # is now consistent with numpy + from pandas import Series + + # numpy < 1.9.0 is not computing this correctly + from distutils.version import LooseVersion + if LooseVersion(np.__version__) >= '1.9.0': + for a in [2 ** 55, -2 ** 55, 20150515061816532]: + s = Series(a, index=range(500), dtype=np.int64) + result = s.mean() + np_result = s.values.mean() + self.assertEqual(result, a) + self.assertEqual(result, np_result) + self.assertTrue(result.dtype == np.float64) + + # check returned dtype + for dtype in [np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]: + s = Series(range(10), dtype=dtype) + result = s.mean() + if is_integer_dtype(dtype): + self.assertTrue(result.dtype == np.float64) + else: + self.assertTrue(result.dtype == dtype) + def test_nanmedian(self): self.check_funs(nanops.nanmedian, np.median, allow_complex=False, allow_str=False, allow_date=False, From 393ce8b01e1557ffce36986c695cc33e9f36ce1c Mon Sep 17 00:00:00 2001 From: Jake VanderPlas Date: Fri, 22 May 2015 13:54:31 -0700 Subject: [PATCH 02/26] DOC: minor doc fix for Series.append; indices can overlap --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 8ef9adb1d24a4..6367fb4fe0396 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1442,7 +1442,7 @@ def searchsorted(self, v, side='left', sorter=None): def append(self, to_append, verify_integrity=False): """ - Concatenate two or more Series. The indexes must not overlap + Concatenate two or more Series. 
Parameters ---------- From 8756a50129bb8bff1edafcbd57a348c2fbc3e3f7 Mon Sep 17 00:00:00 2001 From: Robin Wilson Date: Sat, 23 May 2015 14:17:31 +0100 Subject: [PATCH 03/26] DOC: Updated to mention axis='index' and axis='columns' Updated docs throughout DataFrame methods to mention that axis can be set to 'index' or 'columns' instead of 0 or 1 which improves readability significantly. --- pandas/core/frame.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ed01323eb9a27..f36108262432d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2745,7 +2745,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, Parameters ---------- - axis : {0, 1}, or tuple/list thereof + axis : {0 or 'index', 1 or 'columns'}, or tuple/list thereof Pass tuple or list to drop on multiple axes how : {'any', 'all'} * any : if any NA values are present, drop that label @@ -2890,7 +2890,7 @@ def sort(self, columns=None, axis=0, ascending=True, ascending : boolean or list, default True Sort ascending vs. descending. Specify list for multiple sort orders - axis : {0, 1} + axis : {0 or 'index', 1 or 'columns'}, default 0 Sort index/rows versus columns inplace : boolean, default False Sort the DataFrame without creating a new instance @@ -2919,7 +2919,7 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False, Parameters ---------- - axis : {0, 1} + axis : {0 or 'index', 1 or 'columns'}, default 0 Sort index/rows versus columns by : object Column name(s) in frame. 
Accepts a column name or a list @@ -3027,7 +3027,7 @@ def sortlevel(self, level=0, axis=0, ascending=True, Parameters ---------- level : int - axis : {0, 1} + axis : {0 or 'index', 1 or 'columns'}, default 0 ascending : boolean, default True inplace : boolean, default False Sort the DataFrame without creating a new instance @@ -3639,9 +3639,9 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, ---------- func : function Function to apply to each column/row - axis : {0, 1} - * 0 : apply function to each column - * 1 : apply function to each row + axis : {0 or 'index', 1 or 'columns'}, default 0 + * 0 or 'index': apply function to each column + * 1 or 'columns': apply function to each row broadcast : boolean, default False For aggregation functions, return object of same size with values propagated @@ -4162,8 +4162,8 @@ def corrwith(self, other, axis=0, drop=False): Parameters ---------- other : DataFrame - axis : {0, 1} - 0 to compute column-wise, 1 for row-wise + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' to compute column-wise, 1 or 'columns' for row-wise drop : boolean, default False Drop missing indices from result, default returns union of all @@ -4214,8 +4214,8 @@ def count(self, axis=0, level=None, numeric_only=False): Parameters ---------- - axis : {0, 1} - 0 for row-wise, 1 for column-wise + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' for row-wise, 1 or 'columns' for column-wise level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a DataFrame @@ -4368,8 +4368,8 @@ def idxmin(self, axis=0, skipna=True): Parameters ---------- - axis : {0, 1} - 0 for row-wise, 1 for column-wise + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' for row-wise, 1 or 'columns' for column-wise skipna : boolean, default True Exclude NA/null values. 
If an entire row/column is NA, the result will be NA @@ -4399,8 +4399,8 @@ def idxmax(self, axis=0, skipna=True): Parameters ---------- - axis : {0, 1} - 0 for row-wise, 1 for column-wise + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' for row-wise, 1 or 'columns' for column-wise skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be first index. @@ -4446,9 +4446,9 @@ def mode(self, axis=0, numeric_only=False): Parameters ---------- - axis : {0, 1, 'index', 'columns'} (default 0) - * 0/'index' : get mode of each column - * 1/'columns' : get mode of each row + axis : {0 or 'index', 1 or 'columns'}, default 0 + * 0 or 'index' : get mode of each column + * 1 or 'columns' : get mode of each row numeric_only : boolean, default False if True, only apply to numeric columns @@ -4553,7 +4553,7 @@ def rank(self, axis=0, numeric_only=None, method='average', Parameters ---------- - axis : {0, 1}, default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 Ranks over columns (0) or rows (1) numeric_only : boolean, default None Include only float, int, boolean data @@ -4605,7 +4605,7 @@ def to_timestamp(self, freq=None, how='start', axis=0, copy=True): how : {'s', 'e', 'start', 'end'} Convention for converting period to timestamp; start of period vs. 
end - axis : {0, 1} default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to convert (the index by default) copy : boolean, default True If false then underlying input data is not copied @@ -4636,7 +4636,7 @@ def to_period(self, freq=None, axis=0, copy=True): Parameters ---------- freq : string, default - axis : {0, 1}, default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to convert (the index by default) copy : boolean, default True If False then underlying input data is not copied From df2cffc0a7cbaa964892e6a1b0363e8c194a57f3 Mon Sep 17 00:00:00 2001 From: Robin Wilson Date: Mon, 25 May 2015 18:59:01 +0100 Subject: [PATCH 04/26] DOC: Added nlargest/nsmallest to API docs Fixes #10145 --- doc/source/api.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/api.rst b/doc/source/api.rst index 3b2e8b65768bb..f5ba03afc9f19 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -358,6 +358,8 @@ Computations / Descriptive Stats Series.median Series.min Series.mode + Series.nlargest + Series.nsmallest Series.pct_change Series.prod Series.quantile From 935d2495b288a24730efda98d2bad941244a0bc0 Mon Sep 17 00:00:00 2001 From: Tom Ajamian Date: Sat, 23 May 2015 14:31:34 -0400 Subject: [PATCH 05/26] BUG: closes issue #7212 - side effect on passed columns list --- doc/source/whatsnew/v0.17.0.txt | 2 ++ pandas/io/pytables.py | 4 ++++ pandas/io/tests/test_pytables.py | 23 +++++++++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 1cff74d41f686..d7955d7210ade 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -60,6 +60,8 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug where read_hdf store.select modifies the passed columns list when + multi-indexed (:issue:`7212`) - Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`) diff --git a/pandas/io/pytables.py 
b/pandas/io/pytables.py index 458a245da6bdb..4cbc7aeaa3df7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3453,6 +3453,10 @@ def get_blk_items(mgr, blocks): def process_axes(self, obj, columns=None): """ process axes filters """ + # make a copy to avoid side effects + if columns is not None: + columns = list(columns) + # make sure to include levels if we have them if columns is not None and self.is_multi_index: for n in self.levels: diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 6cfd569904097..7d9c3c051344f 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4617,6 +4617,29 @@ def test_preserve_timedeltaindex_type(self): store['df'] = df assert_frame_equal(store['df'], df) + def test_colums_multiindex_modified(self): + # BUG: 7212 + # read_hdf store.select modified the passed columns parameters + # when multi-indexed. + + df = DataFrame(np.random.rand(4, 5), + index=list('abcd'), + columns=list('ABCDE')) + df.index.name = 'letters' + df = df.set_index(keys='E', append=True) + + data_columns = df.index.names+df.columns.tolist() + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', + mode='a', + append=True, + data_columns=data_columns, + index=False) + cols2load = list('BCD') + cols2load_original = list(cols2load) + df_loaded = read_hdf(path, 'df', columns=cols2load) + self.assertTrue(cols2load_original == cols2load) + def _test_sort(obj): if isinstance(obj, DataFrame): From 3518e3576eaf5c450d11e7e3c444521977a47ef5 Mon Sep 17 00:00:00 2001 From: Morton Fox Date: Tue, 26 May 2015 12:32:46 -0400 Subject: [PATCH 06/26] Update bq link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c76fbe7df9e6b..8623ee170d154 100644 --- a/README.md +++ b/README.md @@ -123,7 +123,7 @@ conda install pandas - xlrd >= 0.9.0 - [XlsxWriter](https://pypi.python.org/pypi/XlsxWriter) - Alternative Excel writer. 
-- [Google bq Command Line Tool](https://developers.google.com/bigquery/bq-command-line-tool/) +- [Google bq Command Line Tool](https://cloud.google.com/bigquery/bq-command-line-tool) - Needed for `pandas.io.gbq` - [boto](https://pypi.python.org/pypi/boto): necessary for Amazon S3 access. - One of the following combinations of libraries is needed to use the From 5df497e6d87de256127ce40bd8ec05a4b76e4f33 Mon Sep 17 00:00:00 2001 From: Artemy Kolchinsky Date: Tue, 26 May 2015 19:07:35 -0400 Subject: [PATCH 07/26] BUG: plot doesnt default to matplotlib axes.grid setting (#9792) Cleanup --- doc/source/whatsnew/v0.17.0.txt | 1 + pandas/tests/test_graphics.py | 44 +++++++++++++++++++++++++++++++++ pandas/tools/plotting.py | 2 +- 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index d7955d7210ade..a7917e81f7057 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -76,5 +76,6 @@ Bug Fixes - Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetics ( :issue:`9926`) - Bug in `Series.plot(label="LABEL")` not correctly setting the label (:issue:`10119`) +- Bug in `plot` not defaulting to matplotlib `axes.grid` setting (:issue:`9792`) diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 4c9d5a9207dd7..82f4b8c05ca06 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -439,6 +439,38 @@ def _check_box_return_type(self, returned, return_type, expected_keys=None, else: raise AssertionError + def _check_grid_settings(self, obj, kinds, kws={}): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 + + import matplotlib as mpl + + def is_grid_on(): + xoff = all(not g.gridOn for g in self.plt.gca().xaxis.get_major_ticks()) + yoff = all(not g.gridOn for g in self.plt.gca().yaxis.get_major_ticks()) + return not(xoff and yoff) + + spndx=1 + for kind in kinds: + 
self.plt.subplot(1,4*len(kinds),spndx); spndx+=1 + mpl.rc('axes',grid=False) + obj.plot(kind=kind, **kws) + self.assertFalse(is_grid_on()) + + self.plt.subplot(1,4*len(kinds),spndx); spndx+=1 + mpl.rc('axes',grid=True) + obj.plot(kind=kind, grid=False, **kws) + self.assertFalse(is_grid_on()) + + if kind != 'pie': + self.plt.subplot(1,4*len(kinds),spndx); spndx+=1 + mpl.rc('axes',grid=True) + obj.plot(kind=kind, **kws) + self.assertTrue(is_grid_on()) + + self.plt.subplot(1,4*len(kinds),spndx); spndx+=1 + mpl.rc('axes',grid=False) + obj.plot(kind=kind, grid=True, **kws) + self.assertTrue(is_grid_on()) @tm.mplskip class TestSeriesPlots(TestPlotBase): @@ -1108,6 +1140,12 @@ def test_table(self): _check_plot_works(self.series.plot, table=True) _check_plot_works(self.series.plot, table=self.series) + @slow + def test_series_grid_settings(self): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 + self._check_grid_settings(Series([1,2,3]), + plotting._series_kinds + plotting._common_kinds) + @tm.mplskip class TestDataFramePlots(TestPlotBase): @@ -3426,6 +3464,12 @@ def test_sharey_and_ax(self): "y label is invisible but shouldn't") + @slow + def test_df_grid_settings(self): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 + self._check_grid_settings(DataFrame({'a':[1,2,3],'b':[2,3,4]}), + plotting._dataframe_kinds, kws={'x':'a','y':'b'}) + @tm.mplskip class TestDataFrameGroupByPlots(TestPlotBase): diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 04dd4d3395684..76685e2589012 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -810,7 +810,7 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, self.rot = self._default_rot if grid is None: - grid = False if secondary_y else True + grid = False if secondary_y else self.plt.rcParams['axes.grid'] self.grid = grid self.legend = legend From cfe4d2606f13eb5f5f2d9523b32a088763746beb Mon Sep 17 00:00:00 2001 From: Thomas Grainger 
Date: Wed, 27 May 2015 17:09:28 +0100 Subject: [PATCH 08/26] Close mysql connection in TestXMySQL to prevent tests freezing --- pandas/io/tests/test_sql.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index fa7debeb228ce..9576f80696350 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -2195,6 +2195,13 @@ def setUp(self): "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") + def tearDown(self): + from pymysql.err import Error + try: + self.db.close() + except Error: + pass + def test_basic(self): _skip_if_no_pymysql() frame = tm.makeTimeDataFrame() From 0ad0e0c67652e64fa081bcb75d468e4e69f9b98f Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 29 Apr 2014 01:11:23 +0900 Subject: [PATCH 09/26] BUG: GroupBy.get_group raises ValueError when group key contains NaT --- doc/source/groupby.rst | 8 +++--- doc/source/whatsnew/v0.17.0.txt | 6 +++++ pandas/core/groupby.py | 6 ++++- pandas/src/generate_code.py | 4 ++- pandas/src/generated.pyx | 14 ++++++----- pandas/tests/test_groupby.py | 44 ++++++++++++++++++++++++++++++++- pandas/tests/test_index.py | 19 ++++++++++++++ 7 files changed, 88 insertions(+), 13 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 7ad2641dec52a..c9e18b585c764 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -784,11 +784,11 @@ will be (silently) dropped. Thus, this does not pose any problems: df.groupby('A').std() -NA group handling -~~~~~~~~~~~~~~~~~ +NA and NaT group handling +~~~~~~~~~~~~~~~~~~~~~~~~~ -If there are any NaN values in the grouping key, these will be automatically -excluded. So there will never be an "NA group". This was not the case in older +If there are any NaN or NaT values in the grouping key, these will be automatically +excluded. So there will never be an "NA group" or "NaT group". 
This was not the case in older versions of pandas, but users were generally discarding the NA group anyway (and supporting it was an implementation headache). diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index a7917e81f7057..5d4d149798d21 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -69,13 +69,19 @@ Bug Fixes - Bug in ``Timestamp``'s' ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties return ``np.int`` type, not built-in ``int``. (:issue:`10050`) - Bug in ``NaT`` raises ``AttributeError`` when accessing to ``daysinmonth``, ``dayofweek`` properties. (:issue:`10096`) + - Bug in getting timezone data with ``dateutil`` on various platforms ( :issue:`9059`, :issue:`8639`, :issue:`9663`, :issue:`10121`) - Bug in display datetimes with mixed frequencies uniformly; display 'ms' datetimes to the proper precision. (:issue:`10170`) + - Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetics ( :issue:`9926`) + - Bug in `Series.plot(label="LABEL")` not correctly setting the label (:issue:`10119`) - Bug in `plot` not defaulting to matplotlib `axes.grid` setting (:issue:`9792`) +- Bug in GroupBy.get_group raises ValueError when group key contains NaT (:issue:`6992`) + + diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ffc3e6a08221c..51674bad60f5b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -426,7 +426,11 @@ def convert(key, s): return Timestamp(key).asm8 return key - sample = next(iter(self.indices)) + if len(self.indices) > 0: + sample = next(iter(self.indices)) + else: + sample = None # Dummy sample + if isinstance(sample, tuple): if not isinstance(name, tuple): msg = ("must supply a tuple to get_group with multiple" diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index a0cdc0ff5e841..598cdff30e4f7 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py 
@@ -37,6 +37,8 @@ cimport util from util cimport is_array, _checknull, _checknan, get_nat +cimport lib +from lib cimport is_null_datetimelike cdef int64_t iNaT = get_nat() @@ -673,7 +675,7 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index 79722a26ebedc..428decd4dca10 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -28,6 +28,8 @@ ctypedef unsigned char UChar cimport util from util cimport is_array, _checknull, _checknan, get_nat +cimport lib +from lib cimport is_null_datetimelike cdef int64_t iNaT = get_nat() @@ -2096,7 +2098,7 @@ def groupby_float64(ndarray[float64_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2124,7 +2126,7 @@ def groupby_float32(ndarray[float32_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2152,7 +2154,7 @@ def groupby_object(ndarray[object] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2180,7 +2182,7 @@ def groupby_int32(ndarray[int32_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2208,7 +2210,7 @@ def groupby_int64(ndarray[int64_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] @@ -2236,7 +2238,7 @@ def groupby_bool(ndarray[uint8_t] index, ndarray labels): for i in range(length): key = util.get_value_1d(labels, 
i) - if _checknull(key): + if is_null_datetimelike(key): continue idx = index[i] diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index c308308603167..0789e20df3945 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -699,7 +699,6 @@ def test_get_group(self): expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1]) assert_panel_equal(gp, expected) - # GH 5267 # be datelike friendly df = DataFrame({'DATE' : pd.to_datetime(['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', @@ -2837,6 +2836,49 @@ def test_groupby_list_infer_array_like(self): result = df.groupby(['foo', 'bar']).mean() expected = df.groupby([df['foo'], df['bar']]).mean()[['val']] + def test_groupby_nat_exclude(self): + # GH 6992 + df = pd.DataFrame({'values': np.random.randn(8), + 'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp('2013-02-01'), + np.nan, pd.Timestamp('2013-02-01'), np.nan, pd.Timestamp('2013-01-01')], + 'str': [np.nan, 'a', np.nan, 'a', + np.nan, 'a', np.nan, 'b']}) + grouped = df.groupby('dt') + + expected = [[1, 7], [3, 5]] + keys = sorted(grouped.groups.keys()) + self.assertEqual(len(keys), 2) + for k, e in zip(keys, expected): + # grouped.groups keys are np.datetime64 with system tz + # not to be affected by tz, only compare values + self.assertEqual(grouped.groups[k], e) + + # confirm obj is not filtered + tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) + self.assertEqual(grouped.ngroups, 2) + expected = {Timestamp('2013-01-01 00:00:00'): np.array([1, 7]), + Timestamp('2013-02-01 00:00:00'): np.array([3, 5])} + for k in grouped.indices: + self.assert_numpy_array_equal(grouped.indices[k], expected[k]) + + tm.assert_frame_equal(grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]]) + tm.assert_frame_equal(grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]]) + + self.assertRaises(KeyError, grouped.get_group, pd.NaT) + + nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan], + 'nat': 
[pd.NaT, pd.NaT, pd.NaT]}) + self.assertEqual(nan_df['nan'].dtype, 'float64') + self.assertEqual(nan_df['nat'].dtype, 'datetime64[ns]') + + for key in ['nan', 'nat']: + grouped = nan_df.groupby(key) + self.assertEqual(grouped.groups, {}) + self.assertEqual(grouped.ngroups, 0) + self.assertEqual(grouped.indices, {}) + self.assertRaises(KeyError, grouped.get_group, np.nan) + self.assertRaises(KeyError, grouped.get_group, pd.NaT) + def test_dictify(self): dict(iter(self.df.groupby('A'))) dict(iter(self.df.groupby(['A', 'B']))) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 444aa2a0bab1e..93299292cf353 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1858,6 +1858,25 @@ def test_ufunc_compat(self): expected = Float64Index(np.sin(np.arange(5,dtype='int64'))) tm.assert_index_equal(result, expected) + def test_index_groupby(self): + int_idx = Index(range(6)) + float_idx = Index(np.arange(0, 0.6, 0.1)) + obj_idx = Index('A B C D E F'.split()) + dt_idx = pd.date_range('2013-01-01', freq='M', periods=6) + + for idx in [int_idx, float_idx, obj_idx, dt_idx]: + to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1]) + self.assertEqual(idx.groupby(to_groupby), + {1.0: [idx[0], idx[5]], 2.0: [idx[1], idx[4]]}) + + to_groupby = Index([datetime(2011, 11, 1), datetime(2011, 12, 1), + pd.NaT, pd.NaT, + datetime(2011, 12, 1), datetime(2011, 11, 1)], tz='UTC').values + + ex_keys = pd.tslib.datetime_to_datetime64(np.array([Timestamp('2011-11-01'), Timestamp('2011-12-01')])) + expected = {ex_keys[0][0]: [idx[0], idx[5]], ex_keys[0][1]: [idx[1], idx[4]]} + self.assertEqual(idx.groupby(to_groupby), expected) + class TestFloat64Index(Numeric, tm.TestCase): _holder = Float64Index From 186b20d66e3a3e548382f7edf8678dde045ee245 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 5 May 2015 12:34:38 +0900 Subject: [PATCH 10/26] BUG: Series.align resets name when fill_value is specified --- doc/source/whatsnew/v0.17.0.txt | 5 +++++ 
pandas/core/generic.py | 21 ++++++++------------- pandas/tests/test_series.py | 7 +++++++ 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index a7917e81f7057..1658d79ae3532 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -73,9 +73,14 @@ Bug Fixes - Bug in display datetimes with mixed frequencies uniformly; display 'ms' datetimes to the proper precision. (:issue:`10170`) + - Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetics ( :issue:`9926`) - Bug in `Series.plot(label="LABEL")` not correctly setting the label (:issue:`10119`) + - Bug in `plot` not defaulting to matplotlib `axes.grid` setting (:issue:`9792`) +- Bug in ``Series.align`` resets ``name`` when ``fill_value`` is specified (:issue:`10067`) + + diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b747f0a2ceacb..d6c7d87bb25b1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3365,11 +3365,10 @@ def _align_series(self, other, join='outer', axis=None, level=None, level=level, return_indexers=True) - left_result = self._reindex_indexer(join_index, lidx, copy) - right_result = other._reindex_indexer(join_index, ridx, copy) + left = self._reindex_indexer(join_index, lidx, copy) + right = other._reindex_indexer(join_index, ridx, copy) else: - # one has > 1 ndim fdata = self._data if axis == 0: @@ -3399,23 +3398,19 @@ def _align_series(self, other, join='outer', axis=None, level=None, if copy and fdata is self._data: fdata = fdata.copy() - left_result = DataFrame(fdata) + left = DataFrame(fdata) if ridx is None: - right_result = other + right = other else: - right_result = other.reindex(join_index, level=level) + right = other.reindex(join_index, level=level) # fill fill_na = notnull(fill_value) or (method is not None) if fill_na: - return (left_result.fillna(fill_value, method=method, limit=limit, - axis=fill_axis), - 
right_result.fillna(fill_value, method=method, - limit=limit)) - else: - return (left_result.__finalize__(self), - right_result.__finalize__(other)) + left = left.fillna(fill_value, method=method, limit=limit, axis=fill_axis) + right = right.fillna(fill_value, method=method, limit=limit) + return (left.__finalize__(self), right.__finalize__(other)) _shared_docs['where'] = (""" Return an object of same shape as self and whose corresponding diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 925cfa875196c..5a5b5fa2b226b 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -5932,6 +5932,10 @@ def _check_align(a, b, how='left', fill=None): assert_series_equal(aa, ea) assert_series_equal(ab, eb) + self.assertEqual(aa.name, 'ts') + self.assertEqual(ea.name, 'ts') + self.assertEqual(ab.name, 'ts') + self.assertEqual(eb.name, 'ts') for kind in JOIN_TYPES: _check_align(self.ts[2:], self.ts[:-5], how=kind) @@ -5939,12 +5943,15 @@ def _check_align(a, b, how='left', fill=None): # empty left _check_align(self.ts[:0], self.ts[:-5], how=kind) + _check_align(self.ts[:0], self.ts[:-5], how=kind, fill=-1) # empty right _check_align(self.ts[:-5], self.ts[:0], how=kind) + _check_align(self.ts[:-5], self.ts[:0], how=kind, fill=-1) # both empty _check_align(self.ts[:0], self.ts[:0], how=kind) + _check_align(self.ts[:0], self.ts[:0], how=kind, fill=-1) def test_align_fill_method(self): def _check_align(a, b, how='left', method='pad', limit=None): From 1400bfa2b4347914f56080c7bb4e388a02cf340c Mon Sep 17 00:00:00 2001 From: austinc Date: Thu, 5 Feb 2015 18:37:24 -0500 Subject: [PATCH 11/26] ENH: Don't infer WOM-5MON if we don't support it (#9425) --- doc/source/whatsnew/v0.17.0.txt | 1 + pandas/tseries/frequencies.py | 4 +++- pandas/tseries/tests/test_frequencies.py | 10 ++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 
9b87c6c1332ab..93c34898a394d 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -89,3 +89,4 @@ Bug Fixes - Bug in GroupBy.get_group raises ValueError when group key contains NaT (:issue:`6992`) +- Bug where infer_freq infers timerule (WOM-5XXX) unsupported by to_offset (:issue:`9425`) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index d0d71c63183fa..4af8c68110978 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -935,7 +935,9 @@ def _get_wom_rule(self): return None week_of_months = unique((self.index.day - 1) // 7) - if len(week_of_months) > 1: + # Only attempt to infer up to WOM-4. See #9425 + week_of_months = week_of_months[week_of_months < 4] + if len(week_of_months) == 0 or len(week_of_months) > 1: return None # get which week diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index 2f2d249539b81..823c762c692e5 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -212,6 +212,16 @@ def test_week_of_month(self): for i in range(1, 5): self._check_generated_range('1/1/2000', 'WOM-%d%s' % (i, day)) + def test_fifth_week_of_month(self): + # Only supports freq up to WOM-4. See #9425 + func = lambda: date_range('2014-01-01', freq='WOM-5MON') + self.assertRaises(ValueError, func) + + def test_fifth_week_of_month_infer(self): + # Only attempts to infer up to WOM-4. 
See #9425 + index = DatetimeIndex(["2014-03-31", "2014-06-30", "2015-03-30"]) + assert frequencies.infer_freq(index) is None + def test_week_of_month_fake(self): #All of these dates are on same day of week and are 4 or 5 weeks apart index = DatetimeIndex(["2013-08-27","2013-10-01","2013-10-29","2013-11-26"]) From a7a02f45fd459f19e2e5b4553d42578e94fd2f54 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 31 May 2015 05:27:09 +0900 Subject: [PATCH 12/26] BUG: Series arithmetic methods incorrectly hold name --- doc/source/whatsnew/v0.17.0.txt | 2 +- pandas/core/series.py | 7 ++++++- pandas/tests/test_series.py | 19 ++++++++++++++++++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 9b87c6c1332ab..842d7cf84e05a 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -73,7 +73,7 @@ Bug Fixes - Bug in getting timezone data with ``dateutil`` on various platforms ( :issue:`9059`, :issue:`8639`, :issue:`9663`, :issue:`10121`) - Bug in display datetimes with mixed frequencies uniformly; display 'ms' datetimes to the proper precision. 
(:issue:`10170`) - +- Bug in ``Series`` arithmetic methods may incorrectly hold names (:issue:`10068`) - Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetics ( :issue:`9926`) diff --git a/pandas/core/series.py b/pandas/core/series.py index 6367fb4fe0396..c54bd96f64c73 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1508,7 +1508,12 @@ def _binop(self, other, func, level=None, fill_value=None): result = func(this_vals, other_vals) name = _maybe_match_name(self, other) - return self._constructor(result, index=new_index).__finalize__(self) + result = self._constructor(result, index=new_index, name=name) + result = result.__finalize__(self) + if name is None: + # When name is None, __finalize__ overwrites current name + result.name = None + return result def combine(self, other, func, fill_value=nan): """ diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 5a5b5fa2b226b..bbe942e607faf 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -264,10 +264,11 @@ def test_tab_completion(self): self.assertTrue('dt' not in dir(s)) def test_binop_maybe_preserve_name(self): - # names match, preserve result = self.ts * self.ts self.assertEqual(result.name, self.ts.name) + result = self.ts.mul(self.ts) + self.assertEqual(result.name, self.ts.name) result = self.ts * self.ts[:-2] self.assertEqual(result.name, self.ts.name) @@ -277,6 +278,22 @@ def test_binop_maybe_preserve_name(self): cp.name = 'something else' result = self.ts + cp self.assertIsNone(result.name) + result = self.ts.add(cp) + self.assertIsNone(result.name) + + ops = ['add', 'sub', 'mul', 'div', 'truediv', 'floordiv', 'mod', 'pow'] + ops = ops + ['r' + op for op in ops] + for op in ops: + # names match, preserve + s = self.ts.copy() + result = getattr(s, op)(s) + self.assertEqual(result.name, self.ts.name) + + # names don't match, don't preserve + cp = self.ts.copy() + cp.name = 'changed' + result = getattr(s, 
op)(cp) + self.assertIsNone(result.name) def test_combine_first_name(self): result = self.ts.combine_first(self.ts[:5]) From b66dda85ec84fe4da02c55f32f1747191e76d0df Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 31 May 2015 05:06:51 +0900 Subject: [PATCH 13/26] BUG: SparseSeries.abs() resets name --- doc/source/whatsnew/v0.17.0.txt | 2 +- pandas/sparse/series.py | 2 +- pandas/sparse/tests/test_sparse.py | 15 +++++++++++++++ pandas/tests/test_panel.py | 2 ++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 9b87c6c1332ab..dd151682339e1 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -83,7 +83,7 @@ Bug Fixes - Bug in `plot` not defaulting to matplotlib `axes.grid` setting (:issue:`9792`) - Bug in ``Series.align`` resets ``name`` when ``fill_value`` is specified (:issue:`10067`) - +- Bug in ``SparseSeries.abs`` resets ``name`` (:issue:`10241`) - Bug in GroupBy.get_group raises ValueError when group key contains NaT (:issue:`6992`) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 2c328e51b5090..f53cc66bee961 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -399,7 +399,7 @@ def abs(self): res_sp_values = np.abs(self.sp_values) return self._constructor(res_sp_values, index=self.index, sparse_index=self.sp_index, - fill_value=self.fill_value) + fill_value=self.fill_value).__finalize__(self) def get(self, label, default=None): """ diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index dd1d10f3d15ed..a7a78ba226a0b 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -509,6 +509,21 @@ def _check_inplace_op(iop, op): _check_inplace_op( getattr(operator, "i%s" % op), getattr(operator, op)) + def test_abs(self): + s = SparseSeries([1, 2, -3], name='x') + expected = SparseSeries([1, 2, 3], name='x') + result = s.abs() + 
assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + + result = abs(s) + assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + + result = np.abs(s) + assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + def test_reindex(self): def _compare_with_series(sps, new_index): spsre = sps.reindex(new_index) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index d7d83887298b1..57fd465993e14 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -404,6 +404,8 @@ def test_abs(self): expected = np.abs(s) assert_series_equal(result, expected) assert_series_equal(result2, expected) + self.assertEqual(result.name, 'A') + self.assertEqual(result2.name, 'A') class CheckIndexing(object): From 452ea5f80ca2c65ba7dc3457314e56f2c9d73b43 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 16 May 2015 05:55:11 +0900 Subject: [PATCH 14/26] BUG: Index.union cannot handle array-likes --- doc/source/whatsnew/v0.17.0.txt | 2 + pandas/core/index.py | 84 +++++++-------- pandas/tests/test_index.py | 184 +++++++++++++++++++++++++++++--- pandas/tseries/index.py | 2 + pandas/tseries/period.py | 2 + pandas/tseries/tdi.py | 7 +- 6 files changed, 220 insertions(+), 61 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index dae1342c3cd76..bebce2d3e2d87 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -67,6 +67,7 @@ Bug Fixes - Bug in ``mean()`` where integer dtypes can overflow (:issue:`10172`) - Bug where Panel.from_dict does not set dtype when specified (:issue:`10058`) +- Bug in ``Index.union`` raises ``AttributeError`` when passing array-likes. (:issue:`10149`) - Bug in ``Timestamp``'s' ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties return ``np.int`` type, not built-in ``int``. 
(:issue:`10050`) - Bug in ``NaT`` raises ``AttributeError`` when accessing to ``daysinmonth``, ``dayofweek`` properties. (:issue:`10096`) @@ -91,3 +92,4 @@ Bug Fixes - Bug where infer_freq infers timerule (WOM-5XXX) unsupported by to_offset (:issue:`9425`) + diff --git a/pandas/core/index.py b/pandas/core/index.py index de30fee4009f4..2bd96fcec2e42 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -580,8 +580,18 @@ def to_datetime(self, dayfirst=False): return DatetimeIndex(self.values) def _assert_can_do_setop(self, other): + if not com.is_list_like(other): + raise TypeError('Input must be Index or array-like') return True + def _convert_can_do_setop(self, other): + if not isinstance(other, Index): + other = Index(other, name=self.name) + result_name = self.name + else: + result_name = self.name if self.name == other.name else None + return other, result_name + @property def nlevels(self): return 1 @@ -1364,16 +1374,14 @@ def union(self, other): ------- union : Index """ - if not hasattr(other, '__iter__'): - raise TypeError('Input must be iterable.') + self._assert_can_do_setop(other) + other = _ensure_index(other) if len(other) == 0 or self.equals(other): return self if len(self) == 0: - return _ensure_index(other) - - self._assert_can_do_setop(other) + return other if not is_dtype_equal(self.dtype,other.dtype): this = self.astype('O') @@ -1439,11 +1447,7 @@ def intersection(self, other): ------- intersection : Index """ - if not hasattr(other, '__iter__'): - raise TypeError('Input must be iterable!') - self._assert_can_do_setop(other) - other = _ensure_index(other) if self.equals(other): @@ -1492,18 +1496,12 @@ def difference(self, other): >>> index.difference(index2) """ - - if not hasattr(other, '__iter__'): - raise TypeError('Input must be iterable!') + self._assert_can_do_setop(other) if self.equals(other): return Index([], name=self.name) - if not isinstance(other, Index): - other = np.asarray(other) - result_name = self.name - else: - 
result_name = self.name if self.name == other.name else None + other, result_name = self._convert_can_do_setop(other) theDiff = sorted(set(self) - set(other)) return Index(theDiff, name=result_name) @@ -1517,7 +1515,7 @@ def sym_diff(self, other, result_name=None): Parameters ---------- - other : array-like + other : Index or array-like result_name : str Returns @@ -1545,13 +1543,10 @@ def sym_diff(self, other, result_name=None): >>> idx1 ^ idx2 Int64Index([1, 5], dtype='int64') """ - if not hasattr(other, '__iter__'): - raise TypeError('Input must be iterable!') - - if not isinstance(other, Index): - other = Index(other) - result_name = result_name or self.name - + self._assert_can_do_setop(other) + other, result_name_update = self._convert_can_do_setop(other) + if result_name is None: + result_name = result_name_update the_diff = sorted(set((self.difference(other)).union(other.difference(self)))) return Index(the_diff, name=result_name) @@ -5460,12 +5455,11 @@ def union(self, other): >>> index.union(index2) """ self._assert_can_do_setop(other) + other, result_names = self._convert_can_do_setop(other) if len(other) == 0 or self.equals(other): return self - result_names = self.names if self.names == other.names else None - uniq_tuples = lib.fast_unique_multiple([self.values, other.values]) return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) @@ -5483,12 +5477,11 @@ def intersection(self, other): Index """ self._assert_can_do_setop(other) + other, result_names = self._convert_can_do_setop(other) if self.equals(other): return self - result_names = self.names if self.names == other.names else None - self_tuples = self.values other_tuples = other.values uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) @@ -5509,18 +5502,10 @@ def difference(self, other): diff : MultiIndex """ self._assert_can_do_setop(other) + other, result_names = self._convert_can_do_setop(other) - if not isinstance(other, MultiIndex): - if len(other) == 0: + if 
len(other) == 0: return self - try: - other = MultiIndex.from_tuples(other) - except: - raise TypeError('other must be a MultiIndex or a list of' - ' tuples') - result_names = self.names - else: - result_names = self.names if self.names == other.names else None if self.equals(other): return MultiIndex(levels=[[]] * self.nlevels, @@ -5537,15 +5522,30 @@ def difference(self, other): return MultiIndex.from_tuples(difference, sortorder=0, names=result_names) - def _assert_can_do_setop(self, other): - pass - def astype(self, dtype): if not is_object_dtype(np.dtype(dtype)): raise TypeError('Setting %s dtype to anything other than object ' 'is not supported' % self.__class__) return self._shallow_copy() + def _convert_can_do_setop(self, other): + result_names = self.names + + if not hasattr(other, 'names'): + if len(other) == 0: + other = MultiIndex(levels=[[]] * self.nlevels, + labels=[[]] * self.nlevels, + verify_integrity=False) + else: + msg = 'other must be a MultiIndex or a list of tuples' + try: + other = MultiIndex.from_tuples(other) + except: + raise TypeError(msg) + else: + result_names = self.names if self.names == other.names else None + return other, result_names + def insert(self, loc, item): """ Make new MultiIndex inserting new item at location diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 93299292cf353..ed84c9764dd84 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -251,6 +251,136 @@ def test_take(self): expected = ind[indexer] self.assertTrue(result.equals(expected)) + def test_setops_errorcases(self): + for name, idx in compat.iteritems(self.indices): + # # non-iterable input + cases = [0.5, 'xxx'] + methods = [idx.intersection, idx.union, idx.difference, idx.sym_diff] + + for method in methods: + for case in cases: + assertRaisesRegexp(TypeError, + "Input must be Index or array-like", + method, case) + + def test_intersection_base(self): + for name, idx in compat.iteritems(self.indices): + first = 
idx[:5] + second = idx[:3] + intersect = first.intersection(second) + + if isinstance(idx, CategoricalIndex): + pass + else: + self.assertTrue(tm.equalContents(intersect, second)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if isinstance(idx, PeriodIndex): + msg = "can only call with other PeriodIndex-ed objects" + with tm.assertRaisesRegexp(ValueError, msg): + result = first.intersection(case) + elif isinstance(idx, CategoricalIndex): + pass + else: + result = first.intersection(case) + self.assertTrue(tm.equalContents(result, second)) + + if isinstance(idx, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with tm.assertRaisesRegexp(TypeError, msg): + result = first.intersection([1, 2, 3]) + + def test_union_base(self): + for name, idx in compat.iteritems(self.indices): + first = idx[3:] + second = idx[:5] + everything = idx + union = first.union(second) + self.assertTrue(tm.equalContents(union, everything)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if isinstance(idx, PeriodIndex): + msg = "can only call with other PeriodIndex-ed objects" + with tm.assertRaisesRegexp(ValueError, msg): + result = first.union(case) + elif isinstance(idx, CategoricalIndex): + pass + else: + result = first.union(case) + self.assertTrue(tm.equalContents(result, everything)) + + if isinstance(idx, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with tm.assertRaisesRegexp(TypeError, msg): + result = first.union([1, 2, 3]) + + def test_difference_base(self): + for name, idx in compat.iteritems(self.indices): + first = idx[2:] + second = idx[:4] + answer = idx[4:] + result = first.difference(second) + + if isinstance(idx, CategoricalIndex): + pass + else: + self.assertTrue(tm.equalContents(result, answer)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + 
if isinstance(idx, PeriodIndex): + msg = "can only call with other PeriodIndex-ed objects" + with tm.assertRaisesRegexp(ValueError, msg): + result = first.difference(case) + elif isinstance(idx, CategoricalIndex): + pass + elif isinstance(idx, (DatetimeIndex, TimedeltaIndex)): + self.assertEqual(result.__class__, answer.__class__) + self.assert_numpy_array_equal(result.asi8, answer.asi8) + else: + result = first.difference(case) + self.assertTrue(tm.equalContents(result, answer)) + + if isinstance(idx, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with tm.assertRaisesRegexp(TypeError, msg): + result = first.difference([1, 2, 3]) + + def test_symmetric_diff(self): + for name, idx in compat.iteritems(self.indices): + first = idx[1:] + second = idx[:-1] + if isinstance(idx, CategoricalIndex): + pass + else: + answer = idx[[0, -1]] + result = first.sym_diff(second) + self.assertTrue(tm.equalContents(result, answer)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if isinstance(idx, PeriodIndex): + msg = "can only call with other PeriodIndex-ed objects" + with tm.assertRaisesRegexp(ValueError, msg): + result = first.sym_diff(case) + elif isinstance(idx, CategoricalIndex): + pass + else: + result = first.sym_diff(case) + self.assertTrue(tm.equalContents(result, answer)) + + if isinstance(idx, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with tm.assertRaisesRegexp(TypeError, msg): + result = first.sym_diff([1, 2, 3]) + + class TestIndex(Base, tm.TestCase): _holder = Index _multiprocess_can_split_ = True @@ -620,16 +750,12 @@ def test_intersection(self): first = self.strIndex[:20] second = self.strIndex[:10] intersect = first.intersection(second) - self.assertTrue(tm.equalContents(intersect, second)) # Corner cases inter = first.intersection(first) self.assertIs(inter, first) - # non-iterable input - assertRaisesRegexp(TypeError, "iterable", first.intersection, 
0.5) - idx1 = Index([1, 2, 3, 4, 5], name='idx') # if target has the same name, it is preserved idx2 = Index([3, 4, 5, 6, 7], name='idx') @@ -671,6 +797,12 @@ def test_union(self): union = first.union(second) self.assertTrue(tm.equalContents(union, everything)) + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.union(case) + self.assertTrue(tm.equalContents(result, everything)) + # Corner cases union = first.union(first) self.assertIs(union, first) @@ -681,9 +813,6 @@ def test_union(self): union = Index([]).union(first) self.assertIs(union, first) - # non-iterable input - assertRaisesRegexp(TypeError, "iterable", first.union, 0.5) - # preserve names first.name = 'A' second.name = 'A' @@ -792,11 +921,7 @@ def test_difference(self): self.assertEqual(len(result), 0) self.assertEqual(result.name, first.name) - # non-iterable input - assertRaisesRegexp(TypeError, "iterable", first.difference, 0.5) - def test_symmetric_diff(self): - # smoke idx1 = Index([1, 2, 3, 4], name='idx1') idx2 = Index([2, 3, 4, 5]) @@ -842,10 +967,6 @@ def test_symmetric_diff(self): self.assertTrue(tm.equalContents(result, expected)) self.assertEqual(result.name, 'new_name') - # other isn't iterable - with tm.assertRaises(TypeError): - Index(idx1,dtype='object').difference(1) - def test_is_numeric(self): self.assertFalse(self.dateIndex.is_numeric()) self.assertFalse(self.strIndex.is_numeric()) @@ -1786,6 +1907,7 @@ def test_equals(self): self.assertFalse(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]).equals(list('aabca'))) self.assertTrue(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]).equals(list('aabca') + [np.nan])) + class Numeric(Base): def test_numeric_compat(self): @@ -2661,6 +2783,36 @@ def test_time_overflow_for_32bit_machines(self): idx2 = pd.date_range(end='2000', periods=periods, freq='S') self.assertEqual(len(idx2), periods) + def test_intersection(self): + 
first = self.index + second = self.index[5:] + intersect = first.intersection(second) + self.assertTrue(tm.equalContents(intersect, second)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.intersection(case) + self.assertTrue(tm.equalContents(result, second)) + + third = Index(['a', 'b', 'c']) + result = first.intersection(third) + expected = pd.Index([], dtype=object) + self.assert_index_equal(result, expected) + + def test_union(self): + first = self.index[:5] + second = self.index[5:] + everything = self.index + union = first.union(second) + self.assertTrue(tm.equalContents(union, everything)) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.union(case) + self.assertTrue(tm.equalContents(result, everything)) + class TestPeriodIndex(DatetimeLike, tm.TestCase): _holder = PeriodIndex @@ -2671,7 +2823,7 @@ def setUp(self): self.setup_indices() def create_index(self): - return period_range('20130101',periods=5,freq='D') + return period_range('20130101', periods=5, freq='D') def test_pickle_compat_construction(self): pass diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index bd0869b9525b7..745c536914e47 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -804,6 +804,7 @@ def union(self, other): ------- y : Index or DatetimeIndex """ + self._assert_can_do_setop(other) if not isinstance(other, DatetimeIndex): try: other = DatetimeIndex(other) @@ -1039,6 +1040,7 @@ def intersection(self, other): ------- y : Index or DatetimeIndex """ + self._assert_can_do_setop(other) if not isinstance(other, DatetimeIndex): try: other = DatetimeIndex(other) diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 510887a185054..6627047f0c335 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -679,6 +679,8 @@ def join(self, other, how='left', level=None, 
return_indexers=False): return self._apply_meta(result) def _assert_can_do_setop(self, other): + super(PeriodIndex, self)._assert_can_do_setop(other) + if not isinstance(other, PeriodIndex): raise ValueError('can only call with other PeriodIndex-ed objects') diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 1443c22909689..de68dd763d68c 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -436,12 +436,12 @@ def union(self, other): ------- y : Index or TimedeltaIndex """ - if _is_convertible_to_index(other): + self._assert_can_do_setop(other) + if not isinstance(other, TimedeltaIndex): try: other = TimedeltaIndex(other) - except TypeError: + except (TypeError, ValueError): pass - this, other = self, other if this._can_fast_union(other): @@ -581,6 +581,7 @@ def intersection(self, other): ------- y : Index or TimedeltaIndex """ + self._assert_can_do_setop(other) if not isinstance(other, TimedeltaIndex): try: other = TimedeltaIndex(other) From d513986e75110b14e288753701e435660f3d115e Mon Sep 17 00:00:00 2001 From: Bernard Willers Date: Thu, 28 May 2015 21:07:35 -0400 Subject: [PATCH 15/26] BUG: Holiday(..) with both offset and observance raises NotImplementedError #10217 GH10217 --- doc/source/whatsnew/v0.17.0.txt | 2 ++ pandas/tests/test_tseries.py | 13 +++++++++++++ pandas/tseries/holiday.py | 3 +++ 3 files changed, 18 insertions(+) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index dae1342c3cd76..95d9fd3a3b806 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -38,6 +38,8 @@ Backwards incompatible API changes Other API Changes ^^^^^^^^^^^^^^^^^ +- ``Holiday`` now raises ``NotImplementedError`` if both ``offset`` and ``observance`` are used in constructor. (:issue:`10217`) + .. 
_whatsnew_0170.deprecations: Deprecations diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py index 1b796ed2d83d1..035b3ac07342d 100644 --- a/pandas/tests/test_tseries.py +++ b/pandas/tests/test_tseries.py @@ -9,6 +9,8 @@ import pandas.lib as lib import pandas._period as period import pandas.algos as algos +from pandas.tseries.holiday import Holiday, SA, next_monday +from pandas import DateOffset class TestTseriesUtil(tm.TestCase): @@ -737,6 +739,17 @@ def test_get_period_field_raises_on_out_of_range(self): def test_get_period_field_array_raises_on_out_of_range(self): self.assertRaises(ValueError, period.get_period_field_arr, -1, np.empty(1), 0) + +class TestHolidayConflictingArguments(tm.TestCase): + + # GH 10217 + + def test_both_offset_observance_raises(self): + + with self.assertRaises(NotImplementedError) as cm: + h = Holiday("Cyber Monday", month=11, day=1, + offset=[DateOffset(weekday=SA(4))], observance=next_monday) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 799be98a329fa..f55569302ca05 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -148,6 +148,9 @@ class from pandas.tseries.offsets >>> July3rd = Holiday('July 3rd', month=7, day=3, days_of_week=(0, 1, 2, 3)) """ + if offset is not None and observance is not None: + raise NotImplementedError("Cannot use both offset and observance.") + self.name = name self.year = year self.month = month From caeafb7702eb5950be7b0107790e0966143eeaad Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 2 Jun 2015 15:42:43 -0400 Subject: [PATCH 16/26] DOC: move whatsnew from 0.17.0 -> 0.16.2 --- doc/source/whatsnew.rst | 2 +- .../whatsnew/{v0.17.0.txt => v0.16.2.txt} | 40 +++++++------------ 2 files changed, 15 insertions(+), 27 deletions(-) rename doc/source/whatsnew/{v0.17.0.txt => v0.16.2.txt} (72%) diff --git a/doc/source/whatsnew.rst 
b/doc/source/whatsnew.rst index 24cee99a5d072..c8e32ac2a3309 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,7 +18,7 @@ What's New These are new features and improvements of note in each release. -.. include:: whatsnew/v0.17.0.txt +.. include:: whatsnew/v0.16.2.txt .. include:: whatsnew/v0.16.1.txt diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.16.2.txt similarity index 72% rename from doc/source/whatsnew/v0.17.0.txt rename to doc/source/whatsnew/v0.16.2.txt index feaaad179ff54..b571aab0b19a5 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -1,63 +1,52 @@ -.. _whatsnew_0170: +.. _whatsnew_0162: -v0.17.0 (July ??, 2015) +v0.16.2 (June 12, 2015) ----------------------- -This is a major release from 0.16.1 and includes a small number of API changes, several new features, -enhancements, and performance improvements along with a large number of bug fixes. We recommend that all -users upgrade to this version. +This is a minor bug-fix release from 0.16.1 and includes a large number of +bug fixes along with several new features, enhancements, and performance improvements. +We recommend that all users upgrade to this version. Highlights include: +Check the :ref:`API Changes ` before updating. -Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. - -.. contents:: What's new in v0.17.0 +.. contents:: What's new in v0.16.2 :local: :backlinks: none -.. _whatsnew_0170.enhancements: +.. _whatsnew_0162.enhancements: New features ~~~~~~~~~~~~ -.. _whatsnew_0170.enhancements.other: +.. _whatsnew_0162.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ -.. _whatsnew_0170.api: +.. _whatsnew_0162.api: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _whatsnew_0170.api_breaking: +.. _whatsnew_0162.api_breaking: -.. _whatsnew_0170.api_breaking.other: +.. 
_whatsnew_0162.api_breaking.other: Other API Changes ^^^^^^^^^^^^^^^^^ - ``Holiday`` now raises ``NotImplementedError`` if both ``offset`` and ``observance`` are used in constructor. (:issue:`10217`) -.. _whatsnew_0170.deprecations: - -Deprecations -^^^^^^^^^^^^ - -.. _whatsnew_0170.prior_deprecations: - -Removal of prior version deprecations/changes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. _whatsnew_0170.performance: +.. _whatsnew_0162.performance: Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved ``Series.resample`` performance with dtype=datetime64[ns] (:issue:`7754`) -.. _whatsnew_0170.bug_fixes: +.. _whatsnew_0162.bug_fixes: Bug Fixes ~~~~~~~~~ @@ -94,4 +83,3 @@ Bug Fixes - Bug where infer_freq infers timerule (WOM-5XXX) unsupported by to_offset (:issue:`9425`) - From 676cb9541be1b09e5a719caee5ea789180cfaff1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 2 Jun 2015 17:01:46 -0400 Subject: [PATCH 17/26] DOC: add in whatsnew/0.17.0.txt --- doc/source/whatsnew/v0.17.0.txt | 59 +++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 doc/source/whatsnew/v0.17.0.txt diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt new file mode 100644 index 0000000000000..87a9d197bd0d1 --- /dev/null +++ b/doc/source/whatsnew/v0.17.0.txt @@ -0,0 +1,59 @@ +.. _whatsnew_0170: + +v0.17.0 (July 31, 2015) +----------------------- + +This is a major release from 0.16.2 and includes a small number of API changes, several new features, +enhancements, and performance improvements along with a large number of bug fixes. We recommend that all +users upgrade to this version. + +Highlights include: + + +Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. + +.. contents:: What's new in v0.17.0 + :local: + :backlinks: none + +.. _whatsnew_0170.enhancements: + +New features +~~~~~~~~~~~~ + +.. _whatsnew_0170.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +.. 
_whatsnew_0170.api: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _whatsnew_0170.api_breaking: + +.. _whatsnew_0170.api_breaking.other: + +Other API Changes +^^^^^^^^^^^^^^^^^ + +.. _whatsnew_0170.deprecations: + +Deprecations +^^^^^^^^^^^^ + +.. _whatsnew_0170.prior_deprecations: + +Removal of prior version deprecations/changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. _whatsnew_0170.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _whatsnew_0170.bug_fixes: + +Bug Fixes +~~~~~~~~~ From c8bf1c4694014eb23fc4e0c03b7d468be60b65e6 Mon Sep 17 00:00:00 2001 From: Mortada Mehyar Date: Tue, 2 Jun 2015 14:19:18 -0700 Subject: [PATCH 18/26] ENH: groupby.apply for Categorical should preserve categories (closes #10138) --- doc/source/whatsnew/v0.16.2.txt | 2 +- pandas/core/groupby.py | 3 ++- pandas/tests/test_groupby.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index b571aab0b19a5..1a8fc90b9683f 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -55,7 +55,7 @@ Bug Fixes multi-indexed (:issue:`7212`) - Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`) - +- Bug in groupby.apply aggregation for Categorical not preserving categories (:issue:`10138`) - Bug in ``mean()`` where integer dtypes can overflow (:issue:`10172`) - Bug where Panel.from_dict does not set dtype when specified (:issue:`10058`) - Bug in ``Index.union`` raises ``AttributeError`` when passing array-likes. 
(:issue:`10149`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 51674bad60f5b..4abdd1112c721 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2944,7 +2944,8 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): cd = 'coerce' else: cd = True - return result.convert_objects(convert_dates=cd) + result = result.convert_objects(convert_dates=cd) + return self._reindex_output(result) else: # only coerce dates if we find at least 1 datetime diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 0789e20df3945..ab78bd63a7c94 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2595,6 +2595,35 @@ def get_stats(group): result = self.df.groupby(cats).D.apply(get_stats) self.assertEqual(result.index.names[0], 'C') + def test_apply_categorical_data(self): + # GH 10138 + for ordered in [True, False]: + dense = Categorical(list('abc'), ordered=ordered) + # 'b' is in the categories but not in the list + missing = Categorical(list('aaa'), categories=['a', 'b'], ordered=ordered) + values = np.arange(len(dense)) + df = DataFrame({'missing': missing, + 'dense': dense, + 'values': values}) + grouped = df.groupby(['missing', 'dense']) + + # missing category 'b' should still exist in the output index + idx = MultiIndex.from_product([['a', 'b'], ['a', 'b', 'c']], + names=['missing', 'dense']) + expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], + index=idx, + columns=['values']) + + assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) + assert_frame_equal(grouped.mean(), expected) + assert_frame_equal(grouped.agg(np.mean), expected) + + # but for transform we should still get back the original index + idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], + names=['missing', 'dense']) + expected = Series(1, index=idx) + assert_series_equal(grouped.apply(lambda x: 1), expected) + def test_apply_corner_cases(self): # #535, can't use sliding iterator From 
f8e7c93938eca1fa59c74131dfe3a137cc2a4d13 Mon Sep 17 00:00:00 2001 From: rekcahpassyla <0xdeadcafebeef@gmail.com> Date: Fri, 22 May 2015 10:06:46 +0100 Subject: [PATCH 19/26] BUG: Raise TypeError only if key DataFrame is not empty #10126 --- doc/source/whatsnew/v0.16.2.txt | 2 ++ pandas/core/frame.py | 2 +- pandas/tests/test_frame.py | 17 +++++++++++++++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index b571aab0b19a5..b409ea89a8032 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -83,3 +83,5 @@ Bug Fixes - Bug where infer_freq infers timerule (WOM-5XXX) unsupported by to_offset (:issue:`9425`) + +- Bug to handle masking empty ``DataFrame``(:issue:`10126`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f36108262432d..ab6f11a4b8d5b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2151,7 +2151,7 @@ def _setitem_array(self, key, value): def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. 
# df[df > df2] = 0 - if key.values.dtype != np.bool_: + if key.values.size and not com.is_bool_dtype(key.values): raise TypeError('Must pass DataFrame with boolean values only') self._check_inplace_setting(value) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 4964d13f7ac28..f74cb07557342 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -794,6 +794,19 @@ def test_setitem_empty(self): result.loc[result.b.isnull(), 'a'] = result.a assert_frame_equal(result, df) + def test_setitem_empty_frame_with_boolean(self): + # Test for issue #10126 + + for dtype in ('float', 'int64'): + for df in [ + pd.DataFrame(dtype=dtype), + pd.DataFrame(dtype=dtype, index=[1]), + pd.DataFrame(dtype=dtype, columns=['A']), + ]: + df2 = df.copy() + df[df > df2] = 47 + assert_frame_equal(df, df2) + def test_delitem_corner(self): f = self.frame.copy() del f['D'] @@ -2821,7 +2834,7 @@ def custom_frame_function(self): data = {'col1': range(10), 'col2': range(10)} cdf = CustomDataFrame(data) - + # Did we get back our own DF class? self.assertTrue(isinstance(cdf, CustomDataFrame)) @@ -2833,7 +2846,7 @@ def custom_frame_function(self): # Do we get back our own DF class after slicing row-wise? 
cdf_rows = cdf[1:5] self.assertTrue(isinstance(cdf_rows, CustomDataFrame)) - self.assertEqual(cdf_rows.custom_frame_function(), 'OK') + self.assertEqual(cdf_rows.custom_frame_function(), 'OK') # Make sure sliced part of multi-index frame is custom class mcol = pd.MultiIndex.from_tuples([('A', 'A'), ('A', 'B')]) From e1e364eaf6456273e5822d03d0004273daec17a5 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 31 May 2015 22:45:46 +0900 Subject: [PATCH 20/26] BUG: SparseSeries constructor ignores input data name --- doc/source/whatsnew/v0.16.2.txt | 2 +- pandas/sparse/series.py | 3 +++ pandas/sparse/tests/test_sparse.py | 30 ++++++++++++++++++++---------- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index b571aab0b19a5..8e30e90087bcb 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -80,6 +80,6 @@ Bug Fixes - Bug in GroupBy.get_group raises ValueError when group key contains NaT (:issue:`6992`) - +- Bug in ``SparseSeries`` constructor ignores input data name (:issue:`10258`) - Bug where infer_freq infers timerule (WOM-5XXX) unsupported by to_offset (:issue:`9425`) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index f53cc66bee961..24d06970f4741 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -121,6 +121,9 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', if data is None: data = [] + if isinstance(data, Series) and name is None: + name = data.name + is_sparse_array = isinstance(data, SparseArray) if fill_value is None: if is_sparse_array: diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index a7a78ba226a0b..96e5ff87fbb0c 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -128,14 +128,15 @@ def setUp(self): date_index = bdate_range('1/1/2011', periods=len(index)) - self.bseries = SparseSeries(arr, index=index, 
kind='block') - self.bseries.name = 'bseries' + self.bseries = SparseSeries(arr, index=index, kind='block', + name='bseries') self.ts = self.bseries self.btseries = SparseSeries(arr, index=date_index, kind='block') - self.iseries = SparseSeries(arr, index=index, kind='integer') + self.iseries = SparseSeries(arr, index=index, kind='integer', + name='iseries') arr, index = _test_data2() self.bseries2 = SparseSeries(arr, index=index, kind='block') @@ -143,7 +144,7 @@ def setUp(self): arr, index = _test_data1_zero() self.zbseries = SparseSeries(arr, index=index, kind='block', - fill_value=0) + fill_value=0, name='zbseries') self.ziseries = SparseSeries(arr, index=index, kind='integer', fill_value=0) @@ -234,12 +235,21 @@ def test_constructor(self): self.bseries.to_dense().fillna(0).values) # pass SparseSeries - s2 = SparseSeries(self.bseries) - s3 = SparseSeries(self.iseries) - s4 = SparseSeries(self.zbseries) - assert_sp_series_equal(s2, self.bseries) - assert_sp_series_equal(s3, self.iseries) - assert_sp_series_equal(s4, self.zbseries) + def _check_const(sparse, name): + # use passed series name + result = SparseSeries(sparse) + assert_sp_series_equal(result, sparse) + self.assertEqual(sparse.name, name) + self.assertEqual(result.name, name) + + # use passed name + result = SparseSeries(sparse, name='x') + assert_sp_series_equal(result, sparse) + self.assertEqual(result.name, 'x') + + _check_const(self.bseries, 'bseries') + _check_const(self.iseries, 'iseries') + _check_const(self.zbseries, 'zbseries') # Sparse time series works date_index = bdate_range('1/1/2000', periods=len(self.bseries)) From 2e20eb7a0ff6f80a0f22bbf36ddb087c43962d5a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 3 Jun 2015 06:52:22 -0400 Subject: [PATCH 21/26] add numba example to enhancingperf.rst --- doc/source/enhancingperf.rst | 86 +++++++++++++++++++++++++++------ doc/source/whatsnew/v0.16.2.txt | 2 + 2 files changed, 72 insertions(+), 16 deletions(-) diff --git 
a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index d007446a5b922..54fd0a2131861 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -7,7 +7,7 @@ import os import csv - from pandas import DataFrame + from pandas import DataFrame, Series import pandas as pd pd.options.display.max_rows=15 @@ -68,9 +68,10 @@ Here's the function in pure python: We achieve our result by using ``apply`` (row-wise): -.. ipython:: python +.. code-block:: python - %timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1) + In [7]: %timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1) + 10 loops, best of 3: 174 ms per loop But clearly this isn't fast enough for us. Let's take a look and see where the time is spent during this operation (limited to the most time consuming @@ -97,7 +98,7 @@ First we're going to need to import the cython magic function to ipython: .. ipython:: python - %load_ext cythonmagic + %load_ext Cython Now, let's simply copy our functions over to cython as is (the suffix @@ -122,9 +123,10 @@ is here to distinguish between function versions): to be using bleeding edge ipython for paste to play well with cell magics. -.. ipython:: python +.. code-block:: python - %timeit df.apply(lambda x: integrate_f_plain(x['a'], x['b'], x['N']), axis=1) + In [4]: %timeit df.apply(lambda x: integrate_f_plain(x['a'], x['b'], x['N']), axis=1) + 10 loops, best of 3: 85.5 ms per loop Already this has shaved a third off, not too bad for a simple copy and paste. @@ -150,9 +152,10 @@ We get another huge improvement simply by providing type information: ...: return s * dx ...: -.. ipython:: python +.. code-block:: python - %timeit df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1) + In [4]: %timeit df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1) + 10 loops, best of 3: 20.3 ms per loop Now, we're talking! 
It's now over ten times faster than the original python implementation, and we haven't *really* modified the code. Let's have another @@ -229,9 +232,10 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra Loops like this would be *extremely* slow in python, but in Cython looping over numpy arrays is *fast*. -.. ipython:: python +.. code-block:: python - %timeit apply_integrate_f(df['a'].values, df['b'].values, df['N'].values) + In [4]: %timeit apply_integrate_f(df['a'].values, df['b'].values, df['N'].values) + 1000 loops, best of 3: 1.25 ms per loop We've gotten another big improvement. Let's check again where the time is spent: @@ -278,20 +282,70 @@ advanced cython techniques: ...: return res ...: -.. ipython:: python +.. code-block:: python - %timeit apply_integrate_f_wrap(df['a'].values, df['b'].values, df['N'].values) + In [4]: %timeit apply_integrate_f_wrap(df['a'].values, df['b'].values, df['N'].values) + 1000 loops, best of 3: 987 us per loop Even faster, with the caveat that a bug in our cython code (an off-by-one error, for example) might cause a segfault because memory access isn't checked. -Further topics -~~~~~~~~~~~~~~ +.. _enhancingperf.numba: + +Using numba +----------- + +A recent alternative to statically compiling cython code, is to use a *dynamic jit-compiler*, ``numba``. + +Numba gives you the power to speed up your applications with high performance functions written directly in Python. With a few annotations, array-oriented and math-heavy Python code can be just-in-time compiled to native machine instructions, similar in performance to C, C++ and Fortran, without having to switch languages or Python interpreters. + +Numba works by generating optimized machine code using the LLVM compiler infrastructure at import time, runtime, or statically (using the included pycc tool). 
Numba supports compilation of Python to run on either CPU or GPU hardware, and is designed to integrate with the Python scientific software stack. + +.. note:: + + You will need to install ``numba``. This is easy with ``conda``, by using: ``conda install numba``, see :ref:`installing using miniconda`. + +We simply take the plain python code from above and annotate with the ``@jit`` decorator. + +.. code-block:: python + + import numba + + @numba.jit + def f_plain(x): + return x * (x - 1) + + @numba.jit + def integrate_f_numba(a, b, N): + s = 0 + dx = (b - a) / N + for i in range(N): + s += f_plain(a + i * dx) + return s * dx + + @numba.jit + def apply_integrate_f_numba(col_a, col_b, col_N): + n = len(col_N) + result = np.empty(n, dtype='float64') + assert len(col_a) == len(col_b) == n + for i in range(n): + result[i] = integrate_f_numba(col_a[i], col_b[i], col_N[i]) + return result + + def compute_numba(df): + result = apply_integrate_f_numba(df['a'].values, df['b'].values, df['N'].values) + return Series(result, index=df.index, name='result') + +Similar to above, we directly pass ``numpy`` arrays directly to the numba function. Further +we are wrapping the results to provide a nice interface by passing/returning pandas objects. + +.. code-block:: python -- Loading C modules into cython. + In [4]: %timeit compute_numba(df) + 1000 loops, best of 3: 798 us per loop -Read more in the `cython docs `__. +Read more in the `numba docs `__. .. _enhancingperf.eval: diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index b571aab0b19a5..aede460766922 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -9,6 +9,8 @@ We recommend that all users upgrade to this version. Highlights include: +- Documentation on how to use ``numba`` with *pandas*, see :ref:`here ` + Check the :ref:`API Changes ` before updating. .. 
contents:: What's new in v0.16.2 From e9f83ce6676cf944369b20249062bf78b2742964 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 4 Jun 2015 06:55:39 -0400 Subject: [PATCH 22/26] TST: fix for bottleneck >= 1.0 nansum behavior, xref #9422 --- pandas/core/nanops.py | 8 ++++---- pandas/tests/test_series.py | 16 +++++++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 0df160618b7c3..c64c50f791edf 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -16,7 +16,7 @@ ensure_float, _ensure_float64, _ensure_int64, _ensure_object, is_float, is_integer, is_complex, - is_float_dtype, is_floating_dtype, + is_float_dtype, is_complex_dtype, is_integer_dtype, is_bool_dtype, is_object_dtype, is_datetime64_dtype, is_timedelta64_dtype, @@ -373,7 +373,7 @@ def nansem(values, axis=None, skipna=True, ddof=1): var = nanvar(values, axis, skipna, ddof=ddof) mask = isnull(values) - if not is_floating_dtype(values): + if not is_float_dtype(values.dtype): values = values.astype('f8') count, _ = _get_counts_nanvar(mask, axis, ddof) @@ -467,7 +467,7 @@ def nanargmin(values, axis=None, skipna=True): def nanskew(values, axis=None, skipna=True): mask = isnull(values) - if not is_floating_dtype(values): + if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) @@ -502,7 +502,7 @@ def nanskew(values, axis=None, skipna=True): def nankurt(values, axis=None, skipna=True): mask = isnull(values) - if not is_floating_dtype(values): + if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index bbe942e607faf..eb583f17f3ace 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -2316,7 +2316,7 @@ def test_iteritems(self): self.assertFalse(hasattr(self.series.iteritems(), 'reverse')) def test_sum(self): - self._check_stat_op('sum', np.sum) + 
self._check_stat_op('sum', np.sum, check_allna=True) def test_sum_inf(self): import pandas.core.nanops as nanops @@ -2629,7 +2629,7 @@ def test_npdiff(self): r = np.diff(s) assert_series_equal(Series([nan, 0, 0, 0, nan]), r) - def _check_stat_op(self, name, alternate, check_objects=False): + def _check_stat_op(self, name, alternate, check_objects=False, check_allna=False): import pandas.core.nanops as nanops def testit(): @@ -2653,7 +2653,17 @@ def testit(): assert_almost_equal(f(self.series), alternate(nona.values)) allna = self.series * nan - self.assertTrue(np.isnan(f(allna))) + + if check_allna: + # xref 9422 + # bottleneck >= 1.0 give 0.0 for an allna Series sum + try: + self.assertTrue(nanops._USE_BOTTLENECK) + import bottleneck as bn + self.assertTrue(bn.__version__ >= LooseVersion('1.0')) + self.assertEqual(f(allna),0.0) + except: + self.assertTrue(np.isnan(f(allna))) # dtype=object with None, it works! s = Series([1, 2, 3, None, 5]) From 4698ffc4dd52bd2bbe24650c4bc742d23e37d4c3 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 3 Jun 2015 16:00:31 -0400 Subject: [PATCH 23/26] PERF: write basic datetimes faster #10271 --- doc/source/whatsnew/v0.16.2.txt | 1 + pandas/tslib.pyx | 47 +++++++++++++++++++++++++-------- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index f1c5b0c854055..1430fa1a309be 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -47,6 +47,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved ``Series.resample`` performance with dtype=datetime64[ns] (:issue:`7754`) +- Modest improvement in datetime writing speed in to_csv (:issue:`10271`) .. 
_whatsnew_0162.bug_fixes: diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 59eb432844ee3..8fda9bb31061e 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -5,6 +5,7 @@ from numpy cimport (int8_t, int32_t, int64_t, import_array, ndarray, NPY_INT64, NPY_DATETIME, NPY_TIMEDELTA) import numpy as np +from cpython.ref cimport PyObject from cpython cimport ( PyTypeObject, PyFloat_Check, @@ -12,13 +13,14 @@ from cpython cimport ( PyObject_RichCompareBool, PyObject_RichCompare, PyString_Check, - Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE + Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE, ) # Cython < 0.17 doesn't have this in cpython cdef extern from "Python.h": cdef PyTypeObject *Py_TYPE(object) int PySlice_Check(object) + object PyUnicode_FromFormat(const char*, ...) cdef extern from "datetime_helper.h": double total_seconds(object) @@ -1450,20 +1452,43 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, object f elif basic_format: pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) - res = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, - dts.month, - dts.day, - dts.hour, - dts.min, - dts.sec) - if show_ns: ns = dts.ps / 1000 - res += '.%.9d' % (ns + 1000 * dts.us) + res = PyUnicode_FromFormat('%d-%02d-%02d %02d:%02d:%02d.%09d', + dts.year, + dts.month, + dts.day, + dts.hour, + dts.min, + dts.sec, + ns + 1000 * dts.us) elif show_us: - res += '.%.6d' % dts.us + res = PyUnicode_FromFormat('%d-%02d-%02d %02d:%02d:%02d.%06d', + dts.year, + dts.month, + dts.day, + dts.hour, + dts.min, + dts.sec, + dts.us) + elif show_ms: - res += '.%.3d' % (dts.us/1000) + res = PyUnicode_FromFormat('%d-%02d-%02d %02d:%02d:%02d.%03d', + dts.year, + dts.month, + dts.day, + dts.hour, + dts.min, + dts.sec, + dts.us/1000) + else: + res = PyUnicode_FromFormat('%d-%02d-%02d %02d:%02d:%02d', + dts.year, + dts.month, + dts.day, + dts.hour, + dts.min, + dts.sec) result[i] = res From 24633ec81de4b960a9213cc29aecc8e731024c60 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 
4 Jun 2015 10:05:44 -0400 Subject: [PATCH 24/26] DOC: use current ipython in doc build --- ci/build_docs.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 583b36857c70c..ad41373f6dd3f 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -13,9 +13,10 @@ fi if [ x"$DOC_BUILD" != x"" ]; then + # we're running network tests, let's build the docs in the meantim echo "Will build docs" - pip install sphinx==1.1.3 ipython==1.1.0 + conda install sphinx==1.1.3 ipython mv "$TRAVIS_BUILD_DIR"/doc /tmp cd /tmp/doc From 20711816a7b436bd32d7ecc077184262edaba0d5 Mon Sep 17 00:00:00 2001 From: Thomas Grainger Date: Thu, 4 Jun 2015 15:43:45 +0100 Subject: [PATCH 25/26] Fix meantim typo --- ci/build_docs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index ad41373f6dd3f..8670ea61dbec2 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -14,7 +14,7 @@ fi if [ x"$DOC_BUILD" != x"" ]; then - # we're running network tests, let's build the docs in the meantim + # we're running network tests, let's build the docs in the meantime echo "Will build docs" conda install sphinx==1.1.3 ipython From bc7d48f5fcaaad96052c2c36f73dfdced40530bf Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 4 Jun 2015 09:23:34 -0400 Subject: [PATCH 26/26] disable some deps on 3.2 build --- ci/requirements-3.2.txt | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/ci/requirements-3.2.txt b/ci/requirements-3.2.txt index 9ba8fd7ca9393..8c2f675b65603 100644 --- a/ci/requirements-3.2.txt +++ b/ci/requirements-3.2.txt @@ -1,15 +1,4 @@ python-dateutil==2.1 pytz==2013b -xlsxwriter==0.4.6 -xlrd==0.9.2 numpy==1.7.1 cython==0.19.1 -numexpr==2.1 -tables==3.0.0 -matplotlib==1.2.1 -patsy==0.1.0 -lxml==3.2.1 -html5lib -scipy==0.12.0 -beautifulsoup4==4.2.1 -statsmodels==0.5.0