Commit 7463576

PERF: improve performance of groupby rank (pandas-dev#21237) (pandas-…

peterpanmj authored and alimcmaster1 committed Jun 28, 2018
1 parent 7d664d5 commit 7463576
Showing 3 changed files with 71 additions and 54 deletions.
21 changes: 20 additions & 1 deletion asv_bench/benchmarks/groupby.py
@@ -5,7 +5,7 @@

import numpy as np
from pandas import (DataFrame, Series, MultiIndex, date_range, period_range,
-                    TimeGrouper, Categorical)
+                    TimeGrouper, Categorical, Timestamp)
import pandas.util.testing as tm

from .pandas_vb_common import setup # noqa
@@ -385,6 +385,25 @@ def time_dtype_as_field(self, dtype, method, application):
        self.as_field_method()


+class RankWithTies(object):
+    # GH 21237
+    goal_time = 0.2
+    param_names = ['dtype', 'tie_method']
+    params = [['float64', 'float32', 'int64', 'datetime64'],
+              ['first', 'average', 'dense', 'min', 'max']]
+
+    def setup(self, dtype, tie_method):
+        N = 10**4
+        if dtype == 'datetime64':
+            data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
+        else:
+            data = np.array([1] * N, dtype=dtype)
+        self.df = DataFrame({'values': data, 'key': ['foo'] * N})
+
+    def time_rank_ties(self, dtype, tie_method):
+        self.df.groupby('key').rank(method=tie_method)
+
+
class Float32(object):
    # GH 13335
    goal_time = 0.2
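For context on what this benchmark exercises, here is a small illustration (not part of the commit) of how the `method` argument changes tie handling in a grouped rank; the frame and values are made up for the example:

    import pandas as pd

    df = pd.DataFrame({'key': ['foo'] * 5, 'values': [2, 2, 1, 3, 2]})
    grouped = df.groupby('key')['values']

    # All five rows form one group; the three 2s are tied.
    print(grouped.rank(method='average').tolist())  # [3.0, 3.0, 1.0, 5.0, 3.0]
    print(grouped.rank(method='min').tolist())      # [2.0, 2.0, 1.0, 5.0, 2.0]
    print(grouped.rank(method='max').tolist())      # [4.0, 4.0, 1.0, 5.0, 4.0]
    print(grouped.rank(method='first').tolist())    # [2.0, 3.0, 1.0, 5.0, 4.0]
    print(grouped.rank(method='dense').tolist())    # [2.0, 2.0, 1.0, 3.0, 2.0]

Assuming the standard pandas asv setup, the new benchmark can then be run in isolation with something like `asv continuous -f 1.1 upstream/master HEAD -b groupby.RankWithTies`.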
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
@@ -65,6 +65,7 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
+- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
-

.. _whatsnew_0240.docs:
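As a rough, hypothetical way to observe the improvement this note refers to (timings are machine- and version-dependent; the size and data are illustrative):

    import timeit
    import numpy as np
    import pandas as pd

    # Many tied values within one group: the case GH 21237 targets.
    df = pd.DataFrame({'values': np.ones(10**4), 'key': ['foo'] * 10**4})
    print(timeit.timeit(lambda: df.groupby('key').rank(method='average'),
                        number=10))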
103 changes: 50 additions & 53 deletions pandas/_libs/groupby_helper.pxi.in
@@ -429,7 +429,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
    is_datetimelike : bool, default False
        unused in this method but provided for call compatibility with other
        Cython transformations
-    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
+    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
+        'average'
        * average: average rank of group
        * min: lowest rank in group
        * max: highest rank in group
@@ -514,26 +515,22 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
            dups += 1
            sum_ranks += i - grp_start + 1

-            # if keep_na, check for missing values and assign back
-            # to the result where appropriate
-
-            if keep_na and mask[_as[i]]:
-                grp_na_count += 1
-                out[_as[i], 0] = nan
-            else:
-                # this implementation is inefficient because it will
-                # continue overwriting previously encountered dups
-                # i.e. if 5 duplicated values are encountered it will
-                # write to the result as follows (assumes avg tiebreaker):
-                # 1
-                # .5 .5
-                # .33 .33 .33
-                # .25 .25 .25 .25
-                # .2 .2 .2 .2 .2
-                #
-                # could potentially be optimized to only write to the
-                # result once the last duplicate value is encountered
-                if tiebreak == TIEBREAK_AVERAGE:
+            # Update out only when there is a transition of values or labels.
+            # When a new value or group is encountered, go back dups steps
+            # (the number of occurrences of the current value) and assign the
+            # ranks based on the starting index of the current group
+            # (grp_start) and the current index.
+            if (i == N - 1 or
+                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                    (mask[_as[i]] ^ mask[_as[i+1]]) or
+                    (labels[_as[i]] != labels[_as[i+1]])):
+                # if keep_na, check for missing values and assign back
+                # to the result where appropriate
+                if keep_na and mask[_as[i]]:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = nan
+                    grp_na_count = dups
+                elif tiebreak == TIEBREAK_AVERAGE:
                    for j in range(i - dups + 1, i + 1):
                        out[_as[j], 0] = sum_ranks / <float64_t>dups
                elif tiebreak == TIEBREAK_MIN:
@@ -552,38 +549,38 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
                    for j in range(i - dups + 1, i + 1):
                        out[_as[j], 0] = grp_vals_seen

-            # look forward to the next value (using the sorting in _as)
-            # if the value does not equal the current value then we need to
-            # reset the dups and sum_ranks, knowing that a new value is coming
-            # up. the conditional also needs to handle nan equality and the
-            # end of iteration
-            if (i == N - 1 or
-                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
-                    (mask[_as[i]] ^ mask[_as[i+1]])):
-                dups = sum_ranks = 0
-                val_start = i
-                grp_vals_seen += 1
-                grp_tie_count +=1
-
-            # Similar to the previous conditional, check now if we are moving
-            # to a new group. If so, keep track of the index where the new
-            # group occurs, so the tiebreaker calculations can decrement that
-            # from their position. fill in the size of each group encountered
-            # (used by pct calculations later). also be sure to reset any of
-            # the items helping to calculate dups
-            if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
-                if tiebreak != TIEBREAK_DENSE:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
-                else:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[_as[j], 0] = (grp_tie_count -
-                                                (grp_na_count > 0))
-                dups = sum_ranks = 0
-                grp_na_count = 0
-                grp_tie_count = 0
-                grp_start = i + 1
-                grp_vals_seen = 1
+                # look forward to the next value (using the sorting in _as)
+                # if the value does not equal the current value then we need to
+                # reset the dups and sum_ranks, knowing that a new value is
+                # coming up. the conditional also needs to handle nan equality
+                # and the end of iteration
+                if (i == N - 1 or
+                        (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                        (mask[_as[i]] ^ mask[_as[i+1]])):
+                    dups = sum_ranks = 0
+                    grp_vals_seen += 1
+                    grp_tie_count += 1
+
+                # Similar to the previous conditional, check now if we are
+                # moving to a new group. If so, keep track of the index where
+                # the new group occurs, so the tiebreaker calculations can
+                # decrement that from their position. fill in the size of each
+                # group encountered (used by pct calculations later). also be
+                # sure to reset any of the items helping to calculate dups
+                if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
+                    if tiebreak != TIEBREAK_DENSE:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (i - grp_start + 1 -
+                                                    grp_na_count)
+                    else:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (grp_tie_count -
+                                                    (grp_na_count > 0))
+                    dups = sum_ranks = 0
+                    grp_na_count = 0
+                    grp_tie_count = 0
+                    grp_start = i + 1
+                    grp_vals_seen = 1

        if pct:
            for i in range(N):
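To make the change above concrete, here is a pure-Python sketch (an illustration, not code from the commit) of the batched tie assignment for the 'average' method: ranks are written back once per run of duplicates, when a value or group transition is detected, rather than rewritten for every duplicate as in the removed code. NaN handling, the other tiebreakers, and the pct bookkeeping are omitted.

    import numpy as np

    def group_rank_average(values, labels):
        # Sort by (label, value) so group members and ties are contiguous,
        # mirroring the _as ordering in the Cython code.
        N = len(values)
        out = np.empty(N, dtype=np.float64)
        _as = np.lexsort((values, labels))
        dups = sum_ranks = 0
        grp_start = 0
        for i in range(N):
            dups += 1
            sum_ranks += i - grp_start + 1
            # Write ranks back only at a transition of value or label:
            # one pass over each run of duplicates instead of one per dup.
            if (i == N - 1 or
                    values[_as[i]] != values[_as[i + 1]] or
                    labels[_as[i]] != labels[_as[i + 1]]):
                for j in range(i - dups + 1, i + 1):
                    out[_as[j]] = sum_ranks / dups
                dups = sum_ranks = 0
            # New group: rank numbering restarts at the next index.
            if i == N - 1 or labels[_as[i]] != labels[_as[i + 1]]:
                grp_start = i + 1
        return out

    # Example: one group in which the three 2s are tied.
    vals = np.array([2.0, 2.0, 1.0, 3.0, 2.0])
    labs = np.array([0, 0, 0, 0, 0])
    print(group_rank_average(vals, labs))  # [3. 3. 1. 5. 3.]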
