# 12.2 Advanced GroupBy Use

In [1]:
import numpy as np
import pandas as pd

---

## Group Transforms and "Unwrapped" GroupBys

There is another built-in method called transform, which is similar
to apply but imposes more constraints on the kind of function you can use:
- It can produce a scalar value to be broadcast to the shape of the group
- It can produce an object of the same shape as the input group
- It must not mutate its input

In [2]:
# Twelve rows: the keys cycle a, b, c and the values are the floats 0.0 .. 11.0,
# so each key owns four evenly spaced values.
df = pd.DataFrame({
    'key': ['a', 'b', 'c'] * 4,
    'value': np.arange(12.),
})

In [3]:
df

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0


In [9]:
# Group the 'value' column by 'key'; bracket selection avoids any clash between
# a column name and a GroupBy attribute.
g = df.groupby('key')['value']

In [11]:
# Plain aggregation: one mean per key, indexed by the group labels.
g.mean()

Unnamed: 0_level_0,value
key,Unnamed: 1_level_1
a,4.5
b,5.5
c,6.5


In [12]:
# The lambda returns one scalar per group; transform broadcasts that scalar
# back onto every row of the group, so the result is indexed like df rather
# than by key.
g.transform(lambda x: x.mean())

Unnamed: 0,value
0,4.5
1,5.5
2,6.5
3,4.5
4,5.5
5,6.5
6,4.5
7,5.5
8,6.5
9,4.5


In [13]:
# Built-in aggregations can be passed by name; the output matches the
# lambda-based version.
g.transform('mean')

Unnamed: 0,value
0,4.5
1,5.5
2,6.5
3,4.5
4,5.5
5,6.5
6,4.5
7,5.5
8,6.5
9,4.5


In [15]:
# Another scalar-per-group function: the range (max - min) of each group,
# broadcast to every row of that group.
g.transform(lambda x: x.max() - x.min())

Unnamed: 0,value
0,9.0
1,9.0
2,9.0
3,9.0
4,9.0
5,9.0
6,9.0
7,9.0
8,9.0
9,9.0


In [17]:
# Here the function returns an object of the same shape as the group, so the
# values are placed row for row (no broadcasting): each value is doubled.
g.transform(lambda x: x * 2)

Unnamed: 0,value
0,0.0
1,2.0
2,4.0
3,6.0
4,8.0
5,10.0
6,12.0
7,14.0
8,16.0
9,18.0


In [18]:
# Rank within each group, largest value first (rank 1.0 is the group maximum).
g.transform(lambda x: x.rank(ascending=False))

Unnamed: 0,value
0,4.0
1,4.0
2,4.0
3,3.0
4,3.0
5,3.0
6,2.0
7,2.0
8,2.0
9,1.0


In [19]:
def normalize(x):
    """Standardize x: subtract its mean, then divide by its standard deviation.

    Uses whatever ``mean()`` and ``std()`` the argument provides, so it works
    on a whole Series or on one group at a time via ``transform``/``apply``.
    """
    centered = x - x.mean()
    return centered / x.std()

In [20]:
# Standardize the values within each key group (same-shape result per group).
g.transform(normalize)

Unnamed: 0,value
0,-1.161895
1,-1.161895
2,-1.161895
3,-0.387298
4,-0.387298
5,-0.387298
6,0.387298
7,0.387298
8,0.387298
9,1.161895


In [21]:
# apply accepts the same function; in the output shown it gives the same
# values as the transform call.
g.apply(normalize)

Unnamed: 0,value
0,-1.161895
1,-1.161895
2,-1.161895
3,-0.387298
4,-0.387298
5,-0.387298
6,0.387298
7,0.387298
8,0.387298
9,1.161895


Built-in aggregate functions like 'mean' or 'sum' are often much faster than a general
apply function. These also have a “fast path” when used with transform. This allows
us to perform a so-called unwrapped group operation:

In [22]:
# Name-based transform: uses the built-in aggregation rather than calling a
# Python function once per group.
g.transform('mean')

Unnamed: 0,value
0,4.5
1,5.5
2,6.5
3,4.5
4,5.5
5,6.5
6,4.5
7,5.5
8,6.5
9,4.5


In [26]:
# "Unwrapped" normalization: two built-in transforms followed by ordinary
# vectorized arithmetic, instead of one Python-level function call per group.
group_mean = g.transform('mean')
group_std = g.transform('std')
normalized = (df['value'] - group_mean) / group_std

In [25]:
# NOTE(review): the saved output below (columns 0-11 plus 'value', every cell
# empty) does not look like the result of the expression that defines
# `normalized`, and this cell's execution count (25) precedes that of the
# defining cell (26) -- presumably a stale output from an earlier run.
# Re-run both cells in order to refresh it.
normalized

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,value
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
5,,,,,,,,,,,,,
6,,,,,,,,,,,,,
7,,,,,,,,,,,,,
8,,,,,,,,,,,,,
9,,,,,,,,,,,,,


---

## Grouped Time Resampling

In [27]:
N = 15  # number of one-minute observations

# N consecutive timestamps, one minute apart, starting at midnight.
times = pd.date_range(start='2017-05-20 00:00', periods=N, freq='1min')

# NOTE: this rebinds `df`, replacing the key/value frame used in the previous section.
df = pd.DataFrame({
    'time': times,
    'value': np.arange(N),
})

In [28]:
df

Unnamed: 0,time,value
0,2017-05-20 00:00:00,0
1,2017-05-20 00:01:00,1
2,2017-05-20 00:02:00,2
3,2017-05-20 00:03:00,3
4,2017-05-20 00:04:00,4
5,2017-05-20 00:05:00,5
6,2017-05-20 00:06:00,6
7,2017-05-20 00:07:00,7
8,2017-05-20 00:08:00,8
9,2017-05-20 00:09:00,9


In [30]:
# Move 'time' from a column into the index so the frame can be resampled.
# NOTE: this rebinds `df`, so the cell is not idempotent -- after it has run
# once, 'time' is no longer a column and running it again fails.
df = df.set_index('time')

In [31]:
df

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2017-05-20 00:00:00,0
2017-05-20 00:01:00,1
2017-05-20 00:02:00,2
2017-05-20 00:03:00,3
2017-05-20 00:04:00,4
2017-05-20 00:05:00,5
2017-05-20 00:06:00,6
2017-05-20 00:07:00,7
2017-05-20 00:08:00,8
2017-05-20 00:09:00,9


In [32]:
# Bucket the one-minute index into 5-minute bins and count the rows in each bin.
df.resample('5min').count()

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2017-05-20 00:00:00,5
2017-05-20 00:05:00,5
2017-05-20 00:10:00,5


In [33]:
# Three rows per timestamp, one for each key: every timestamp is repeated
# three times in a row while the keys cycle a, b, c.
df2 = pd.DataFrame({
    'time': times.repeat(3),
    'key': np.tile(['a', 'b', 'c'], N),
    'value': np.arange(N * 3.),
})

In [35]:
# Illustration of repeat: each timestamp is duplicated consecutively (twice
# here), rather than the whole sequence being cycled.
times.repeat(2)

DatetimeIndex(['2017-05-20 00:00:00', '2017-05-20 00:00:00',
               '2017-05-20 00:01:00', '2017-05-20 00:01:00',
               '2017-05-20 00:02:00', '2017-05-20 00:02:00',
               '2017-05-20 00:03:00', '2017-05-20 00:03:00',
               '2017-05-20 00:04:00', '2017-05-20 00:04:00',
               '2017-05-20 00:05:00', '2017-05-20 00:05:00',
               '2017-05-20 00:06:00', '2017-05-20 00:06:00',
               '2017-05-20 00:07:00', '2017-05-20 00:07:00',
               '2017-05-20 00:08:00', '2017-05-20 00:08:00',
               '2017-05-20 00:09:00', '2017-05-20 00:09:00',
               '2017-05-20 00:10:00', '2017-05-20 00:10:00',
               '2017-05-20 00:11:00', '2017-05-20 00:11:00',
               '2017-05-20 00:12:00', '2017-05-20 00:12:00',
               '2017-05-20 00:13:00', '2017-05-20 00:13:00',
               '2017-05-20 00:14:00', '2017-05-20 00:14:00'],
              dtype='datetime64[ns]', freq=None)

In [37]:
# Preview the first seven rows.
df2.head(7)

Unnamed: 0,time,key,value
0,2017-05-20 00:00:00,a,0.0
1,2017-05-20 00:00:00,b,1.0
2,2017-05-20 00:00:00,c,2.0
3,2017-05-20 00:01:00,a,3.0
4,2017-05-20 00:01:00,b,4.0
5,2017-05-20 00:01:00,c,5.0
6,2017-05-20 00:02:00,a,6.0


In [41]:
# Group rows into 5-minute bins on the datetime index.
# pd.TimeGrouper was deprecated and later removed from pandas;
# pd.Grouper(freq=...) is the supported equivalent.
time_key = pd.Grouper(freq='5min')

  """Entry point for launching an IPython kernel.


In [47]:
# Same 5-minute binning, addressed explicitly to index level 0.
# Uses the 'min' alias (as in the resample call earlier) because the
# single-letter 'T' alias is deprecated in recent pandas releases.
gg = pd.Grouper(level=0, freq='5min')

In [42]:
# Sum 'value' per (key, 5-minute bin): the time index is binned by time_key
# while 'key' is grouped as an ordinary column.
resampled = (
    df2.set_index('time')
       .groupby(['key', time_key])
       .sum()
)

In [48]:
# Same grouped sum, using the level-based Grouper `gg` for the time bins; the
# output shown matches `resampled`.
df2.set_index('time').groupby(['key', gg]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,value
key,time,Unnamed: 2_level_1
a,2017-05-20 00:00:00,30.0
a,2017-05-20 00:05:00,105.0
a,2017-05-20 00:10:00,180.0
b,2017-05-20 00:00:00,35.0
b,2017-05-20 00:05:00,110.0
b,2017-05-20 00:10:00,185.0
c,2017-05-20 00:00:00,40.0
c,2017-05-20 00:05:00,115.0
c,2017-05-20 00:10:00,190.0


In [49]:
resampled

Unnamed: 0_level_0,Unnamed: 1_level_0,value
key,time,Unnamed: 2_level_1
a,2017-05-20 00:00:00,30.0
a,2017-05-20 00:05:00,105.0
a,2017-05-20 00:10:00,180.0
b,2017-05-20 00:00:00,35.0
b,2017-05-20 00:05:00,110.0
b,2017-05-20 00:10:00,185.0
c,2017-05-20 00:00:00,40.0
c,2017-05-20 00:05:00,115.0
c,2017-05-20 00:10:00,190.0


---