# Group Operations
Notes on data aggregation and group operations in pandas.

In [4]:
from pandas import DataFrame, Series
import numpy as np
import pandas as pd

# Toy frame used throughout these notes: two label columns to group on and
# two numeric columns to aggregate. The numeric values are unseeded random
# draws, so they differ on every run.
df = pd.DataFrame(
    {
        "key1": ["a", "a", "b", "b", "a"],
        "key2": ["one", "two", "one", "two", "one"],
        "data1": np.random.randn(5),
        "data2": np.random.randn(5),
    }
)
df.head()

Unnamed: 0,data1,data2,key1,key2
0,0.054774,0.016986,a,one
1,-1.231995,-0.573065,a,two
2,1.093271,0.168547,b,one
3,-1.221024,0.379398,b,two
4,1.160288,-0.757718,a,one


In [6]:
# Group the data1 values by the labels in key1 (the key is passed as a Series).
# The result is a SeriesGroupBy object, not yet an aggregate.
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x111b1aa50>

In [7]:
# Mean of data1 within each key1 group; the result is indexed by key1.
grouped.mean()

key1
a   -0.005645
b   -0.063877
Name: data1, dtype: float64

In [9]:
# Passing a list of key Series groups on both of them; the result carries a
# two-level (key1, key2) index.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one     0.607531
      two    -1.231995
b     one     1.093271
      two    -1.221024
Name: data1, dtype: float64

In [10]:
# Pivot the inner index level (key2) into columns.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.607531,-1.231995
b,1.093271,-1.221024


In [11]:
# Group by column name instead of passing the key Series. The object-typed
# key2 column does not appear in the result.
df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.005645,-0.437932
b,-0.063877,0.273973


In [13]:
# Group by two column names; the result has a (key1, key2) row index.
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.607531,-0.370366
a,two,-1.231995,-0.573065
b,one,1.093271,0.168547
b,two,-1.221024,0.379398


In [14]:
# Number of rows in each (key1, key2) group.
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [19]:
# Iterating over groups: a GroupBy yields (group label, sub-DataFrame) pairs.
# print() is called with a single argument each time so the cell produces the
# same output on both Python 2 and Python 3.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
    print("\n")

a
      data1     data2 key1 key2
0  0.054774  0.016986    a  one
1 -1.231995 -0.573065    a  two
4  1.160288 -0.757718    a  one


b
      data1     data2 key1 key2
2  1.093271  0.168547    b  one
3 -1.221024  0.379398    b  two




In [20]:
# With several keys, each group label is a tuple, unpacked here in the for
# statement. The label is formatted into one string so the printed line
# ("a one") is identical on Python 2 and Python 3.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print('%s %s' % (k1, k2))
    print(group)

a one
      data1     data2 key1 key2
0  0.054774  0.016986    a  one
4  1.160288 -0.757718    a  one
a two
      data1     data2 key1 key2
1 -1.231995 -0.573065    a  two
b one
      data1     data2 key1 key2
2  1.093271  0.168547    b  one
b two
      data1     data2 key1 key2
3 -1.221024  0.379398    b  two


In [21]:
# Compute a dict mapping each key1 label to its piece of df, as a one-liner.
pieces = {label: piece for label, piece in df.groupby('key1')}

In [22]:
# Each value is the sub-DataFrame holding the rows for that key1 label.
pieces

{'a':       data1     data2 key1 key2
 0  0.054774  0.016986    a  one
 1 -1.231995 -0.573065    a  two
 4  1.160288 -0.757718    a  one, 'b':       data1     data2 key1 key2
 2  1.093271  0.168547    b  one
 3 -1.221024  0.379398    b  two}

In [23]:
# Look up the rows of a single group by its label.
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,1.093271,0.168547,b,one
3,-1.221024,0.379398,b,two


In [24]:
# Column dtypes: data1/data2 are float64, key1/key2 are object.
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [29]:
# Selecting a Column or Subset of Columns
# Indexing the GroupBy with a column name restricts it to that column.
# head() keeps up to the first 5 rows of each group; both groups here are
# smaller than that, so every row comes back in its original order.
df.groupby('key1')['data1'].head()

0    0.054774
1   -1.231995
2    1.093271
3   -1.221024
4    1.160288
Name: data1, dtype: float64

In [33]:
# Get just the means for data2 and store the result as a DataFrame:
# indexing with a list ([['data2']]) instead of a bare name keeps the
# result two-dimensional.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.370366
a,two,-0.573065
b,one,0.168547
b,two,0.379398


Grouping with dicts and Series

In [41]:
# 5x5 frame of unseeded random draws, with names as row labels and single
# letters as column labels.
people = DataFrame(
    data=np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)

In [42]:
# Add a few NA values: blank out columns b and c in the third row ('Wes').
# .ix is deprecated and removed in pandas 1.0. The row is picked by position
# through people.index, and .loc then does a purely label-based assignment.
people.loc[people.index[2:3], ['b', 'c']] = np.nan

In [43]:
# Row 'Wes' now has missing values in columns b and c.
people

Unnamed: 0,a,b,c,d,e
Joe,0.053939,0.276785,0.682672,0.62473,-0.863377
Steve,0.562764,-1.3582,-1.165302,-1.098875,0.731043
Wes,-1.432546,,,0.824329,0.109087
Jim,-0.035302,1.028485,0.984357,0.567416,0.964549
Travis,-1.089031,-1.851419,1.207033,2.183648,-0.368128


In [46]:
# Map each column label to a color group so that columns can be summed
# together per group. 'f' matches no column of people; the grouped output
# accordingly has no 'orange' group.
mapping = dict(a='red', b='red', c='blue', d='blue', e='red', f='orange')
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [52]:
# Can easily construct an array from this dict to pass to groupby,
# but instead can just pass the dict.
# axis=1 groups the columns (by their mapped color) rather than the rows.
# NOTE(review): newer pandas releases deprecate the axis argument of
# groupby; confirm the target pandas version before re-running.
by_column = people.groupby(mapping, axis=1)

In [53]:
# Per-row totals of the columns in each color group. Missing values are
# skipped (see row 'Wes').
by_column.sum()

Unnamed: 0,blue,red
Joe,1.307402,-0.532653
Steve,-2.264177,-0.064393
Wes,0.824329,-1.323459
Jim,1.551772,1.957732
Travis,3.390681,-3.308578


In [50]:
# The same mapping as a Series: index = column label, value = group name.
map_series = Series(mapping)

In [51]:
# Count the non-missing values per color group for every row.
# The frame is transposed, grouped on its rows, and the result transposed
# back. This yields the same table as groupby(map_series, axis=1) without
# using the axis argument, which newer pandas releases deprecate.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [54]:
 people.groupby(len).sum()  # a function key is called on each index label; here rows are grouped by name length

Unnamed: 0,a,b,c,d,e
3,-1.413909,1.30527,1.667028,2.016475,0.210259
5,0.562764,-1.3582,-1.165302,-1.098875,0.731043
6,-1.089031,-1.851419,1.207033,2.183648,-0.368128


In [55]:
# An explicit list of group labels, one per row of people.
key_list = ['one', 'one', 'one', 'two', 'two']

In [56]:
# Keys of different kinds can be mixed: name length (via len) combined with
# the labels in key_list.
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-1.432546,0.276785,0.682672,0.62473,-0.863377
3,two,-0.035302,1.028485,0.984357,0.567416,0.964549
5,one,0.562764,-1.3582,-1.165302,-1.098875,0.731043
6,two,-1.089031,-1.851419,1.207033,2.183648,-0.368128


In [57]:
# Grouping by Index Levels
# Two-level column index: 'cty' on the outer level, 'tenor' on the inner one.
columns = pd.MultiIndex.from_arrays(
    [['US'] * 3 + ['JP'] * 2, [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'],
)
columns

MultiIndex(levels=[[u'JP', u'US'], [1, 3, 5]],
           labels=[[1, 1, 1, 0, 0], [0, 1, 2, 0, 1]],
           names=[u'cty', u'tenor'])

In [58]:
# Four rows of random draws under the hierarchical `columns` index.
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)

In [59]:
# The columns carry both levels: cty on top, tenor underneath.
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,2.124351,0.846436,1.61045,-0.184372,1.247362
1,0.572495,-0.120809,-0.930932,0.757421,1.278665
2,0.141438,-0.539621,0.85333,-0.397014,1.396874
3,-0.913211,0.154684,-0.803133,1.228583,-0.51037


In [60]:
# Count, for every row, the non-missing values under each 'cty' value.
# The frame is transposed so the column levels become row levels, grouped
# by level name, and the result transposed back. This matches
# groupby(level='cty', axis=1) without using the axis argument, which newer
# pandas releases deprecate.
hier_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [61]:
# Data Aggregation
# An aggregation reduces an array or Series to a single value.
# pandas does this efficiently.
df

Unnamed: 0,data1,data2,key1,key2
0,0.054774,0.016986,a,one
1,-1.231995,-0.573065,a,two
2,1.093271,0.168547,b,one
3,-1.221024,0.379398,b,two
4,1.160288,-0.757718,a,one


In [62]:
# NOTE: this rebinds `grouped`, which earlier held the data1-only
# SeriesGroupBy; from here on it is the whole frame grouped by key1.
grouped = df.groupby('key1')

In [63]:
# 0.9 quantile of data1 within each key1 group.
grouped['data1'].quantile(0.9)

key1
a    0.939185
b    0.861841
Name: data1, dtype: float64

In [66]:
# Create your own aggregation functions, as long as they reduce an array to
# a single value.
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [67]:
# agg() accepts a user-defined function; the result has one value per
# (group, column) pair.
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.392283,0.774704
b,2.314294,0.210851


In [68]:
# Summary statistics (count, mean, std, min, quartiles, max) for each group.
grouped.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,count,3.0,3.0
a,mean,-0.005645,-0.437932
a,std,1.197285,0.404645
a,min,-1.231995,-0.757718
a,25%,-0.588611,-0.665391
a,50%,0.054774,-0.573065
a,75%,0.607531,-0.278039
a,max,1.160288,0.016986
b,count,2.0,2.0
b,mean,-0.063877,0.273973


In [71]:
# Redisplay the original frame for reference.
df

Unnamed: 0,data1,data2,key1,key2
0,0.054774,0.016986,a,one
1,-1.231995,-0.573065,a,two
2,1.093271,0.168547,b,one
3,-1.221024,0.379398,b,two
4,1.160288,-0.757718,a,one


In [69]:
# Per-key1 means, with the column names prefixed: mean_data1, mean_data2.
k1_means = df.groupby('key1').mean().add_prefix('mean_')

In [70]:
# One row per key1 label.
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.005645,-0.437932
b,-0.063877,0.273973


In [72]:
# Attach the group means to every row: df's key1 column is matched against
# the index of k1_means.
pd.merge(df, k1_means, left_on='key1', right_index=True)

Unnamed: 0,data1,data2,key1,key2,mean_data1,mean_data2
0,0.054774,0.016986,a,one,-0.005645,-0.437932
1,-1.231995,-0.573065,a,two,-0.005645,-0.437932
4,1.160288,-0.757718,a,one,-0.005645,-0.437932
2,1.093271,0.168547,b,one,-0.063877,0.273973
3,-1.221024,0.379398,b,two,-0.063877,0.273973


In [73]:
# One group label per row of people.
key = ['one', 'two', 'one', 'two', 'one']

In [74]:
# Column means of people within the 'one' and 'two' groups.
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,-0.822546,-0.787317,0.944852,1.210902,-0.374139
two,0.263731,-0.164857,-0.090472,-0.26573,0.847796


In [75]:
def demean(arr):
    """Return ``arr`` with its own mean subtracted from every element."""
    center = arr.mean()
    return arr - center

In [76]:
# transform applies a function to each group
# and places the results in the appropriate locations,
# so the output lines up row-for-row with the input.
demeaned = people.groupby(key).transform(demean)

In [77]:
# Same row and column labels as people; the values are deviations from the
# group mean.
demeaned

Unnamed: 0,a,b,c,d,e
Joe,0.876485,1.064102,-0.262181,-0.586172,-0.489238
Steve,0.299033,-1.193343,-1.074829,-0.833145,-0.116753
Wes,-0.61,,,-0.386573,0.483227
Jim,-0.299033,1.193343,1.074829,0.833145,0.116753
Travis,-0.266485,-1.064102,0.262181,0.972745,0.006011


In [78]:
# Sanity check: the group means of the demeaned data are zero up to
# floating-point error.
demeaned.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,1.110223e-16,0.0,0.0,-7.401487e-17,1.850372e-17
two,-2.775558e-17,0.0,0.0,0.0,0.0


In [None]:
# pivot tables
# NOTE(review): `tips` is not defined anywhere in the visible notebook; it
# has to be loaded before this cell can run. Confirm the data source.
# The grouping keys go in index=; rows= is the old keyword that pandas no
# longer accepts.
tips.pivot_table(index=['sex', 'smoker'])

In [None]:
# or: aggregate only tip_pct and size, grouped by sex and day on the rows
# and spread across the columns by smoker. index=/columns= replace the old
# rows=/cols= keywords, which pandas no longer accepts.
tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],
        columns='smoker')