In [1]:
import pandas as pd
import numpy as np


In [4]:
# Build the small example frame used throughout the notebook: three label
# columns (key1, key2, key3) to group on and three columns of random floats
# (data1, data2, data3) to aggregate.
# The random values come from a locally seeded Generator so that
# "Restart Kernel -> Run All" produces the same numbers on every run, without
# touching NumPy's global random state.
rng = np.random.default_rng(12345)
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'key3': [1, 2, 2, 1, 2],
    'data1': rng.standard_normal(5),
    'data2': rng.standard_normal(5),
    'data3': rng.standard_normal(5),
})

df

Unnamed: 0,key1,key2,key3,data1,data2,data3
0,a,one,1,-1.730417,0.116359,-0.553355
1,a,two,2,1.347333,-0.182817,0.057174
2,b,one,2,0.69698,1.406488,1.425337
3,b,two,1,-1.21802,-0.003696,2.255758
4,a,one,2,-0.504637,-0.198924,1.232757


In [5]:
# Split the data1 column into groups, using the key1 labels as the group keys.
# The mean itself is computed in the next cell.
grouped = df['data1'].groupby(by=df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f9932bd0b20>

In [6]:
# It is now a GroupBy object; it has not computed anything yet. To compute the
# group means, we can call the GroupBy's mean method.
grouped.mean()

key1
a   -0.295907
b   -0.260520
Name: data1, dtype: float64

In [9]:
# Compute the group means of two other columns: data2 grouped by key2 and
# data3 grouped by key3.
grouped2 = df['data2'].groupby(by=df['key2'])
grouped3 = df['data3'].groupby(by=df['key3'])

# Print both results separated by a single blank line.
print(grouped2.mean(), '', grouped3.mean(), sep='\n')

key2
one    0.441308
two   -0.093257
Name: data2, dtype: float64

key3
1    0.851202
2    0.905089
Name: data3, dtype: float64


In [10]:
# We can also compute the mean of a column grouped by several keys at once;
# the result is indexed by the observed (key1, key2) pairs.
means=df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one    -1.117527
      two     1.347333
b     one     0.696980
      two    -1.218020
Name: data1, dtype: float64

In [11]:
# Let's compute the means of data1 grouped by all three keys; the result is
# indexed by the observed (key1, key2, key3) combinations.
means=df['data1'].groupby([df['key1'], df['key2'], df['key3']]).mean()
means

key1  key2  key3
a     one   1      -1.730417
            2      -0.504637
      two   2       1.347333
b     one   2       0.696980
      two   1      -1.218020
Name: data1, dtype: float64

In [12]:
# Pivot the innermost index level (key3) into columns; combinations that do
# not occur in the data show up as missing values.
means.unstack()

Unnamed: 0_level_0,key3,1,2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-1.730417,-0.504637
a,two,,1.347333
b,one,,0.69698
b,two,-1.21802,


In [13]:
# Let's work on another example: here the group keys are standalone NumPy
# arrays (one entry per row of df) rather than columns of df.
states=np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years=np.array([2005, 2005, 2006, 2005, 2006])

df['data1'].groupby([states, years]).mean()

California  2005    1.347333
            2006    0.696980
Ohio        2005   -1.474219
            2006   -0.504637
Name: data1, dtype: float64

In [14]:
# Mean of every numeric column within each key1 group.
# key2 holds strings, so numeric_only=True is passed explicitly: older pandas
# dropped such columns silently, while pandas >= 2.0 raises a TypeError when
# asked to average them.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,key3,data1,data2,data3
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,1.666667,-0.295907,-0.088461,0.245525
b,1.5,-0.26052,0.701396,1.840547


In [16]:
# Group by all three key columns at once; the remaining data columns are
# averaged and the result is indexed by (key1, key2, key3).
df.groupby(['key1', 'key2', 'key3']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,data1,data2,data3
key1,key2,key3,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,one,1,-1.730417,0.116359,-0.553355
a,one,2,-0.504637,-0.198924,1.232757
a,two,2,1.347333,-0.182817,0.057174
b,one,2,0.69698,1.406488,1.425337
b,two,1,-1.21802,-0.003696,2.255758


# Iterating Over Groups

In [22]:

# Iterating over a GroupBy object yields 2-tuples: the group name followed by
# the chunk of data belonging to that group.
for key, chunk in df.groupby('key1'):
    print('group:', key)
    print()
    print('group:', chunk)

group: a

group:   key1 key2  key3     data1     data2     data3
0    a  one     1 -1.730417  0.116359 -0.553355
1    a  two     2  1.347333 -0.182817  0.057174
4    a  one     2 -0.504637 -0.198924  1.232757
group: b

group:   key1 key2  key3    data1     data2     data3
2    b  one     2  0.69698  1.406488  1.425337
3    b  two     1 -1.21802 -0.003696  2.255758


In [23]:
# In the case of multiple keys, the first element of each pair is a tuple of
# key values, which can be unpacked directly in the loop target.
for (k1, k2, k3), group in df.groupby(['key1', 'key2', 'key3']):
    print((k1, k2, k3))
    print()
    print(group)

('a', 'one', 1)

  key1 key2  key3     data1     data2     data3
0    a  one     1 -1.730417  0.116359 -0.553355
('a', 'one', 2)

  key1 key2  key3     data1     data2     data3
4    a  one     2 -0.504637 -0.198924  1.232757
('a', 'two', 2)

  key1 key2  key3     data1     data2     data3
1    a  two     2  1.347333 -0.182817  0.057174
('b', 'one', 2)

  key1 key2  key3    data1     data2     data3
2    b  one     2  0.69698  1.406488  1.425337
('b', 'two', 1)

  key1 key2  key3    data1     data2     data3
3    b  two     1 -1.21802 -0.003696  2.255758


In [24]:
# We can also group the columns of df by their dtype.
# DataFrame.groupby(..., axis=1) is deprecated in recent pandas, so instead of
# grouping the frame along its columns we group the column *labels* by dtype
# and then select each bundle of columns from df. Selecting from df (rather
# than transposing it) keeps every column's original dtype.
grouped = df.columns.to_series().groupby(df.dtypes)
for dtype, labels in grouped:
    print(dtype)
    print()
    print(df[labels.tolist()])

int64

   key3
0     1
1     2
2     2
3     1
4     2
float64

      data1     data2     data3
0 -1.730417  0.116359 -0.553355
1  1.347333 -0.182817  0.057174
2  0.696980  1.406488  1.425337
3 -1.218020 -0.003696  2.255758
4 -0.504637 -0.198924  1.232757
object

  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


# Selecting a Column or Subset of Columns

In [27]:
# Especially for large datasets, it may be desirable to aggregate only a few
# columns. Indexing the GroupBy object with a column name restricts it to that
# column, for example:
s_grouped=df.groupby(['key1', 'key2'])['data2']
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f9932bd0be0>

In [28]:
# Mean of data2 for each (key1, key2) group.
s_grouped.mean()

key1  key2
a     one    -0.041282
      two    -0.182817
b     one     1.406488
      two    -0.003696
Name: data2, dtype: float64

# Grouping with Dicts and Series

In [29]:
# Grouping information may exist in a form other than an array. Let's take
# another example DataFrame: five people (rows) by five columns of random floats.
# The values come from a locally seeded Generator so the frame is identical on
# every run and NumPy's global random state is left alone.
people = pd.DataFrame(
    np.random.default_rng(42).standard_normal((5, 5)),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
)
# Let's add some NA values too: row 2 ('Wes'), columns 'b' and 'c'.
people.iloc[2:3, [1, 2]] = np.nan

people

Unnamed: 0,a,b,c,d,e
Joe,-0.621156,-1.056292,-0.384762,0.079134,-1.148095
Steve,-1.228201,-1.072177,-1.214872,0.811374,-1.109557
Wes,1.07778,,,-1.085668,1.068953
Jim,1.636542,0.176893,-0.027716,0.293731,0.456927
Travis,-0.373475,-1.009153,0.451314,-1.084408,-0.558276


In [30]:
# Now, suppose we have a group correspondence for the columns and want to sum
# the columns together by group. The mapping may contain keys that are not
# columns of people ('f' here); they are simply unused.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

# We could construct an array from this dict to pass to groupby, but the dict
# can be passed directly. groupby(..., axis=1) is deprecated in recent pandas,
# so the frame is transposed, grouped along its rows, and the aggregated
# result is transposed back to the original orientation.
by_column = people.T.groupby(mapping)
by_column.sum().T


Unnamed: 0,blue,red
Joe,-0.305628,-2.825542
Steve,-0.403498,-3.409935
Wes,-1.085668,2.146733
Jim,0.266015,2.270362
Travis,-0.633093,-1.940904


# Grouping with Functions

In [31]:
# Let's group by the length of the names this time: passing the function len
# groups the rows by the length of their index label.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,2.093166,-0.879399,-0.412478,-0.712803,0.377785
5,-1.228201,-1.072177,-1.214872,0.811374,-1.109557
6,-0.373475,-1.009153,0.451314,-1.084408,-0.558276


# Grouping by Index Levels

In [34]:
# A final convenience for hierarchically indexed datasets is the ability to
# aggregate using one of the levels of an axis index. Let's look at an example
# whose columns carry a two-level index (cty, tenor).
columns = pd.MultiIndex.from_arrays(
    [['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'],
)
# Seeded local Generator: the same random values on every run, and NumPy's
# global random state is not modified.
hier_df = pd.DataFrame(np.random.default_rng(7).standard_normal((4, 5)), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.595646,-0.69063,-0.429561,-0.209826,0.448144
1,1.974914,-0.97542,1.180194,1.814733,0.427766
2,0.562899,0.431591,-0.416148,-0.815889,-0.871456
3,-2.114641,-0.477158,-0.645794,0.432498,0.702931


In [35]:
# To group by level, just pass the level number or name using the level keyword.
# The cty level lives on the columns; because groupby(..., axis=1) is
# deprecated in recent pandas, the frame is transposed, grouped along its rows,
# and the counts are transposed back.
hier_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


# Data Aggregation

In [36]:
# Quick reminder of what the example frame looks like before aggregating it.
df.head()

Unnamed: 0,key1,key2,key3,data1,data2,data3
0,a,one,1,-1.730417,0.116359,-0.553355
1,a,two,2,1.347333,-0.182817,0.057174
2,b,one,2,0.69698,1.406488,1.425337
3,b,two,1,-1.21802,-0.003696,2.255758
4,a,one,2,-0.504637,-0.198924,1.232757


In [37]:
# Group the whole frame by key1; this rebinds `grouped` to a DataFrameGroupBy
# object that the following cells aggregate.
grouped=df.groupby('key1')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f9932de3ee0>

In [38]:
# 0.9 quantile of data1 within each key1 group.
grouped['data1'].quantile(0.9)

key1
a    0.976939
b    0.505480
Name: data1, dtype: float64

In [39]:
# Custom aggregation function to use with the agg method.
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` only needs to provide ``max()`` and ``min()`` methods whose
    results can be subtracted.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

# Apply the custom aggregation to the numeric columns only. key2 holds
# strings, for which max() - min() is undefined: older pandas silently dropped
# such columns, while pandas >= 2.0 raises a TypeError instead.
numeric_columns = df.select_dtypes(include='number').columns.tolist()
grouped[numeric_columns].agg(peak_to_peak)

Unnamed: 0_level_0,key3,data1,data2,data3
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,1,3.07775,0.315282,1.786112
b,1,1.915,1.410185,0.830421


In [40]:
# You may notice that some methods, like describe, also work, even though they
# are not aggregations, strictly speaking:
grouped.describe()

Unnamed: 0_level_0,key3,key3,key3,key3,key3,key3,key3,key3,data1,data1,...,data2,data2,data3,data3,data3,data3,data3,data3,data3,data3
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
a,3.0,1.666667,0.57735,1.0,1.5,2.0,2.0,2.0,3.0,-0.295907,...,-0.033229,0.116359,3.0,0.245525,0.90783,-0.553355,-0.24809,0.057174,0.644966,1.232757
b,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,2.0,-0.26052,...,1.053942,1.406488,2.0,1.840547,0.587196,1.425337,1.632942,1.840547,2.048153,2.255758


In [None]:
#pg 298