In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
dframe = DataFrame({'k1':['x','x','y','y','z'],
                    'k2':['alpha', 'beta','alpha','beta','alpha'],
                    'dataset1':np.random.randn(5),
                    'dataset2':np.random.randn(5)})
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-1.216079,-0.801128,x,alpha
1,1.768261,0.243129,x,beta
2,0.514338,-0.611582,y,alpha
3,0.161987,0.482563,y,beta
4,0.988389,0.320102,z,alpha


In [4]:
# group by can be used to create a Series group by object
# grab dataset1 column and group by k1 key
group1 = dframe['dataset1'].groupby(dframe['k1'])
group1

<pandas.core.groupby.SeriesGroupBy object at 0x7f1fc4502f50>

In [5]:
group1.mean()

k1
x    0.276091
y    0.338163
z    0.988389
Name: dataset1, dtype: float64

In [6]:
cities = np.array(['LA','NY','LA','NY','NY'])
months = np.array(['JAN','FEB','JAN','FEB','JAN'])

In [10]:
# line up cities and months and average out dataset1 based on the two arrays
dframe['dataset1'].groupby([cities,months]).mean()

LA  JAN   -0.350870
NY  FEB    0.965124
    JAN    0.988389
Name: dataset1, dtype: float64

In [11]:
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-1.216079,-0.801128,x,alpha
1,1.768261,0.243129,x,beta
2,0.514338,-0.611582,y,alpha
3,0.161987,0.482563,y,beta
4,0.988389,0.320102,z,alpha


In [12]:
# passing col names as group keys
dframe.groupby('k1').mean()

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
x,0.276091,-0.278999
y,0.338163,-0.064509
z,0.988389,0.320102


In [14]:
# mean on both keys
dframe.groupby(['k1','k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
x,alpha,-1.216079,-0.801128
x,beta,1.768261,0.243129
y,alpha,0.514338,-0.611582
y,beta,0.161987,0.482563
z,alpha,0.988389,0.320102


In [15]:
dframe.groupby('k1').size()

k1
x    2
y    2
z    1
dtype: int64

In [16]:
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-1.216079,-0.801128,x,alpha
1,1.768261,0.243129,x,beta
2,0.514338,-0.611582,y,alpha
3,0.161987,0.482563,y,beta
4,0.988389,0.320102,z,alpha


In [17]:
# iterating over groups
for name,group in dframe.groupby('k1'):
    print "This is the %s group" %name
    print group
    print '\n'

This is the x group
   dataset1  dataset2 k1     k2
0 -1.216079 -0.801128  x  alpha
1  1.768261  0.243129  x   beta


This is the y group
   dataset1  dataset2 k1     k2
2  0.514338 -0.611582  y  alpha
3  0.161987  0.482563  y   beta


This is the z group
   dataset1  dataset2 k1     k2
4  0.988389  0.320102  z  alpha




In [26]:
for (k1,k2), group in dframe.groupby(['k1','k2']):
    print "Key1 = %s Key2 = %s" %(k1,k2)
    print group
    print "\n"

Key1 = x Key2 = alpha
   dataset1  dataset2 k1     k2
0 -1.216079 -0.801128  x  alpha


Key1 = x Key2 = beta
   dataset1  dataset2 k1    k2
1  1.768261  0.243129  x  beta


Key1 = y Key2 = alpha
   dataset1  dataset2 k1     k2
2  0.514338 -0.611582  y  alpha


Key1 = y Key2 = beta
   dataset1  dataset2 k1    k2
3  0.161987  0.482563  y  beta


Key1 = z Key2 = alpha
   dataset1  dataset2 k1     k2
4  0.988389  0.320102  z  alpha




In [30]:
group_dict = dict(list(dframe.groupby('k1')))
group_dict


{'x':    dataset1  dataset2 k1     k2
 0 -1.216079 -0.801128  x  alpha
 1  1.768261  0.243129  x   beta, 'y':    dataset1  dataset2 k1     k2
 2  0.514338 -0.611582  y  alpha
 3  0.161987  0.482563  y   beta, 'z':    dataset1  dataset2 k1     k2
 4  0.988389  0.320102  z  alpha}

In [31]:
group_dict['x']

Unnamed: 0,dataset1,dataset2,k1,k2
0,-1.216079,-0.801128,x,alpha
1,1.768261,0.243129,x,beta


In [34]:
# separate data frame by types of data along the columns
group_dict_axis1 = dict(list(dframe.groupby(dframe.dtypes, axis=1)))
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0 -1.216079 -0.801128
 1  1.768261  0.243129
 2  0.514338 -0.611582
 3  0.161987  0.482563
 4  0.988389  0.320102, dtype('O'):   k1     k2
 0  x  alpha
 1  x   beta
 2  y  alpha
 3  y   beta
 4  z  alpha}

In [39]:
dataset2_group = dframe.groupby(['k1','k2'])[['dataset2']]
dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
x,alpha,-0.801128
x,beta,0.243129
y,alpha,-0.611582
y,beta,0.482563
z,alpha,0.320102
