In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Build a small demo frame: two categorical key columns ('k1', 'k2') and
# two columns of standard-normal noise to aggregate.
# A dedicated, seeded RandomState makes the numbers identical on every
# Restart & Run All without touching NumPy's global random state.
# (Outputs saved from earlier, unseeded runs differ until re-executed.)
rng = np.random.RandomState(0)

dframe = DataFrame({'k1':['X','X','Y','Y','Z'],
                   'k2':['alpha','beta','alpha','beta','alpha'],
                   'dataset1':rng.randn(5),
                   'dataset2':rng.randn(5)})

dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.891506,1.922186,X,alpha
1,1.855738,1.117859,X,beta
2,1.353094,0.481845,Y,alpha
3,0.562344,0.718297,Y,beta
4,0.120985,-0.15245,Z,alpha


In [3]:
# Calling groupby on a single column (a Series) and passing another
# column as the key produces a SeriesGroupBy object; nothing is
# computed until an aggregation is requested on it

group1 = dframe['dataset1'].groupby(by=dframe['k1'])

group1

<pandas.core.groupby.SeriesGroupBy object at 0x113a6e350>

In [4]:
# Aggregations are applied per group: here the mean of 'dataset1'
# for each distinct value of 'k1'

group1.mean()

k1
X    1.373622
Y    0.957719
Z    0.120985
Name: dataset1, dtype: float64

In [5]:
# Group keys do not have to be columns of the frame: any arrays with one
# entry per row (here plain NumPy arrays) can serve as keys as well

cities = np.array(['NY', 'LA', 'LA', 'NY', 'NY'])

month = np.array(['JAN', 'FEB', 'JAN', 'FEB', 'JAN'])

In [6]:
# Mean of 'dataset1' for every (city, month) combination; the two
# external key arrays are matched to the rows by position

dframe['dataset1'].groupby(by=[cities, month]).mean()

LA  FEB    1.855738
    JAN    1.353094
NY  FEB    0.562344
    JAN    0.506246
Name: dataset1, dtype: float64

In [7]:
# Show the full frame again for reference
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.891506,1.922186,X,alpha
1,1.855738,1.117859,X,beta
2,1.353094,0.481845,Y,alpha
3,0.562344,0.718297,Y,beta
4,0.120985,-0.15245,Z,alpha


In [8]:
# A column name can be passed directly as the group key.
# The numeric columns are selected explicitly before aggregating:
# 'k2' holds strings, and taking the mean of a non-numeric column
# raises a TypeError on current pandas instead of being dropped silently.

dframe.groupby('k1')[['dataset1', 'dataset2']].mean()

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,1.373622,1.520023
Y,0.957719,0.600071
Z,0.120985,-0.15245


In [9]:
# A list of column names groups on every combination of their values;
# the result is indexed by a (k1, k2) MultiIndex

dframe.groupby(by=['k1', 'k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,0.891506,1.922186
X,beta,1.855738,1.117859
Y,alpha,1.353094,0.481845
Y,beta,0.562344,0.718297
Z,alpha,0.120985,-0.15245


In [10]:
# size() reports how many rows fall into each group

dframe.groupby('k1').size()

k1
X    2
Y    2
Z    1
dtype: int64

In [11]:
# Iterating over a GroupBy object yields (group key, sub-DataFrame)
# pairs, one per distinct value of 'k1'.
# print is used in its function form so the cell also runs on Python 3;
# with a single parenthesised argument it prints the same text on Python 2.

for name, group in dframe.groupby('k1'):
    print("This is the %s group" % name)
    print(group)
    print('\n')  # spacer between consecutive groups

This is the X group
   dataset1  dataset2 k1     k2
0  0.891506  1.922186  X  alpha
1  1.855738  1.117859  X   beta


This is the Y group
   dataset1  dataset2 k1     k2
2  1.353094  0.481845  Y  alpha
3  0.562344  0.718297  Y   beta


This is the Z group
   dataset1  dataset2 k1     k2
4  0.120985  -0.15245  Z  alpha




In [12]:
# With several group keys, each iteration yields a tuple of key values
# (unpacked here into k1 and k2) together with the matching rows.
# print is used in its function form so the cell also runs on Python 3;
# with a single parenthesised argument it prints the same text on Python 2.

for (k1, k2), group in dframe.groupby(['k1','k2']):
    print("Key1 = %s Key2 = %s" % (k1, k2))
    print(group)
    print('\n')  # spacer between consecutive groups

Key1 = X Key2 = alpha
   dataset1  dataset2 k1     k2
0  0.891506  1.922186  X  alpha


Key1 = X Key2 = beta
   dataset1  dataset2 k1    k2
1  1.855738  1.117859  X  beta


Key1 = Y Key2 = alpha
   dataset1  dataset2 k1     k2
2  1.353094  0.481845  Y  alpha


Key1 = Y Key2 = beta
   dataset1  dataset2 k1    k2
3  0.562344  0.718297  Y  beta


Key1 = Z Key2 = alpha
   dataset1  dataset2 k1     k2
4  0.120985  -0.15245  Z  alpha




In [13]:
# Show the full frame again for reference
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.891506,1.922186,X,alpha
1,1.855738,1.117859,X,beta
2,1.353094,0.481845,Y,alpha
3,0.562344,0.718297,Y,beta
4,0.120985,-0.15245,Z,alpha


In [14]:
# Materialise the groups as a plain dict that maps each 'k1' value to
# the rows of dframe carrying that value
group_dict = {key: rows for key, rows in dframe.groupby('k1')}

In [15]:
# Look up the sub-DataFrame stored under the key 'X'
group_dict['X']

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.891506,1.922186,X,alpha
1,1.855738,1.117859,X,beta


In [19]:
# Columns can be grouped as well as rows: build a dict that maps each
# dtype to the sub-DataFrame of columns having that dtype.
# The columns are bucketed explicitly instead of calling
# groupby(dframe.dtypes, axis=1), because the axis=1 argument of
# DataFrame.groupby is deprecated in recent pandas releases.

columns_by_dtype = {}
for column in dframe.columns:
    columns_by_dtype.setdefault(dframe[column].dtype, []).append(column)

group_dict_axis1 = dict((dtype, dframe[columns])
                        for dtype, columns in columns_by_dtype.items())

In [20]:
# Display the dict: one sub-DataFrame per dtype key
group_dict_axis1 

{dtype('float64'):    dataset1  dataset2
 0  0.891506  1.922186
 1  1.855738  1.117859
 2  1.353094  0.481845
 3  0.562344  0.718297
 4  0.120985 -0.152450, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [21]:
# A GroupBy object can be narrowed to a subset of columns before
# aggregating; indexing with a list keeps the result a DataFrame

dataset2_group = dframe.groupby(by=['k1', 'k2'])[['dataset2']]

dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,1.922186
X,beta,1.117859
Y,alpha,0.481845
Y,beta,0.718297
Z,alpha,-0.15245


In [22]:
# Show the full frame again for reference
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.891506,1.922186,X,alpha
1,1.855738,1.117859,X,beta
2,1.353094,0.481845,Y,alpha
3,0.562344,0.718297,Y,beta
4,0.120985,-0.15245,Z,alpha
