In [3]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [4]:
# Build a small demo frame: two categorical key columns (k1, k2)
# and two columns of random draws (dataset1, dataset2) to aggregate.
dframe = pd.DataFrame(dict(
    k1=['X', 'X', 'Y', 'Y', 'Z'],
    k2=['alpha', 'beta', 'alpha', 'beta', 'alpha'],
    dataset1=np.random.randn(5),
    dataset2=np.random.randn(5),
))

# Display the frame
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.123544,1.924614,X,alpha
1,-1.448666,0.477115,X,beta
2,-1.139759,-1.378362,Y,alpha
3,-0.617664,-0.105714,Y,beta
4,-0.573748,0.409242,Z,alpha


In [5]:
#Now let's see how to use groupby

#Let's grab the dataset1 column and group it by the k1 key.
#The key is passed as a Series (dframe['k1']) rather than as a column name.
group1 = dframe['dataset1'].groupby(dframe['k1'])

#Show the groupby object (the output is only the object's repr, not aggregated values)
group1

<pandas.core.groupby.SeriesGroupBy object at 0x000000000AF84780>

In [6]:
#Now we can perform operations on this particular group
group1.mean()

k1
X    -0.786105
Y    -0.878712
Z    -0.573748
Name: dataset1, dtype: float64

In [7]:
# Group keys don't have to be columns of the frame: arrays with one entry per row work as well

#For example:

#We'll make two NumPy arrays (five entries each, one per row of dframe) for use as keys
cities = np.array(['NY','LA','LA','NY','NY'])
month = np.array(['JAN','FEB','JAN','FEB','JAN'])

#Now take dataset1, group it by (city, month) and compute the mean of each group
dframe['dataset1'].groupby([cities,month]).mean()

LA  FEB   -1.448666
    JAN   -1.139759
NY  FEB   -0.617664
    JAN   -0.348646
Name: dataset1, dtype: float64

In [8]:
# let's see the original dframe again.
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.123544,1.924614,X,alpha
1,-1.448666,0.477115,X,beta
2,-1.139759,-1.378362,Y,alpha
3,-0.617664,-0.105714,Y,beta
4,-0.573748,0.409242,Z,alpha


In [9]:
# We can also pass column names as group keys.
# numeric_only=True restricts the mean to the numeric columns (dataset1, dataset2);
# without it, pandas >= 2.0 raises a TypeError when it tries to average the string column k2.
dframe.groupby('k1').mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,-0.786105,1.200865
Y,-0.878712,-0.742038
Z,-0.573748,0.409242


In [10]:
# Or multiple column names
dframe.groupby(['k1','k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,-0.123544,1.924614
X,beta,-1.448666,0.477115
Y,alpha,-1.139759,-1.378362
Y,beta,-0.617664,-0.105714
Z,alpha,-0.573748,0.409242


In [11]:
# Another useful groupby method is getting the group sizes
dframe.groupby(['k1']).size()

k1
X     2
Y     2
Z     1
dtype: int64

In [12]:
# We can also iterate over groups

#For example:
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs.
# print() is called as a function so the cell also runs on Python 3.
for name,group in dframe.groupby('k1'):
    print("This is the %s group" % name)
    print(group)
    print('\n')

This is the X group
   dataset1  dataset2 k1     k2
0 -0.123544  1.924614  X  alpha
1 -1.448666  0.477115  X   beta


This is the Y group
   dataset1  dataset2 k1     k2
2 -1.139759 -1.378362  Y  alpha
3 -0.617664 -0.105714  Y   beta


This is the Z group
   dataset1  dataset2 k1     k2
4 -0.573748  0.409242  Z  alpha




In [13]:
# We can also iterate with multiple keys.
# With two grouping columns each group key is a (k1, k2) tuple, unpacked in the loop header.
# print() is called as a function so the cell also runs on Python 3.
for (k1,k2) , group in dframe.groupby(['k1','k2']):
    print("Key1 = %s Key2 = %s" % (k1, k2))
    print(group)
    print('\n')

Key1 = X Key2 = alpha
   dataset1  dataset2 k1     k2
0 -0.123544  1.924614  X  alpha


Key1 = X Key2 = beta
   dataset1  dataset2 k1    k2
1 -1.448666  0.477115  X  beta


Key1 = Y Key2 = alpha
   dataset1  dataset2 k1     k2
2 -1.139759 -1.378362  Y  alpha


Key1 = Y Key2 = beta
   dataset1  dataset2 k1    k2
3 -0.617664 -0.105714  Y  beta


Key1 = Z Key2 = alpha
   dataset1  dataset2 k1     k2
4 -0.573748  0.409242  Z  alpha




In [14]:
# A handy trick: materialise the groups as a plain dict,
# with one entry per k1 value mapping the group key to its sub-DataFrame
group_dict = {key: piece for key, piece in dframe.groupby('k1')}

# Look up the rows that belong to group X
group_dict['X']

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.123544,1.924614,X,alpha
1,-1.448666,0.477115,X,beta


In [15]:
# Columns can be split into groups as well; here we bucket them by dtype.
# Older pandas spelled this dframe.groupby(dframe.dtypes, axis=1), but axis=1 in groupby
# is deprecated since pandas 2.1, so the columns of each dtype are selected directly instead.

# Let's create a dictionary mapping each dtype to the sub-frame of columns that have it
group_dict_axis1 = {dtype: dframe.loc[:, dframe.dtypes == dtype]
                    for dtype in dframe.dtypes.unique()}

#show
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0 -0.123544  1.924614
 1 -1.448666  0.477115
 2 -1.139759 -1.378362
 3 -0.617664 -0.105714
 4 -0.573748  0.409242, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [16]:
# Next we'll learn how to select specific columns from a groupby before aggregating

In [18]:
# For example, to aggregate only the dataset2 column while grouping on both keys,
# select it from the GroupBy with a list of column names ([['dataset2']])
dataset2_group = dframe.groupby(['k1','k2'])[['dataset2']]

dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,1.924614
X,beta,0.477115
Y,alpha,-1.378362
Y,beta,-0.105714
Z,alpha,0.409242


In [19]:
#Next we'll have a quick lesson on grouping with dictionaries and series!

<pandas.core.groupby.DataFrameGroupBy object at 0x000000000AFEF908>

# Notes

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Demo frame: key columns k1/k2 plus two columns of random draws to aggregate
dframe = pd.DataFrame(dict(
    k1=['X', 'X', 'Y', 'Y', 'Z'],
    k2=['alpha', 'beta', 'alpha', 'beta', 'alpha'],
    dataset1=np.random.randn(5),
    dataset2=np.random.randn(5),
))
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,0.147817,-0.069499
1,X,beta,1.045045,0.755254
2,Y,alpha,-2.677353,0.374413
3,Y,beta,0.267743,-0.559807
4,Z,alpha,-0.255173,-0.391034


In [4]:
# Group the dataset1 column by the values of k1 (the key is passed as a Series)
group1 = dframe['dataset1'].groupby(dframe['k1'])
group1

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x1275198d0>

In [5]:
# Mean of dataset1 within each k1 group
group1.mean()

k1
X    0.596431
Y   -1.204805
Z   -0.255173
Name: dataset1, dtype: float64

In [6]:
# External group keys: one city label and one month label per row of dframe
cities = np.array(['NY', 'LA', 'LA', 'NY', 'NY'])

month = np.array(['JAN', 'FEB', 'JAN', 'FEB', 'JAN'])

In [7]:
# Mean of dataset1 for each (city, month) combination; the keys are the arrays, not frame columns
dframe['dataset1'].groupby([cities, month]).mean()

LA  FEB    1.045045
    JAN   -2.677353
NY  FEB    0.267743
    JAN   -0.053678
Name: dataset1, dtype: float64

In [8]:
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,0.147817,-0.069499
1,X,beta,1.045045,0.755254
2,Y,alpha,-2.677353,0.374413
3,Y,beta,0.267743,-0.559807
4,Z,alpha,-0.255173,-0.391034


In [9]:
# Group by the k1 column and average the numeric columns.
# numeric_only=True is needed on pandas >= 2.0, where averaging the string column k2 raises a TypeError.
dframe.groupby('k1').mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,0.596431,0.342878
Y,-1.204805,-0.092697
Z,-0.255173,-0.391034


In [10]:
# Group on both key columns; the result is indexed by (k1, k2) pairs
dframe.groupby(['k1', 'k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,0.147817,-0.069499
X,beta,1.045045,0.755254
Y,alpha,-2.677353,0.374413
Y,beta,0.267743,-0.559807
Z,alpha,-0.255173,-0.391034


In [11]:
# Number of rows in each k1 group
dframe.groupby(['k1']).size()

k1
X    2
Y    2
Z    1
dtype: int64

In [14]:
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs, one per distinct k1 value
for name, group in dframe.groupby('k1'):
    print("This is the {} group.".format(name))
    print(group)
    print('\n')  # blank lines to separate the groups in the output

This is the X group.
  k1     k2  dataset1  dataset2
0  X  alpha  0.147817 -0.069499
1  X   beta  1.045045  0.755254


This is the Y group.
  k1     k2  dataset1  dataset2
2  Y  alpha -2.677353  0.374413
3  Y   beta  0.267743 -0.559807


This is the Z group.
  k1     k2  dataset1  dataset2
4  Z  alpha -0.255173 -0.391034




In [15]:
# With two grouping columns each group key is a (k1, k2) tuple, unpacked in the loop header
for (k1, k2), group in dframe.groupby(['k1', 'k2']):
    print("Key1 = %s   Key2 = %s" % (k1, k2))
    print(group)
    print('\n')  # blank lines to separate the groups in the output

Key1 = X   Key2 = alpha
  k1     k2  dataset1  dataset2
0  X  alpha  0.147817 -0.069499


Key1 = X   Key2 = beta
  k1    k2  dataset1  dataset2
1  X  beta  1.045045  0.755254


Key1 = Y   Key2 = alpha
  k1     k2  dataset1  dataset2
2  Y  alpha -2.677353  0.374413


Key1 = Y   Key2 = beta
  k1    k2  dataset1  dataset2
3  Y  beta  0.267743 -0.559807


Key1 = Z   Key2 = alpha
  k1     k2  dataset1  dataset2
4  Z  alpha -0.255173 -0.391034




In [16]:
# Collect every k1 group into a plain dict: group key -> sub-DataFrame
group_dict = {key: piece for key, piece in dframe.groupby('k1')}
group_dict

{'X':   k1     k2  dataset1  dataset2
 0  X  alpha  0.147817 -0.069499
 1  X   beta  1.045045  0.755254, 'Y':   k1     k2  dataset1  dataset2
 2  Y  alpha -2.677353  0.374413
 3  Y   beta  0.267743 -0.559807, 'Z':   k1     k2  dataset1  dataset2
 4  Z  alpha -0.255173 -0.391034}

In [17]:
group_dict['X']

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,0.147817,-0.069499
1,X,beta,1.045045,0.755254


In [44]:
# Separate the columns by dtype: map each dtype to the sub-frame of columns that have it.
# groupby(dframe.dtypes, axis=1) did this in older pandas, but axis=1 in groupby is
# deprecated since pandas 2.1, so the columns of each dtype are selected directly.
group_dict_axis1 = {dtype: dframe.loc[:, dframe.dtypes == dtype]
                    for dtype in dframe.dtypes.unique()}
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0  0.147817 -0.069499
 1  1.045045  0.755254
 2 -2.677353  0.374413
 3  0.267743 -0.559807
 4 -0.255173 -0.391034, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [47]:
# Group on both keys but keep only the dataset2 column (selected with a list of column names)
dataset2_group = dframe.groupby(['k1', 'k2'])[['dataset2']]

# Mean of dataset2 for each (k1, k2) pair
dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,-0.069499
X,beta,0.755254
Y,alpha,0.374413
Y,beta,-0.559807
Z,alpha,-0.391034
