In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
df = DataFrame({'k1':['X','X','Y','Y','Z'],
                    'k2':['alpha','beta','alpha','beta','alpha'],
                    'dataset1':np.random.randn(5),
                    'dataset2':np.random.randn(5)})
df

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.362033,-1.39798,X,alpha
1,0.576471,0.304034,X,beta
2,0.327763,0.49265,Y,alpha
3,0.561523,0.456795,Y,beta
4,1.407548,1.455901,Z,alpha


In [15]:
# using the groupby function

# we have to specify what this the value we want to group 
# and then we figure out the column that we need to assign the group by
group1 = df['dataset1'].groupby(df['k1'])

#noticed how the groupby does not produce any output
group1

<pandas.core.groupby.SeriesGroupBy object at 0x10e2e3ba8>

In [16]:
# but we can get values for the group1 like descriptive statistics
# thus grouping it by the k1 values, we can get the mean of the dataset1
group1.mean()

k1
X    0.107219
Y    0.444643
Z    1.407548
Name: dataset1, dtype: float64

In [20]:
# using group values that are series

countries = np.array(['USA','USA','France','USA','USA'])
days = np.array(['Mon','Tues','Tues','Tues','Mon'])

# instead of passing a value in the groupby that is from the 
# actual dataframe, we can pass a series

# the caluclation is done by locating the values to its appropriate place
# for example, USA and Mon is for the first value, amd USA and Mon is also for the last value

df['dataset1'].groupby([countries,days]).mean()


France  Tues    0.327763
USA     Mon     0.522758
        Tues    0.568997
Name: dataset1, dtype: float64

In [21]:
# quicker way of using the groups in the df
# for all the columns
# look how it only includes numerical columns

df.groupby('k1').mean()

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,0.107219,-0.546973
Y,0.444643,0.474722
Z,1.407548,1.455901


In [23]:
# Or multiple column names
df.groupby(['k2','k1']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k2,k1,Unnamed: 2_level_1,Unnamed: 3_level_1
alpha,X,-0.362033,-1.39798
alpha,Y,0.327763,0.49265
alpha,Z,1.407548,1.455901
beta,X,0.576471,0.304034
beta,Y,0.561523,0.456795


In [26]:
# with the gorupsize you can also include 
# the amount of values in each group
df.groupby(['k1']).size()

k1
X    2
Y    2
Z    1
dtype: int64

In [37]:
# this is the method to check the values in the df, doing a for loop
for title,group in df.groupby(['k1']):
    print('This group is {0} \n {1}, \n'.format(title, group))

This group is X 
    dataset1  dataset2 k1     k2
0 -0.362033 -1.397980  X  alpha
1  0.576471  0.304034  X   beta, 

This group is Y 
    dataset1  dataset2 k1     k2
2  0.327763  0.492650  Y  alpha
3  0.561523  0.456795  Y   beta, 

This group is Z 
    dataset1  dataset2 k1     k2
4  1.407548  1.455901  Z  alpha, 



In [38]:
# peforming the for loop through mult key values 
for title,group in df.groupby(['k1','k2']):
    print('This group is {0} \n {1}, \n'.format(title, group))
    

This group is ('X', 'alpha') 
    dataset1  dataset2 k1     k2
0 -0.362033  -1.39798  X  alpha, 

This group is ('X', 'beta') 
    dataset1  dataset2 k1    k2
1  0.576471  0.304034  X  beta, 

This group is ('Y', 'alpha') 
    dataset1  dataset2 k1     k2
2  0.327763   0.49265  Y  alpha, 

This group is ('Y', 'beta') 
    dataset1  dataset2 k1    k2
3  0.561523  0.456795  Y  beta, 

This group is ('Z', 'alpha') 
    dataset1  dataset2 k1     k2
4  1.407548  1.455901  Z  alpha, 



In [40]:
# dict with the values from the groupby funct. 
# there's a df for each key

group_dict = dict(list(dframe.groupby('k1')))
group_dict['X']

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.175911,-0.060372,X,alpha
1,1.73154,1.400206,X,beta


In [44]:
df


Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.362033,-1.39798,X,alpha
1,0.576471,0.304034,X,beta
2,0.327763,0.49265,Y,alpha
3,0.561523,0.456795,Y,beta
4,1.407548,1.455901,Z,alpha


In [43]:
# We could have also chosen to do this with axis = 1

# Let's creat a dictionary for dtypes of objects!
group_dict_axis1 = dict(list(dframe.groupby(dframe.dtypes,axis=1)))

group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0  0.175911 -0.060372
 1  1.731540  1.400206
 2 -1.464363  0.174242
 3 -0.335959 -1.078095
 4 -0.123765  1.124376, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [47]:
# For example if we only wanted to group the dataset2 column with both sets of keys
dataset2_group = dframe.groupby(['k1','k2'])[['dataset2']]

dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,-0.060372
X,beta,1.400206
Y,alpha,0.174242
Y,beta,-1.078095
Z,alpha,1.124376


In [13]:
# quick reminder
# 2 means how many arrays, 11 means how many numbers per array, 
np.random.randn(2,11)

array([[ 1.21221637, -2.02404625,  1.47895202, -1.04869896, -0.18358592,
        -0.81716782, -1.42515506, -0.67861107, -0.25392542,  1.04347878,
        -0.67920964],
       [ 0.21202766, -0.90353809, -1.5939303 ,  0.38977993, -0.20431651,
         0.09355643,  0.59035676,  0.25183822,  0.49709711, -0.473252  ,
        -1.62459038]])