In [1]:
import pandas as pd
import numpy as np

In [2]:
# Seed NumPy's global RNG so the random columns below (and any later
# np.random draws when the notebook is run top to bottom) are identical on
# every Restart & Run All. Re-run the notebook to refresh the stored outputs.
np.random.seed(12345)

# Toy frame for the groupby examples: two label columns and two columns of
# standard-normal samples.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.667446,-0.306873
1,a,two,1.424041,-0.491577
2,b,one,-0.282181,-0.51924
3,b,two,-1.529936,1.107258
4,a,one,1.068817,-0.581885


In [3]:
# Group the data1 values by the labels in key1. Nothing is aggregated here:
# the cell only displays the resulting SeriesGroupBy object, which the next
# cell reduces with mean().
groupped = df['data1'].groupby(df['key1'])
groupped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f723c435790>

In [4]:
# Mean of data1 within each key1 group; the result is indexed by key1.
groupped.mean()

key1
a    1.053435
b   -0.906059
Name: data1, dtype: float64

In [6]:
# Passing two key Series groups on both at once; the resulting Series of
# means carries a two-level (key1, key2) index.
means = (
    df['data1']
    .groupby([df['key1'], df['key2']])
    .mean()
)
means

key1  key2
a     one     0.868132
      two     1.424041
b     one    -0.282181
      two    -1.529936
Name: data1, dtype: float64

In [8]:
# Pivot the inner index level (key2) into columns, leaving key1 as the rows.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.868132,1.424041
b,-0.282181,-1.529936


In [9]:
# Grouping keys do not have to be columns of the frame: here two standalone
# arrays, one entry per row of df, are used as the keys.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2020, 2020, 2021, 2020, 2021])
(
    df['data1']
    .groupby([states, years])
    .mean()
)

California  2020    1.424041
            2021   -0.282181
Ohio        2020   -0.431245
            2021    1.068817
Name: data1, dtype: float64

In [10]:
# Average the numeric columns (data1, data2) per key1 group. key2 holds
# strings, so it is excluded explicitly: since pandas 2.0 a groupby mean no
# longer drops non-numeric columns on its own and raises a TypeError instead.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.053435,-0.460112
b,-0.906059,0.294009


In [13]:
# Group by two columns by name; both keys move into the row index and the
# remaining columns (data1, data2) are averaged.
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.868132,-0.444379
a,two,1.424041,-0.491577
b,one,-0.282181,-0.51924
b,two,-1.529936,1.107258


In [14]:
# Number of rows in each key1 group.
df.groupby('key1').size()

key1
a    3
b    2
dtype: int64

In [15]:
# Iterating over a GroupBy yields (group label, sub-frame) pairs; print each
# label followed by its rows.
for label, rows in df.groupby('key1'):
    print(label, rows, sep='\n')

a
  key1 key2     data1     data2
0    a  one  0.667446 -0.306873
1    a  two  1.424041 -0.491577
4    a  one  1.068817 -0.581885
b
  key1 key2     data1     data2
2    b  one -0.282181 -0.519240
3    b  two -1.529936  1.107258


In [16]:
# With two keys the group label is a 2-tuple, unpacked here into its parts
# and printed back as a tuple ahead of the matching rows.
for (first_key, second_key), rows in df.groupby(['key1', 'key2']):
    print((first_key, second_key), rows, sep='\n')

('a', 'one')
  key1 key2     data1     data2
0    a  one  0.667446 -0.306873
4    a  one  1.068817 -0.581885
('a', 'two')
  key1 key2     data1     data2
1    a  two  1.424041 -0.491577
('b', 'one')
  key1 key2     data1    data2
2    b  one -0.282181 -0.51924
('b', 'two')
  key1 key2     data1     data2
3    b  two -1.529936  1.107258


In [17]:
# Materialise the groups as a {key1 label: sub-frame} dictionary.
pieces = {label: rows for label, rows in df.groupby('key1')}
pieces

{'a':   key1 key2     data1     data2
 0    a  one  0.667446 -0.306873
 1    a  two  1.424041 -0.491577
 4    a  one  1.068817 -0.581885,
 'b':   key1 key2     data1     data2
 2    b  one -0.282181 -0.519240
 3    b  two -1.529936  1.107258}

In [18]:
# Look up the sub-frame stored under the 'b' label.
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-0.282181,-0.51924
3,b,two,-1.529936,1.107258


In [19]:
# dtype of each column, as a Series indexed by column name.
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [20]:
# Split the columns of df by dtype. groupby(..., axis=1) is deprecated since
# pandas 2.1, so the rows of the transposed frame are grouped instead;
# df.dtypes is indexed by column name and therefore lines up with df.T's index.
# Note that each group produced by `grouped` is a slice of df.T.
grouped = df.T.groupby(df.dtypes)

for key, group in grouped:
    print(key)
    # Transpose back to the original orientation. df.T is object-typed because
    # df mixes strings and floats, so re-infer the column dtypes before printing.
    print(group.T.infer_objects())

float64
      data1     data2
0  0.667446 -0.306873
1  1.424041 -0.491577
2 -0.282181 -0.519240
3 -1.529936  1.107258
4  1.068817 -0.581885
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [27]:
# Select the data1 column first, then group it by the key1 values; the cell
# displays the resulting SeriesGroupBy object.
df['data1'].groupby(df['key1'])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f7212e95390>

In [21]:
# Group the whole frame by key1, then select data1 from the grouped object;
# this also displays a SeriesGroupBy.
df.groupby('key1')['data1']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f7212ee5890>

In [22]:
# Selecting with a list of column names ([['data2']]) keeps the aggregated
# result a one-column DataFrame.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.444379
a,two,-0.491577
b,one,-0.51924
b,two,1.107258


In [25]:
# A list selection on the grouped object yields a DataFrameGroupBy.
df.groupby('key1')[['data2']]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f7213165a50>

In [35]:
# Grouping the one-column frame yields one-column DataFrame pieces ...
for label, frame_piece in df[['data2']].groupby(df['key1']):
    print(label, frame_piece, sep='\n')

# ... while selecting 'data2' from the grouped object yields Series pieces.
for label, series_piece in df[['data2']].groupby(df['key1'])['data2']:
    print(label, series_piece, sep='\n')

a
      data2
0 -0.306873
1 -0.491577
4 -0.581885
b
      data2
2 -0.519240
3  1.107258
a
0   -0.306873
1   -0.491577
4   -0.581885
Name: data2, dtype: float64
b
2   -0.519240
3    1.107258
Name: data2, dtype: float64


In [36]:
# Group by both keys and keep only data2 (selected by a single name, so the
# aggregate below is a Series), then take the mean of each group.
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped.mean()

key1  key2
a     one    -0.444379
      two    -0.491577
b     one    -0.519240
      two     1.107258
Name: data2, dtype: float64

In [40]:
# 5x5 frame of standard-normal samples: rows labelled by name, columns a-e.
people = pd.DataFrame(
    np.random.randn(5, 5),
    columns=list('abcde'),
    index=['Joe', 'Steve', 'Wes', 'JY', 'Travis'],
)
# Introduce missing values in Wes's row (third row) for columns b and c.
people.loc['Wes', ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.699597,0.707508,0.73234,0.287943,0.589898
Steve,-0.481787,1.626557,1.099465,0.259362,2.074823
Wes,0.601962,,,1.559252,0.937417
JY,0.323324,0.506834,0.931334,1.881462,0.911507
Travis,-0.835919,1.926948,-2.238896,-1.055022,0.001502


In [41]:
# Column label -> group name. The 'f' entry has no matching column in
# `people` (its columns are a-e).
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}

In [43]:
# Sum the columns of `people` per colour group. groupby(..., axis=1) is
# deprecated since pandas 2.1, so the transposed frame is grouped by row
# label instead (`by_column` therefore groups people.T) and the aggregate is
# transposed back so the people stay on the rows.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,1.020283,1.997003
Steve,1.358827,3.219594
Wes,1.559252,1.539379
JY,2.812796,1.741665
Travis,-3.293919,1.092532


In [44]:
# The same label -> group mapping as a Series: the dict keys become the index
# and the group names become the values.
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [45]:
# Count the non-missing values per colour group for each person.
# groupby(..., axis=1) is deprecated since pandas 2.1, so group the rows of
# the transposed frame and transpose the counts back.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
JY,2,3
Travis,2,3


In [57]:
# A function used as the key is applied to each index label; with len the
# rows are grouped by the length of the person's name.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
2,0.323324,0.506834,0.931334,1.881462,0.911507
3,1.301559,0.707508,0.73234,1.847195,1.527315
5,-0.481787,1.626557,1.099465,0.259362,2.074823
6,-0.835919,1.926948,-2.238896,-1.055022,0.001502


In [49]:
# A function key and a plain list of labels (one per row of `people`) can be
# combined in a single groupby.
key_list = ['one'] * 3 + ['two'] * 2

people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
2,two,0.323324,0.506834,0.931334,1.881462,0.911507
3,one,0.601962,0.707508,0.73234,0.287943,0.589898
5,one,-0.481787,1.626557,1.099465,0.259362,2.074823
6,two,-0.835919,1.926948,-2.238896,-1.055022,0.001502


In [61]:
# Two-level column index with levels named 'cty' and 'tenor'.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'KR', 'KR'],
        [1, 3, 5, 1, 3],
    ],
    names=['cty', 'tenor'],
)

# 4x5 frame of standard-normal samples under the hierarchical columns.
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

cty,US,US,US,KR,KR
tenor,1,3,5,1,3
0,-0.159974,-0.933109,0.371802,-0.501291,1.105041
1,0.62676,0.990488,-0.115109,0.520255,0.232616
2,0.723956,0.040794,0.908082,-0.640042,-0.206976
3,-0.820562,-0.163894,0.578518,1.340069,-0.111623


In [62]:
# Count the columns under each 'cty' value, row by row.
# groupby(..., axis=1) is deprecated since pandas 2.1, so group the
# transposed frame on its 'cty' index level and transpose the counts back.
hier_df.T.groupby(level='cty').count().T

cty,KR,US
0,2,3
1,2,3
2,2,3
3,2,3
