## 聚合与分组操作

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a small demo frame: two categorical key columns (key1, key2) and two
# numeric columns filled with standard-normal noise.
# The global NumPy generator is seeded first so that a fresh kernel run
# reproduces the same numbers; without a seed every run yields different data.
np.random.seed(42)
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})

In [3]:
# Show the whole five-row demo frame.
df

Unnamed: 0,data1,data2,key1,key2
0,-0.9305,-0.47317,a,one
1,0.114664,1.723362,a,two
2,2.46755,0.86583,b,one
3,0.907677,-0.227096,b,two
4,0.363572,-1.365865,a,one


In [8]:
# Group the data1 values, using the key1 column (passed as a Series) as the
# grouping key. Nothing is computed yet; the cell only shows the GroupBy object.
grouped = df['data1'].groupby(by=df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x11ae43518>

In [9]:
# Mean of data1 within each key1 group.
grouped.mean()

key1
a   -0.150754
b    1.687613
Name: data1, dtype: float64

## 传入多个分组键（key）

In [14]:
# Mean of data1 for every (key1, key2) combination; passing a list of key
# Series produces a result with a two-level index.
means = (
    df['data1']
    .groupby([df['key1'], df['key2']])
    .mean()
)

In [15]:
# Inspect the result: a Series indexed by the (key1, key2) pairs.
means

key1  key2
a     one    -0.283464
      two     0.114664
b     one     2.467550
      two     0.907677
Name: data1, dtype: float64

In [17]:
# Reshaping / pivoting the two-level result into a table:
#   stack   - rotates the columns into the rows
#   unstack - rotates the rows into the columns
# Here the innermost row-index level (key2) becomes the column labels.
means.unstack(level=-1)

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.283464,0.114664
b,2.46755,0.907677


## 分组信息包含在同一个DataFrame中

In [18]:
# Mean of every numeric column per key1 group.
# `numeric_only=True` is needed because `df` also holds the string column key2:
# pandas >= 2.0 no longer drops non-numeric columns silently and raises a
# TypeError when asked to average them.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.150754,-0.038558
b,1.687613,0.319367


In [22]:
# Number of rows in each key1 group.
df.groupby(by=['key1']).size()

key1
a    3
b    2
dtype: int64

In [20]:
# Mean of the remaining (numeric) columns for every (key1, key2) pair.
df.groupby(by=['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.283464,-0.919517
a,two,0.114664,1.723362
b,one,2.46755,0.86583
b,two,0.907677,-0.227096


In [21]:
# Number of rows for every (key1, key2) pair.
df.groupby(by=['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

## 遍历分组

In [24]:
# Column dtypes; used below as the key for grouping the columns by type.
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [26]:
# Group the columns of `df` by their dtype.
# `groupby(..., axis=1)` is deprecated since pandas 2.1 (and slated for
# removal), so the rows of the transposed frame are grouped instead and each
# piece is transposed back. Transposing a mixed-dtype frame yields object
# columns, hence `infer_objects()` to restore the numeric dtypes.
# `grouped` is materialised as a list of (dtype, sub-frame) pairs, so it is
# iterated exactly like the GroupBy object it replaces.
grouped = [(dtype, part.T.infer_objects())
           for dtype, part in df.T.groupby(df.dtypes)]

In [29]:
# Walk over the groups: every item unpacks into the group key (a dtype) and
# the sub-frame belonging to it; print the key, then the sub-frame.
for dtype, group in grouped:
    print(dtype, group, sep='\n')

float64
      data1     data2
0 -0.930500 -0.473170
1  0.114664  1.723362
2  2.467550  0.865830
3  0.907677 -0.227096
4  0.363572 -1.365865
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


## 列的子集

In [31]:
# Group the frame by key1, then pick a single column before aggregating.
df.groupby(by='key1')['data1'].mean()

key1
a   -0.150754
b    1.687613
Name: data1, dtype: float64

In [33]:
# Selecting with a list (or array) of column names yields a grouped DataFrame,
# so the aggregated result is a DataFrame too.
df.groupby(by=['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.919517
a,two,1.723362
b,one,0.86583
b,two,-0.227096


In [32]:
# Selecting with a single column name as a scalar yields a grouped Series,
# so the aggregated result is a Series.
df.groupby(by=['key1', 'key2'])['data2'].mean()

key1  key2
a     one    -0.919517
      two     1.723362
b     one     0.865830
      two    -0.227096
Name: data2, dtype: float64