# <center>第10章 数据聚合与分组运算</center>

## 10.1 GroupBy机制

>1. 传入自身的列名时，根据该列的值分组。
2. 传入array或者list时，根据传入的值对行或者列分组。
3. 传入Series、dict或者函数时，根据index或者columns经过映射后的结果来分组。

![原理图解](https://upload-images.jianshu.io/upload_images/7178691-e5c671e09ecf94be.png?imageMogr2/auto-orient/strip|imageView2/2/w/1038/format/webp)

groupby(by=None,axis=0,level=None,as_index=True,sort=True,group_keys=True)

### 1.按照行、列、数组(Series)进行分组

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Demo frame: two string key columns plus two columns of standard-normal noise.
df=pd.DataFrame(dict(key1=list('aabba'),
                     key2=['one','two','one','two','one'],
                     data1=np.random.randn(5),
                     data2=np.random.randn(5)))
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.498771,0.275568
1,a,two,-0.487863,0.174512
2,b,one,0.234237,-0.326813
3,b,two,2.106805,0.637899
4,a,one,0.688315,0.526213


In [14]:
# Group the whole frame by the values of key1 (the key is passed as a Series).
grouped=df.groupby(df['key1'])
# `grouped` is a GroupBy object; nothing has been computed yet.
grouped
# Per-group mean. key2 holds strings and cannot be averaged, so restrict the
# computation to numeric columns explicitly (current pandas raises TypeError
# instead of silently dropping such columns).
print('组平均值\n',grouped.mean(numeric_only=True))
print('组最大值\n',grouped.max())

# Several grouping keys can be passed at once.
grouped_1=df['data1'].groupby([df['key1'],df['key2']])
x=grouped_1.mean()
print(x,'\n')
print('x.unstack\n',x.unstack())

# Group the columns by dtype. groupby(axis=1) is deprecated in pandas, so the
# rows of the transposed frame are grouped instead (each piece is transposed).
grouped_3=df.T.groupby(df.dtypes)

# Number of rows in each group.
df.groupby('key1').size()

  key1 key2     data1     data2
0    a  one -0.766575  0.738452
1    a  two  1.432219 -0.897534
2    b  one  1.855643  0.605376
3    b  two  0.096996  0.788249
4    a  one -0.405901 -0.783227
组平均值
          data1     data2
key1                    
a     0.086581 -0.314103
b     0.976320  0.696812
组最大值
      key2     data1     data2
key1                         
a     two  1.432219  0.738452
b     two  1.855643  0.788249
key1  key2
a     one    -0.586238
      two     1.432219
b     one     1.855643
      two     0.096996
Name: data1, dtype: float64 

x.unstack
 key2       one       two
key1                    
a    -0.586238  1.432219
b     1.855643  0.096996


key1
a    3
b    2
dtype: int64

In [9]:
# Grouping keys may be plain arrays, as long as they are as long as the data.
states=np.array(['Ohio','California','California','Ohio','Ohio'])
years=np.array([2005,2005,2006,2005,2006])
grouped_2=df['data1'].groupby([states,years])
for key,values in grouped_2:
    print(key)
    print(values,'\n')

('California', 2005)
1   -1.028503
Name: data1, dtype: float64 

('California', 2006)
2    1.742824
Name: data1, dtype: float64 

('Ohio', 2005)
0    0.027425
3    0.406611
Name: data1, dtype: float64 

('Ohio', 2006)
4   -0.662079
Name: data1, dtype: float64 



In [55]:
# Column names can be used directly as grouping keys.
# key2 is not numeric and cannot be averaged; numeric_only=True leaves it out
# (current pandas raises TypeError rather than dropping it silently).
print(df.groupby('key1').mean(numeric_only=True))
# Here key2 is itself a grouping key, so it moves into the result index.
df.groupby(['key1','key2']).mean()

         data1     data2
key1                    
a     0.124058 -1.071063
b    -1.041001 -0.676442


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.145388,-1.243313
a,two,0.662949,-0.726562
b,one,-0.649739,-1.034672
b,two,-1.432264,-0.318211


### 2.通过字典或者Series进行分组

In [2]:
people=pd.DataFrame(np.random.randn(5,5),
                    columns=['a','b','c','d','e'],
                    index=['Joe','Steve','Wes','Jim','Travis'])

people.iloc[2:3,[1,2]]=np.nan
print(people)
mapping={'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
# Group the columns through the mapping (unused keys such as 'f' are ignored).
# DataFrame.groupby(axis=1) is deprecated, so group the rows of the transpose
# and transpose each result back to the original orientation.
by_column=people.T.groupby(mapping)
by_column.sum().T

# A Series works the same way: it acts as a fixed-size label -> group mapping.
obj=pd.Series(mapping)
grouped=people.T.groupby(obj)
grouped.mean().T
grouped.max().T

               a         b         c         d         e
Joe    -1.406102  0.947403 -0.949774  1.095878  0.137591
Steve  -0.683156 -0.968864 -0.912956 -1.138996  0.277744
Wes     0.834193       NaN       NaN  1.283451 -1.566053
Jim    -1.204632  1.095238 -0.050929  0.001752  0.285862
Travis -1.170529 -0.804126 -1.111792 -0.039718 -0.417695


Unnamed: 0,blue,red
Joe,0.146104,-0.321108
Steve,-2.051952,-1.374276
Wes,1.283451,-0.73186
Jim,-0.049177,0.176468
Travis,-1.15151,-2.39235


Unnamed: 0,blue,red
Joe,0.073052,-0.107036
Steve,-1.025976,-0.458092
Wes,1.283451,-0.36593
Jim,-0.024589,0.058823
Travis,-0.575755,-0.79745


Unnamed: 0,blue,red
Joe,1.095878,0.947403
Steve,-0.912956,0.277744
Wes,1.283451,0.834193
Jim,0.001752,1.095238
Travis,-0.039718,-0.417695


### 3.使用函数分组

In [None]:
people=pd.DataFrame(np.random.randn(5,5),
                    index=['Joe','Steve','Wes','Jim','Travis'],
                    columns=list('abcde'))
# A function used as the key is called on every index label; here the rows
# are grouped by the length of the person's name.
grouped=people.groupby(len)
for name_len,rows in grouped:
    print(name_len)
    print(rows)
grouped.sum()
grouped.max()

# Functions can be mixed freely with lists, dicts and Series.
key_list=['one']*3+['two']*2
grouped_1=people.groupby([len,key_list])

### 4.根据索引级别分组

In [3]:
columns=pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],
                                    [1,3,5,1,3]],
                                    names=['city','tenor'])

df=pd.DataFrame(np.random.randn(4,5),columns=columns)
print(df)

# Group the columns by one level of their MultiIndex; the level can be given
# by position or by name, and both spellings below are equivalent.
# DataFrame.groupby(axis=1) is deprecated, so the transposed frame is grouped
# and each result is transposed back.
a=df.T.groupby(level=0)
b=df.T.groupby(level='city')
a.count().T
b.count().T

city         US                            JP          
tenor         1         3         5         1         3
0     -0.001180  0.133258 -1.979221 -0.697880 -0.007849
1     -1.510629  0.535590  0.739720  1.525623  0.616018
2      1.644664  0.170806 -0.456209 -0.341694 -2.500654
3     -0.803242 -0.690335  0.338538  1.150365  0.169125


city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


### 5.对分组进行迭代

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pprint

In [None]:
# Demo frame: two string key columns plus two columns of standard-normal noise.
df=pd.DataFrame(dict(key1=list('aabba'),
                     key2=['one','two','one','two','one'],
                     data1=np.random.randn(5),
                     data2=np.random.randn(5)))
df

In [29]:
# Iterating over a GroupBy yields (group key, sub-DataFrame) pairs.
for name,group in df.groupby('key1'):
    name_type,group_type=type(name),type(group)
    print(name)
    print('name的类型是 %s'%name_type,'\n')
    print(group)
    print('group的类型是%s'%group_type,'\n')

# The same pairs, materialised as a list.
x=df.groupby('key1')
pairs=list(x)
pprint.pprint(pairs)
print('\n\n')

# With several keys, the group key is a tuple of the key values.
print('有多重键的情况:\n')
for key,piece in df.groupby(['key1','key2']):
    print(key)
    print(piece,'\n')


a
name的类型是 <class 'str'> 

  key1 key2     data1     data2
0    a  one -0.884306  0.160559
1    a  two  1.675350  1.470012
4    a  one -0.576879 -0.167758
group的类型是<class 'pandas.core.frame.DataFrame'> 

b
name的类型是 <class 'str'> 

  key1 key2     data1     data2
2    b  one -0.926471  0.011493
3    b  two -0.682429  0.090910
group的类型是<class 'pandas.core.frame.DataFrame'> 

[('a',
    key1 key2     data1     data2
0    a  one -0.884306  0.160559
1    a  two  1.675350  1.470012
4    a  one -0.576879 -0.167758),
 ('b',
    key1 key2     data1     data2
2    b  one -0.926471  0.011493
3    b  two -0.682429  0.090910)]



有多重键的情况:

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.884306  0.160559
4    a  one -0.576879 -0.167758 

('a', 'two')
  key1 key2    data1     data2
1    a  two  1.67535  1.470012 

('b', 'one')
  key1 key2     data1     data2
2    b  one -0.926471  0.011493 

('b', 'two')
  key1 key2     data1    data2
3    b  two -0.682429  0.09091 



In [24]:
# Turn the grouping into a dict that maps each group key to its sub-DataFrame.
a={key:piece for key,piece in df.groupby(['key1','key2'])}
for x,piece in a.items():
    print('key is:',x)
    print('value is:\n',piece,'\n')

key is: ('a', 'one')
value is:
   key1 key2     data1     data2
0    a  one -0.495113 -0.280924
4    a  one -1.876972 -0.530198 

key is: ('a', 'two')
value is:
   key1 key2     data1    data2
1    a  two  0.599581 -0.26171 

key is: ('b', 'one')
value is:
   key1 key2     data1     data2
2    b  one  1.804314  0.980118 

key is: ('b', 'two')
value is:
   key1 key2     data1     data2
3    b  two -1.031238 -0.441783 



### 6.选取分组结果的列（语法糖）

In [33]:
# Indexing a GroupBy object with a column name (or a list of names) is
# shorthand for selecting those columns first. The result is still a lazy
# grouped object: grouped Series for one name, grouped DataFrame for a list.
a=df.groupby('key1')['data1']
b=df.groupby('key1')[['data1','data2']]
c=df.groupby(['key1','key2'])['data1']
d=df.groupby(['key1','key2'])[['data1','data2']]
a.mean()
b.mean()
c.mean()
d.mean()

# Equivalent spellings: select the columns first, then pass the keys as Series.
a1=df['data1'].groupby(df['key1'])
b1=df[['data1','data2']].groupby(df['key1'])
# c groups by both keys, so its equivalent has to pass both key Series.
c1=df['data1'].groupby([df['key1'],df['key2']])
d1=df[['data1','data2']].groupby([df['key1'],df['key2']])
# A two-column DataFrame is not a valid grouping key, so
# df['data1'].groupby(df[['key1','key2']]) does not work; pass a list of
# Series as above instead.

## 10.2 数据聚合

>数据聚合指的是从数组产生标量值的过程,mean count min sum median std var prod都是这种

### 1.groupby常用聚合函数

In [None]:
import pandas as pd
import numpy as np 

In [38]:
df=pd.DataFrame({'data1':np.random.randn(5),
                'data2':np.random.rand(5),
                'key1':['a','a','b','b','a'],
                'key2':['one','two','one','two','one']})
print(df)

# Common built-in aggregations.
grouped=df.groupby('key1')
# key2 holds strings: mean, quantile and the custom diff() below are undefined
# for it, and current pandas raises instead of silently skipping the column.
# Select the numeric columns explicitly for those reductions.
grouped_num=grouped[['data1','data2']]
print(grouped_num.mean(),'\n')
print(grouped_num.sum(),'\n')
print(grouped.count(),'\n')
print(grouped_num.quantile(0.5),'\n')

# Custom aggregation functions go through aggregate / agg.
def diff(g):
    """Return the spread (max minus min) of one group's values."""
    return g.max()-g.min()

print(grouped_num.agg(diff))

# describe() is not a pure aggregation (several values per group), but it
# can be called on the grouped object in the same way.
grouped.describe()

      data1     data2 key1 key2
0  0.060368  0.368744    a  one
1  0.039828  0.666181    a  two
2 -2.242163  0.158089    b  one
3  0.439285  0.654664    b  two
4  1.299933  0.500080    a  one
         data1     data2
key1                    
a     0.466710  0.511668
b    -0.901439  0.406377 

         data1     data2
key1                    
a     1.400129  1.535005
b    -1.802878  0.812753 

      data1  data2  key2
key1                    
a         3      3     3
b         2      2     2 

0.5      data1     data2
key1                    
a     0.060368  0.500080
b    -0.901439  0.406377 

         data1     data2
key1                    
a     1.260106  0.297438
b     2.681448  0.496574


Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.46671,0.721666,0.039828,0.050098,0.060368,0.680151,1.299933,3.0,0.511668,0.149057,0.368744,0.434412,0.50008,0.583131,0.666181
b,2.0,-0.901439,1.89607,-2.242163,-1.571801,-0.901439,-0.231077,0.439285,2.0,0.406377,0.351131,0.158089,0.282233,0.406377,0.53052,0.654664


### 2.面向列的多函数应用

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [5]:
path='D:\\文档\\Python Scripts\\data_test\\tips.csv'
tips=pd.read_csv(path)
# Tip as a fraction of the total bill.
tips['tip_pct']=tips['tip']/tips['total_bill']
tips.head(5)

grouped=tips.groupby(['day','smoker'])
grouped_pct=grouped['tip_pct']

# Built-in statistics can be requested by name: a single string or a list.
grouped_pct.agg('mean')
grouped_pct.agg(['mean','var'])
# Several columns must be selected with a list; the tuple form
# grouped['tip','size'] is no longer accepted by pandas.
grouped[['tip','size']].agg(['mean','var'])

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,var
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.000791
Fri,Yes,0.174783,0.002631
Sat,No,0.158048,0.001581
Sat,Yes,0.147906,0.003767
Sun,No,0.160113,0.001793
Sun,Yes,0.18725,0.023757
Thur,No,0.160298,0.001503
Thur,Yes,0.163863,0.001551


Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,size,size
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,var,mean,var
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,2.8125,0.807292,2.25,0.25
Fri,Yes,2.714,1.161369,2.066667,0.352381
Sat,No,3.102889,2.696453,2.555556,0.616162
Sat,Yes,2.875476,2.658791,2.47619,0.743322
Sun,No,3.167895,1.500099,2.929825,1.066416
Sun,Yes,3.516842,1.590501,2.578947,0.812865
Thur,No,2.673778,1.645997,2.488889,1.391919
Thur,Yes,3.03,1.239863,2.352941,0.492647


In [None]:
# A list of (name, function) tuples gives the result columns custom names:
# the mean is labelled 'Durchschnitt' and the variance 'Abweichung'.
# The string 'var' pins pandas' own sample variance (ddof=1).
ftuples=[('Durchschnitt','mean'),('Abweichung','var')]
grouped[['tip_pct','total_bill']].agg(ftuples)

# A dict passed to agg maps COLUMN names to functions, so it cannot serve as a
# renamer: agg(d) would look for columns named 'Durch' and 'Abwei' and fail.
# Converting the dict to (name, function) pairs gives the intended renaming.
d={'Durch':'mean','Abwei':'var'}
grouped[['tip_pct','total_bill']].agg(list(d.items()))


# Different functions for different columns: here the dict keys are columns.
d1={'tip':['mean','std'],'total_bill':'var'}
grouped.agg(d1)

## 10.3 apply：一般性的'拆分-应用-合并'

>agg和apply的区别：传给agg的函数必须把每个分组归约成一个标量值，而传给apply的函数可以返回标量、Series或者DataFrame

![apply的原理](https://upload-images.jianshu.io/upload_images/7178691-7e8bb217f599b4ae.png)

### 1.apply基本使用方法

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [3]:
path='D:\\文档\\Python Scripts\\data_test\\tips.csv'
tips=pd.read_csv(path)
# tip_pct is a derived column (tip / total_bill). Re-reading the CSV discards
# it, so rebuild it here: top() below sorts by 'tip_pct' by default.
tips['tip_pct']=tips['tip']/tips['total_bill']

def top(df,n=5,column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Rows come back in ascending order of ``column`` (largest last).
    ``tail`` is used instead of ``iloc[-n:]`` so that ``n=0`` yields an empty
    frame; ``iloc[-0:]`` would return every row.
    """
    return df.sort_values(by=column).tail(n)

top(tips,n=6)
# Apply top() per smoker group: it is called on each piece of the frame and
# the pieces are then glued back together (roughly what pandas.concat does).
by_smoker=tips.groupby('smoker',as_index=True)
by_smoker.apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


### 2.分位数和桶分析

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
frame=pd.DataFrame({'data1':np.random.randn(1000),
                    'data2':np.random.randn(1000)})
# pd.cut with an integer splits data1 into that many equal-width bins.
quantile=pd.cut(frame['data1'],bins=4)
print(quantile.head(5))

# The Categorical returned by cut can be passed straight to groupby.
# NOTE(review): the original note says get_stats may also return a plain dict;
# that is not exercised here.
def get_stats(group):
    """Summarise one group as a Series with its min, max, count and mean."""
    summary={'min':group.min(),'max':group.max(),'count':group.count(),'mean':group.mean()}
    return pd.Series(summary)

data1_by_bin=frame['data1'].groupby(quantile)
data1_by_bin.apply(get_stats)
print("\n>>>frame['data1'].groupby(quantile).apply(get_stats).unstack(level=1)")
print(data1_by_bin.apply(get_stats).unstack(level=1))

# pd.qcut cuts at sample quantiles instead, giving buckets of equal size.
grouping=pd.qcut(frame.data1,q=10)
grouped=frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack(level=1)

0    (-1.809, -0.238]
1     (-0.238, 1.332]
2     (-0.238, 1.332]
3     (-0.238, 1.332]
4     (-0.238, 1.332]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.385, -1.809] < (-1.809, -0.238] < (-0.238, 1.332] < (1.332, 2.902]]


data1                  
(-3.385, -1.809]  min       -3.379046
                  max       -1.826145
                  count     34.000000
                  mean      -2.281180
(-1.809, -0.238]  min       -1.800757
                  max       -0.239578
                  count    361.000000
                  mean      -0.820957
(-0.238, 1.332]   min       -0.237275
                  max        1.325599
                  count    524.000000
                  mean       0.465515
(1.332, 2.902]    min        1.338244
                  max        2.902329
                  count     81.000000
                  mean       1.814819
Name: data1, dtype: float64


>>>frame['data1'].groupby(quantile).apply(get_stats).unstack(level=1)
                       min       max  count      mean
data1                                                
(-3.385, -1.809] -3.379046 -1.826145   34.0 -2.281180
(-1.809, -0.238] -1.800757 -0.239578  361.0 -0.820957
(-0.238, 1.332]  -0.237275  1.325599  524.0  0.465515
(1.332, 2.902]    1.338244  2.902329   81.0  1.814819


Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.38, -1.27]",-2.057813,2.585203,100.0,0.083293
"(-1.27, -0.796]",-1.88878,3.2311,100.0,0.14925
"(-0.796, -0.473]",-2.646638,2.717312,100.0,-0.252731
"(-0.473, -0.225]",-1.909479,2.022722,100.0,0.088354
"(-0.225, 0.0483]",-2.914647,2.01566,100.0,-0.038057
"(0.0483, 0.295]",-2.464125,2.442952,100.0,-0.084027
"(0.295, 0.52]",-2.399404,1.96896,100.0,-0.167024
"(0.52, 0.841]",-2.441655,2.030577,100.0,0.137528
"(0.841, 1.256]",-2.278785,3.958332,100.0,0.016702
"(1.256, 2.902]",-3.177443,2.662122,100.0,0.043487


### 3.应用:用于特定分组的值填充

In [47]:
import numpy as np
import pandas as pd

s=pd.Series(np.random.randn(6))
s[::2]=np.nan
print(s)
# Fill the gaps with the overall mean (the result is not stored).
s.fillna(s.mean())

# Fill each group with a group-specific value.
states=['Ohio','New York','Vermont','Florida',
        'Oregon','Nevada','California','Idaho']
group_key=['East']*4+['West']*4
data=pd.Series(np.random.randn(8),index=states)
data[['Ohio','Nevada','California']]=np.nan
print(data,'\n')
x=data.groupby(group_key).mean()
print(x)
fill_method=lambda g:g.fillna(g.mean())
# group_keys=False keeps the original state index; otherwise current pandas
# prepends the group key (East/West) as an extra index level.
filled_by_mean=data.groupby(group_key,group_keys=False).apply(fill_method)
# Alternatively, predefine the fill value of every group; inside apply each
# piece carries its group key in the .name attribute.
fill_values={"East":0.5,"West":-1}
fill_method=lambda x:x.fillna(fill_values[x.name])
data.groupby(group_key,group_keys=False).apply(fill_method)


0         NaN
1    1.763458
2         NaN
3    0.386649
4         NaN
5   -0.350409
dtype: float64
Ohio               NaN
New York     -0.716705
Vermont      -0.315925
Florida      -0.252956
Oregon        0.181702
Nevada             NaN
California         NaN
Idaho         0.593452
dtype: float64
East   -0.428529
West    0.387577
dtype: float64


Ohio          0.500000
New York     -0.716705
Vermont      -0.315925
Florida      -0.252956
Oregon        0.181702
Nevada       -1.000000
California   -1.000000
Idaho         0.593452
dtype: float64

### 4.应用：随机抽样

In [50]:
import pandas as pd
import numpy as np 

# Card-drawing game: build a 52-card deck as a Series whose index is the card
# name (rank + suit letter) and whose value is the card's point value.
suits=['H','S','C','D']
card_val=(list(range(1,11))+[10]*3)*4
base_names=['A']+list(range(2,11))+['J','Q','K']
card=[str(name)+suit for suit in suits for name in base_names]

deck=pd.Series(card_val,index=card)
print(deck,'\n')

def sampling(x,n=5,replace=True):
    """Draw ``n`` random entries from ``x``.

    ``replace=True`` (the default, as before) samples with replacement, so the
    same card may be drawn more than once; pass ``replace=False`` for a
    realistic hand of distinct cards.
    """
    return x.sample(n=n,replace=replace)

# The suit is the last character of the card name.
get_suit=lambda x:x[-1]
# Draw the default 5 cards from every suit.
deck.groupby(get_suit).apply(sampling)
# Extra arguments for the function are passed through apply.
# as_index only affects aggregated output, such as
# tips.groupby('sex',as_index=False).mean().
# group_keys: when calling apply, add group keys to index to identify pieces.
deck.groupby(get_suit,group_keys=True).apply(sampling,n=3)


AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
QH     10
KH     10
AS      1
2S      2
3S      3
4S      4
5S      5
6S      6
7S      7
8S      8
9S      9
10S    10
JS     10
QS     10
KS     10
AC      1
2C      2
3C      3
4C      4
5C      5
6C      6
7C      7
8C      8
9C      9
10C    10
JC     10
QC     10
KC     10
AD      1
2D      2
3D      3
4D      4
5D      5
6D      6
7D      7
8D      8
9D      9
10D    10
JD     10
QD     10
KD     10
dtype: int64 



C  4C     4
   8C     8
   6C     6
D  QD    10
   9D     9
   JD    10
H  AH     1
   KH    10
   7H     7
S  KS    10
   7S     7
   5S     5
dtype: int64

### 5.示例：分组加权平均数和相关系数

In [None]:
import pandas as pd
import numpy as np

# Group-wise weighted average, grouping on the 'category' column.
df=pd.DataFrame({'category':list('aaaabbbb'),
                 'data':np.random.randn(8),
                 'weights':np.random.rand(8)})
grouped=df.groupby('category')

def get_wavg(g):
    """Average of g['data'] weighted by g['weights']."""
    return np.average(g['data'],weights=g['weights'])

grouped.apply(get_wavg)

## 10.4 透视表和交叉表

### 1.透视表(pivot table)

>将原有DataFrame的某些列分别作为行索引和列索引，然后对指定的列应用聚合函数，相当于先用groupby聚合、再用unstack重塑。
>
>tips.pivot_table(values,index,columns,aggfunc,margins)

In [5]:
import pandas as pd
import numpy as np

path='D:\\文档\\Python Scripts\\data_test\\tips.csv'
tips=pd.read_csv(path)
# tip_pct is a derived column (tip / total_bill). Re-reading the CSV discards
# it, so rebuild it here: the pivot tables below aggregate 'tip_pct'.
tips['tip_pct']=tips['tip']/tips['total_bill']

In [7]:
# tips.pivot_table(values, index, columns, aggfunc, margins):
#   values  - column(s) to aggregate, may be a list
#   index   - keys grouped along the rows
#   columns - keys grouped along the columns
#   aggfunc - aggregation, mean by default (any function valid for groupby)
#   margins - add subtotals when True (default False)
# The cells hold the group means; the first call adds the subtotals, the
# second one leaves them out.
pivot_args=dict(values=['size','tip_pct'],index=['day','smoker'],columns='sex')
tips.pivot_table(margins=True,**pivot_args)
tips.pivot_table(margins=False,**pivot_args)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,sex,Female,Male,All,Female,Male,All
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,2.5,2.0,2.25,0.165296,0.138005,0.15165
Fri,Yes,2.0,2.125,2.066667,0.209129,0.14473,0.174783
Sat,No,2.307692,2.65625,2.555556,0.147993,0.162132,0.158048
Sat,Yes,2.2,2.62963,2.47619,0.163817,0.139067,0.147906
Sun,No,3.071429,2.883721,2.929825,0.16571,0.158291,0.160113
Sun,Yes,2.5,2.6,2.578947,0.237075,0.173964,0.18725
Thur,No,2.48,2.5,2.488889,0.155971,0.165706,0.160298
Thur,Yes,2.428571,2.3,2.352941,0.163073,0.164417,0.163863
All,,2.45977,2.630573,2.569672,0.166491,0.157651,0.160803


Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,sex,Female,Male,Female,Male
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,2.5,2.0,0.165296,0.138005
Fri,Yes,2.0,2.125,0.209129,0.14473
Sat,No,2.307692,2.65625,0.147993,0.162132
Sat,Yes,2.2,2.62963,0.163817,0.139067
Sun,No,3.071429,2.883721,0.16571,0.158291
Sun,Yes,2.5,2.6,0.237075,0.173964
Thur,No,2.48,2.5,0.155971,0.165706
Thur,Yes,2.428571,2.3,0.163073,0.164417


### 2.交叉表(crosstab) --一种用于计算分组'频率'的特殊透视表

>crosstab(index,columns,values=None)

In [4]:
import pandas as pd
import numpy as np

data=pd.DataFrame(dict(
    Sample=[1]*10,
    Nationality=['USA','Japan','USA','Japan','Japan','Japan','USA','USA','Japan','USA'],
    Handedness=['right','left','right','left','left','right','left','right','left','left']))

# Tabulate by nationality and handedness; no `values` are passed, so the
# table holds frequencies.
help(pd.crosstab)
pd.crosstab(data['Nationality'],data['Handedness'],margins=True)
# Much the same numbers come from grouping the constant Sample column by both
# factors and summing it.
factors=[data.Nationality,data.Handedness]
grouped=data['Sample'].groupby(factors)
grouped.sum()

Help on function crosstab in module pandas.core.reshape.pivot:

crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, margins_name='All', dropna=True, normalize=False)
    Compute a simple cross-tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and an
    aggregation function are passed
    
    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Requires `aggfunc` be specified.
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    aggfunc : function, optional
 

Nationality  Handedness
Japan        left          4
             right         1
USA          left          2
             right         3
Name: Sample, dtype: int64