# <center>第8章 数据规整:聚合、合并和重塑</center>

## 8.1 层次化索引

### 1.层次化索引
- Series的层次化索引

In [3]:
import pandas as pd 
import numpy as np 

# A Series whose index has two levels: an outer letter and an inner integer.
data = pd.Series(
    np.random.randn(9),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 3, 1, 2, 2, 3],
    ],
)
data

# Partial indexing: pick one outer label, or one inner label across all outer labels.
data['b']
data[:, 1]
# These forms do not work: data['b':'c', 1], data['b', 'd']
data[['b', 'c']]
# Several outer labels must be passed as a list: a tuple is read as ONE
# multi-level key (outer 'b', inner 'd'), not as two outer labels.
data.loc[['b', 'd']]
data
# Selection can also be done directly on the inner level.
data.loc[:, 2]
data.loc[['a', 'd'], :]
# Hierarchical indexing plays an important role in reshaping and in group-based operations.

# For a DataFrame, either axis can carry a hierarchical index.
frame = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)
print(frame)
frame.Ohio.Red
frame['Ohio', 'Red']

# A full key across levels is written as a tuple ...
frame.loc[('a', 1), ('Ohio', 'Green')]
# ... while several labels within one level are written as a list.
frame.loc[(['a', 'b'], 2), ('Ohio', 'Red')]

# Every level can be given a name.
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']

# A MultiIndex can be built on its own and then reused.
a = pd.MultiIndex.from_arrays(
    [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
    names=['state', 'color'],
)

     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  2     9  10       11


- DataFrame的层次化索引

In [None]:
import pandas as pd
import numpy as np

# For a DataFrame, either axis can carry a hierarchical index.
frame = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)
print(frame)
frame.Ohio.Red
frame['Ohio', 'Red']

# A full key across levels is written as a tuple ...
frame.loc[('a', 1), ('Ohio', 'Green')]
# ... while several labels within one level are written as a list
# (a tuple in that position is not the documented way to list labels).
frame.loc[(['a', 'b'], 2), ('Ohio', 'Red')]

# Every level can be given a name.
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']

# A MultiIndex can be built on its own and then reused.
a = pd.MultiIndex.from_arrays(
    [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
    names=['state', 'color'],
)

### 2.重排与分级排序

In [None]:
import pandas as pd 
import numpy as np

# Reorder the levels on one axis.
frame = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)
# swaplevel below refers to levels by name, and this freshly built frame has
# unnamed levels, so both axes are named first.
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame.swaplevel('key1', 'key2')
frame.swaplevel('color', 'state', axis=1)
# DataFrame.sort_index(level=...) sorts the data by the values of a single level.
frame.sort_index(level=1)

# Summary statistics by level.
frame = pd.DataFrame(
    np.random.randn(5, 5),
    index=[['a', 'b', 'a', 'f', 'b'], ['ss', 'nn', 'ss', 'ff', 'nn']],
    columns=['java', 'c++', 'julia', 'perl', 'ruby'],
)
# The `level` argument of sum() is no longer available in current pandas;
# grouping on the level is the replacement. sort=False keeps the groups in
# order of first appearance.
frame.groupby(level=1, sort=False).sum()

## 8.2 合并数据集

### 1.数据库风格的DataFrame合并

>merge(left, right, how='inner'/'outer'/'left'/'right', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None).suffixes用于重叠的列的后缀;indicator在how='outer'时，给不同来源的行进行标识

In [None]:
import pandas as pd
import numpy as np

# Two small frames that share a 'key' column.
df1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
df2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'data2': range(3),
})

# `how` may be 'outer', 'inner', 'right' or 'left'.
df1.merge(df2, on='key', how='inner')
# When the join columns are named differently, left_on / right_on name the
# column on each side.
# how='right' / how='left' keep every key of the corresponding frame.
# `suffixes` are appended to column names that occur on both sides.
df1.merge(df2, left_on='data1', right_on='data2', suffixes=['_a', '_b'])

df1.merge(df2, how='outer', on='key', indicator=True)

### 2.多对多的合并有些不直观

In [None]:
import pandas as pd
import numpy as np 

# Both frames repeat key values, so the merge is many-to-many.
df1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'b'],
    'data1': range(6),
})
df2 = pd.DataFrame({
    'key': ['a', 'b', 'a', 'b', 'd'],
    'date2': range(5),
})
df1.merge(df2)
# To merge on several keys, pass a list of column names.
left = pd.DataFrame({
    'key1': ['foo', 'foo', 'bar'],
    'key2': ['one', 'two', 'one'],
    'lval': [1, 2, 3],
})
right = pd.DataFrame({
    'key1': ['foo', 'foo', 'bar', 'bar'],
    'key2': ['one', 'one', 'one', 'two'],
    'rval': [4, 5, 6, 7],
})
print(left, '\n', right)
left.merge(right, on=['key1', 'key2'], how='outer')

In [None]:
# Merging on the index.
# Sometimes the join key of a DataFrame lives in its index;
# passing left_index=True or right_index=True handles that case.
left = pd.DataFrame({
    'key': ['a', 'b', 'a', 'a', 'b', 'c'],
    'value': range(6),
})
right = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
left.merge(right, left_on='key', right_index=True)
# DataFrame also has a convenient join method for merging by index, but it
# requires that the two frames have no overlapping column names.
help(left.join)
# Join on the index; `how` defaults to 'left'.
left.join(right, how='outer')
# `on` names a column of `left` that is matched against the index of `right`.
left.join(right, on='key', how='inner')

### 3.Pandas的concat函数

>merge只能连接行，concat既可以连接行，又可以连接列

>concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=None, copy=True).axis=0默认竖直方向上连接,keys表示在index中标明数据来源,names与keys对应，对keys命名；verify_integrity表示检查是否存在重复index，默认不检测。注意：join_axes参数在pandas 1.0中已被移除，可对concat的结果调用reindex来代替

In [None]:
#numpy提供的轴向连接方法
import pandas as pd
import numpy as np

# Concatenating NumPy arrays along an axis: two copies of a 3x4 array are
# placed side by side (axis=1).
arr = np.arange(12).reshape((3, 4))
np.concatenate((arr, arr), axis=1)

- Series的连接

In [None]:
import pandas as pd 
import numpy as np

s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'd'])
s3 = pd.Series([5, 4], index=['a', 'e'])

pd.concat([s1, s2])  # stack the pieces vertically
pd.concat([s1, s2], ignore_index=True)
pd.concat([s1, s2], keys=['s1', 's2'])  # keys label where each piece came from
# `names` must be a list of level names; a bare string would be split into
# single characters.
pd.concat([s1, s2], keys=['s1', 's2'], names=['aa'])

# concat has no `name` keyword; `names` labels the axis built from `keys`.
pd.concat([s1, s3], axis=1, keys=['s1', 's3'], names=['aa'])
# The join_axes argument is gone from current pandas; reindexing the result
# selects the labels to keep on the other axis.
pd.concat([s1, s3], axis=1, keys=['mm', 'aa']).reindex(['a', 'b'])


- DataFrame的连接

In [None]:
import pandas as pd
import numpy as np

# The keyword for naming a flat Index is `name` (singular).
a = pd.Index(['julia', 'python', 'c++'], name='language')
b = pd.Index(['a', 'v', 'f'], name='vv')
obj1 = pd.DataFrame(np.random.randint(0, 3, (3, 3)), index=a, columns=b)
obj2 = pd.DataFrame(np.random.randint(0, 3, (3, 3)), columns=['a', 'f', 's'])
# The join_axes argument is gone from current pandas; reindexing the
# concatenated result keeps only the requested columns.
pd.concat([obj1, obj2], axis=0, keys=['hello', 'Python']).reindex(
    columns=pd.Index(['a', 'f'])
)
# If the row index carries no relevant information, pass ignore_index=True.

df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])

pd.concat([df1, df2], join='inner')  # join='inner' keeps only the shared columns
pd.concat([df1, df2], keys=['df1', 'df2'], names=['julia'])

## 8.3 重塑和轴向旋转

### 1.重塑层次化索引

In [None]:
import pandas as pd
import numpy as np

# A 2x3 frame with named row and column indexes.
data = pd.DataFrame(
    np.arange(6).reshape((2, 3)),
    index=pd.Index(['ohio', 'colortdo'], name='state'),
    columns=pd.Index(['one', 'two', 'three'], name='number'),
)

# stack pivots the columns into an inner row level; unstack reverses it,
# addressing the level either by position or by name.
data.stack()
a = data.stack()
a.unstack(level=1)
a.unstack(level='number')
# unstack may introduce missing data when not every value is present in
# each group.
# Note that the index values must not be duplicated.
s1 = pd.Series([1, 34, 3, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['s1', 's2'])
print(data2)
data2.unstack()
a = data2.unstack()
print(a)
a.stack()

### 2.将'长格式'旋转为'宽格式'

In [None]:
import pandas as pd 
import numpy as np
data = pd.read_csv('data_test\\macrodata.csv')
# Build a quarterly PeriodIndex from the year and quarter columns.
# NOTE(review): newer pandas releases deprecate constructing PeriodIndex from
# field keywords in favour of PeriodIndex.from_fields - confirm against the
# installed version.
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='data')
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
data1 = data.reindex(columns=columns)
# Convert the periods to timestamps (end of each period).
data1.index = periods.to_timestamp('D', 'end')
periods.to_timestamp('D', 'start')
ldata = data1.stack().reset_index().rename(columns={0: 'value'})
# DataFrame.pivot is roughly the inverse of stack, but more flexible; it can
# be used to build pivot-style tables.
print(ldata)
# DataFrame.pivot(index=?, columns=?, values=?)
ldata.pivot(index='data', columns='item', values='value')
# Without the last argument the resulting DataFrame has hierarchical columns.
# The new column must match the number of rows in ldata, so its length is
# taken from ldata rather than hard-coded.
ldata['data2'] = np.random.randn(len(ldata))
ldata.pivot(index='data', columns='item')


### 3.将宽格式转换为长格式

In [None]:
# The inverse of pivot.
# pd.melt(df, id_vars=<columns kept as identifiers>, var_name=<name of the
# variable column>, value_name=<name of the value column, 'value' by default>)
df = pd.DataFrame({
    'key': ['foo', 'bar', 'baz'],
    'A': [1, 3, 4],
    'B': [4, 5, 6],
    'C': [7, 8, 9],
})
print(df)
a = df.melt(id_vars=['key', 'A'], var_name='hello')
print(a)
# Restore the wide layout with pivot.
# NOTE(review): the original author observed that pivot did not seem to
# accept list arguments in the pandas version used - confirm for the
# installed version.
a.pivot(index='key', columns='hello', values='value')