In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame,Series
import re
import datetime
import os.path

* 层次化索引
* 数据的联合及合并
* 重塑及转轴

## 一、层次化索引
> 层次化索引（hierarchical indexing）是pandas的一项重要功能，它使你能在一个轴上拥有多个（两个以上）索引级别。抽象点说，它使你能以低维度形式处理高维度数据。

In [2]:
# Series with a two-level (hierarchical) index: the outer level holds the
# letters, the inner level holds the integers. Values are random draws, so
# the numbers differ on every run.
data = pd.Series(
    np.random.randn(9),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 3, 1, 2, 2, 3],
    ],
)


In [3]:
# Inspect the MultiIndex that pandas built from the two label lists.
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])

数据选取

In [4]:
# Label-based slice on the outer level; both endpoints are inclusive.
data.loc['a':'b']

a  1   -1.397390
   2   -1.641580
   3   -1.552073
b  1    1.785600
   3   -0.294151
dtype: float64

In [6]:
# Keep every outer label, but only inner labels 2 and 3.
data.loc[pd.IndexSlice[:, [2, 3]]]

a  2   -1.641580
   3   -1.552073
b  3   -0.294151
c  2    0.089667
d  2   -0.779014
   3   -0.286791
dtype: float64

数据堆叠及展开

In [8]:
# Round trip: pivot the inner level into columns, then pivot it back into
# the row index.
data.unstack(level=-1).stack(level=-1)

a  1   -1.397390
   2   -1.641580
   3   -1.552073
b  1    1.785600
   3   -0.294151
c  1    0.786713
   2    0.089667
d  2   -0.779014
   3   -0.286791
dtype: float64

In [9]:
# 4x3 frame with hierarchical labels on both axes:
# rows are (letter, number), columns are (state, color).
frame = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[
        ['Ohio', 'Ohio', 'Colorado'],
        ['Green', 'Red', 'Green'],
    ],
)

In [11]:
# Partial indexing on the outer row level: all rows under 'a', all columns.
frame.loc['a']

Unnamed: 0_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Green,Red,Green
1,0,1,2
2,3,4,5


In [13]:
# Build a named MultiIndex once and keep a reference to it so it can be
# reused (for example as the `columns=` of several frames).
state_color_index = pd.MultiIndex.from_arrays(
    [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
    names=['state', 'color'],
)
state_color_index

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=['state', 'color'])

### 重排与分级排序

In [14]:
# Exchange the two row-index levels; the data itself is not reordered.
frame.swaplevel(i=0, j=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [16]:
# Sort the rows by the inner (second) index level.
frame.sort_index(axis=0, level=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


### 根据级别汇总统计

In [17]:
# Sum rows that share the same inner row label.
# `DataFrame.sum(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# grouping by the index level is the supported equivalent.
frame.groupby(level=1).sum()

Unnamed: 0_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Green,Red,Green
1,6,8,10
2,12,14,16


In [18]:
# Sum columns that share the same inner column label (the color).
# `DataFrame.sum(level=..., axis=1)` was removed in pandas 2.0 and
# `groupby(axis=1)` is deprecated, so group the transposed frame by its
# inner level and transpose the result back.
frame.T.groupby(level=1).sum().T

Unnamed: 0,Unnamed: 1,Green,Red
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### 列索引

In [19]:
# Flat frame whose 'c' and 'd' columns are later moved into the index.
frame1 = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'd': [0, 1, 2, 0, 1, 2, 3],
})


In [20]:
# Move column 'c' out of the data and use it as the row index.
frame2 = frame1.set_index(keys='c')

In [21]:
# Display frame2: frame1 with column 'c' now serving as the row index.
frame2

Unnamed: 0_level_0,a,b,d
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,7,0
one,1,6,1
one,2,5,2
two,3,4,0
two,4,3,1
two,5,2,2
two,6,1,3


In [23]:
# Turn the index back into an ordinary column, then restore the
# original a, b, c, d column order.
frame2.reset_index().reindex(columns=['a', 'b', 'c', 'd'])

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


## 二、合并数据集
> pandas对象中的数据可以通过一些方式进行合并：

> * pandas.merge可根据一个或多个键将不同DataFrame中的行连接起来。SQL或其他关系型数据库的用户对此应该会比较熟悉，因为它实现的就是数据库的join操作。
> * pandas.concat可以沿着一条轴将多个对象堆叠到一起。
> * 实例方法combine_first可以将重复数据编接在一起，用一个对象中的值填充另一个对象中的缺失值。

In [19]:
# Left table: 'key' values repeat (many side of the join).
df1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
# Right table: each 'key' appears once (one side of the join).
df2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'data2': range(3),
})

In [20]:
# Display the left-hand table before merging.
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [21]:
# Join df1 and df2 on the shared 'key' column (default inner join),
# then use 'key' as the row index of the result.
df3 = df1.merge(df2, on='key').set_index('key')

In [9]:
# Show the merged rows ordered by their 'key' index label.
df3.sort_index(axis=0)

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2,0
a,4,0
a,5,0
b,0,1
b,1,1
b,6,1


In [10]:
# Same data as before, but the join column has a different name in each
# table ('lkey' vs 'rkey').
# NOTE: this rebinds `df3`, which an earlier cell used for a merge result.
df3 = pd.DataFrame({
    'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
df4 = pd.DataFrame({
    'rkey': ['a', 'b', 'd'],
    'data2': range(3),
})

In [15]:
# Left join: keep every row of df3 and match its 'lkey' against df4's 'rkey'.
df5 = df3.merge(df4, how='left', left_on='lkey', right_on='rkey')

In [16]:
# Display the left-join result; left rows with no matching 'rkey' get NaN
# in the columns that come from df4.
df5

Unnamed: 0,data1,lkey,data2,rkey
0,0,b,1.0,b
1,1,b,1.0,b
2,2,a,0.0,a
3,3,c,,
4,4,a,0.0,a
5,5,a,0.0,a
6,6,b,1.0,b


多对多的合并有些不直观

In [18]:
# Both tables repeat their 'key' values, which sets up a many-to-many merge.
df6 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'b'],
    'data1': range(6),
})
df7 = pd.DataFrame({
    'key': ['a', 'b', 'a', 'b', 'd'],
    'data2': range(5),
})

In [22]:
# Inner join on the only shared column, 'key'. Because keys repeat on both
# sides, each key yields the Cartesian product of its matching rows.
df8 = df6.merge(df7, on='key', how='inner')

In [23]:
# Display the many-to-many result: 3 'b' rows in df6 x 2 in df7 give 6 rows,
# and 2 'a' rows x 2 give 4 rows.
df8

Unnamed: 0,data1,key,data2
0,0,b,1
1,0,b,3
2,1,b,1
3,1,b,3
4,5,b,1
5,5,b,3
6,2,a,0
7,2,a,2
8,4,a,0
9,4,a,2
