In [1]:
from pandas import Series, DataFrame, MultiIndex
import pandas as pd
import numpy as np
from numpy import nan as NA

In [2]:
# Draw a 7x3 block of standard-normal values, punch NaN holes into the raw
# array, and only then wrap it in a DataFrame.
raw_values = np.random.randn(7, 3)
raw_values[:4, 1] = NA  # first four rows of column 1 are missing
raw_values[:2, 2] = NA  # first two rows of column 2 are missing
df = DataFrame(raw_values)
print(df)

          0         1         2
0 -0.437180       NaN       NaN
1 -0.450245       NaN       NaN
2  0.151723       NaN  0.111935
3 -0.707274       NaN  0.159460
4 -0.428655  0.278711  1.060489
5  0.534986 -0.265240  0.141418
6  0.207018  0.277816  0.230827


In [3]:
# Replace every NaN with 0. fillna returns a new frame here, so df is untouched.
zero_filled = df.fillna(0)
print(zero_filled)

          0         1         2
0 -0.437180  0.000000  0.000000
1 -0.450245  0.000000  0.000000
2  0.151723  0.000000  0.111935
3 -0.707274  0.000000  0.159460
4 -0.428655  0.278711  1.060489
5  0.534986 -0.265240  0.141418
6  0.207018  0.277816  0.230827


In [4]:
# 传一个字典，不同列填不同值
print(df.fillna({1:0.5,3:-1}))


          0         1         2
0 -0.437180  0.500000       NaN
1 -0.450245  0.500000       NaN
2  0.151723  0.500000  0.111935
3 -0.707274  0.500000  0.159460
4 -0.428655  0.278711  1.060489
5  0.534986 -0.265240  0.141418
6  0.207018  0.277816  0.230827


In [5]:
# fillna returns a new object by default, but inplace=True modifies df itself.
# In that mode the call returns None, so there is nothing worth capturing
# (and binding it to `_` would shadow IPython's last-result variable).
df.fillna(0, inplace=True)
print(df)

          0         1         2
0 -0.437180  0.000000  0.000000
1 -0.450245  0.000000  0.000000
2  0.151723  0.000000  0.111935
3 -0.707274  0.000000  0.159460
4 -0.428655  0.278711  1.060489
5  0.534986 -0.265240  0.141418
6  0.207018  0.277816  0.230827


In [6]:
# Fresh 6x3 random frame whose NaNs sit at the bottom of columns 1 and 2,
# so that forward-filling has something to propagate.
raw_values = np.random.randn(6, 3)
raw_values[2:, 1] = NA  # column 1 is missing from row 2 onward
raw_values[4:, 2] = NA  # column 2 is missing from row 4 onward
df = DataFrame(raw_values)
print(df)

          0         1         2
0 -0.087906  0.115722 -1.461898
1  0.771215 -0.415849 -1.848660
2  0.560959       NaN  1.124725
3  1.583438       NaN  0.135028
4  0.093211       NaN       NaN
5  0.307024       NaN       NaN


In [7]:
# Forward-fill: carry the last valid value in each column down over the NaNs.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
print(df.ffill())

          0         1         2
0 -0.087906  0.115722 -1.461898
1  0.771215 -0.415849 -1.848660
2  0.560959 -0.415849  1.124725
3  1.583438 -0.415849  0.135028
4  0.093211 -0.415849  0.135028
5  0.307024 -0.415849  0.135028


In [8]:
# Forward-fill at most 2 consecutive NaN rows per gap; anything further stays NaN.
# DataFrame.ffill(limit=...) replaces the deprecated fillna(method='ffill', limit=...).
print(df.ffill(limit=2))

          0         1         2
0 -0.087906  0.115722 -1.461898
1  0.771215 -0.415849 -1.848660
2  0.560959 -0.415849  1.124725
3  1.583438 -0.415849  0.135028
4  0.093211       NaN  0.135028
5  0.307024       NaN  0.135028


In [9]:
# Fill the gaps with the mean of the observed values (mean() skips the NaNs).
data = Series([1, NA, 3.5, NA, 7])
observed_mean = data.mean()
print(data.fillna(observed_mean))

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64


## 层次化索引

In [12]:
# Two parallel label lists: the first becomes the outer index level and the
# second the inner level of the resulting hierarchical index.
outer_keys = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd']
inner_keys = [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]
data = Series(np.random.randn(10), index=[outer_keys, inner_keys])
print(data)

a  1   -1.032649
   2   -0.944436
   3    0.275388
b  1    0.633245
   2    0.490437
   3   -0.121062
c  1    1.026703
   2    1.406398
d  2    0.107782
   3   -0.320768
dtype: float64


In [13]:
# Inspect the two-level MultiIndex pandas built from the nested label lists.
hier_index = data.index
print(hier_index)

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 2),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )


In [15]:
# Partial indexing on the outer level: a single label drops that level,
# while a label slice keeps both levels.
print(data.loc['b'])
print(data.loc['b':'c'])


1    0.633245
2    0.490437
3   -0.121062
dtype: float64
b  1    0.633245
   2    0.490437
   3   -0.121062
c  1    1.026703
   2    1.406398
dtype: float64


In [16]:
# Pick several outer-level groups at once.
wanted_groups = ['b', 'd']
print(data.loc[wanted_groups])
# Take every outer label, then keep only the rows whose inner label is 2.
print(data.loc[:, 2])

b  1    0.633245
   2    0.490437
   3   -0.121062
d  2    0.107782
   3   -0.320768
dtype: float64
a   -0.944436
b    0.490437
c    1.406398
d    0.107782
dtype: float64


In [18]:
# unstack pivots the inner index level into columns (missing pairs become NaN);
# stack is the inverse and restores the original long Series.
wide = data.unstack()
print(wide)
print(wide.stack())

          1         2         3
a -1.032649 -0.944436  0.275388
b  0.633245  0.490437 -0.121062
c  1.026703  1.406398       NaN
d       NaN  0.107782 -0.320768
a  1   -1.032649
   2   -0.944436
   3    0.275388
b  1    0.633245
   2    0.490437
   3   -0.121062
c  1    1.026703
   2    1.406398
d  2    0.107782
   3   -0.320768
dtype: float64


In [19]:
# Both axes of a DataFrame can carry a hierarchical index: give each axis a
# pair of parallel label lists (outer level first, inner level second).
row_labels = [['a', 'a', 'b', 'b'], [1, 2, 1, 3]]
column_labels = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=row_labels,
                  columns=column_labels)
print(frame)

     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  3     9  10       11


In [20]:
# Give every level of both axes a name; the names show up in the printed header.
index_level_names = ['key1', 'key2']
column_level_names = ['state', 'color']
frame.index.names = index_level_names
frame.columns.names = column_level_names
print(frame)

state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     3        9  10       11


In [21]:
# Select a whole column group by its outer-level label; the result keeps only
# the inner ('color') level as its columns.
ohio_columns = frame['Ohio']
print(ohio_columns)


color      Green  Red
key1 key2            
a    1         0    1
     2         3    4
b    1         6    7
     3         9   10


In [None]:
# Build the column MultiIndex once and keep it under a name so it can actually
# be reused (e.g. passed as `columns=` when constructing other frames).
state_color_index = MultiIndex.from_arrays(
    [['Ohio', 'Ohio', 'Colorado'],
     ['Green', 'Red', 'Green']],
    names=['state', 'color'],
)
state_color_index  # last expression of the cell, so it is still displayed


## 重排分级顺序

In [24]:
# Re-display the hierarchical frame as the starting point for reordering levels.
print(str(frame))

state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     3        9  10       11


In [25]:
# Exchange the two row-index levels by name; the data itself is not reordered.
print(frame.swaplevel(i='key1', j='key2'))

state      Ohio     Colorado
color     Green Red    Green
key2 key1                   
1    a        0   1        2
2    a        3   4        5
1    b        6   7        8
3    b        9  10       11


In [27]:
# Sort the column labels (axis=1): by the outer level first, then the inner level.
# `axis` is passed by keyword because the positional form is deprecated.
print(frame.sort_index(axis=1))


state     Colorado  Ohio    
color        Green Green Red
key1 key2                   
a    1           2     0   1
     2           5     3   4
b    1           8     6   7
     3          11     9  10


  print(frame.sort_index(1))


In [28]:
# Swap the two row levels by position, then sort the rows so that equal key2
# values end up next to each other.
swapped = frame.swaplevel(0, 1)
print(swapped.sort_index())

state      Ohio     Colorado
color     Green Red    Green
key2 key1                   
1    a        0   1        2
     b        6   7        8
2    a        3   4        5
3    b        9  10       11


## 根据级别汇总统计

In [29]:
# Sum the rows that share the same 'key2' label.
# groupby(level=...).sum() replaces the deprecated sum(level=...) shortcut.
print(frame.groupby(level='key2').sum())

state  Ohio     Colorado
color Green Red    Green
key2                    
1         6   8       10
2         3   4        5
3         9  10       11


  print(frame.sum(level='key2'))


In [32]:
# Sum the columns that share the same 'color' label. The deprecated
# sum(level=..., axis=1) shortcut is replaced by grouping on the transposed
# frame (so the column level becomes a row level) and transposing back.
print(frame.T.groupby(level='color').sum().T)

color      Green  Red
key1 key2            
a    1         2    1
     2         8    4
b    1        14    7
     3        20   10


  print(frame.sum(level='color', axis=1))


## 使用DataFrame的列

In [33]:
# Flat frame whose 'c' and 'd' columns will later serve as index levels.
frame = DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': list(range(3)) + list(range(4)),
})
print(frame)

   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3


In [34]:
# Move columns 'c' and 'd' into the row index (as outer and inner level).
frame2 = frame.set_index(keys=['c', 'd'])
print(frame2)

       a  b
c   d      
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1


In [35]:
# set_index removes the chosen columns by default; drop=False keeps them as
# regular columns in addition to using them as the index.
index_columns = ['c', 'd']
print(frame.set_index(index_columns, drop=False))


       a  b    c  d
c   d              
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3


In [36]:
# The inverse of set_index: move the index levels back into ordinary columns.
restored = frame2.reset_index()
print(restored)


     c  d  a  b
0  one  0  0  7
1  one  1  1  6
2  one  2  2  5
3  two  0  3  4
4  two  1  4  3
5  two  2  5  2
6  two  3  6  1


## 整数索引

In [37]:
# Float Series with the default integer index 0..2.
ser = Series(np.arange(3, dtype=float))
print(ser)

0    0.0
1    1.0
2    2.0
dtype: float64


In [38]:
# Same values, but labelled with a non-integer (string) index.
ser2 = Series(np.arange(3, dtype=float), index=list('abc'))
print(ser2)

a    0.0
b    1.0
c    2.0
dtype: float64


In [39]:
# iloc is purely positional: this slice keeps only the first row.
print(ser.iloc[0:1])

0    0.0
dtype: float64


In [40]:
# With an arbitrary integer index, iloc still addresses by position:
# position 2 is the row labelled 3.
ser3 = Series(np.arange(3, dtype=float), index=[-5, 1, 3])
third_value = ser3.iloc[2]
print(third_value)

2.0


In [41]:
# The row labels are shuffled integers; iloc[0] returns the first row by
# position, which is the one labelled 2.
grid = np.arange(6).reshape((3, 2))
frame = DataFrame(grid, index=[2, 0, 1])
print(frame.iloc[0])

0    0
1    1
Name: 2, dtype: int32


## 面板数据

In [42]:
# NOTE(review): `pandas.io.data` and `pd.Panel` are not available in current
# pandas releases (see the pandas release notes), so this section presumably
# only runs on a legacy install — confirm the target pandas version first.
import pandas.io.data as web

# Build a Panel keyed by ticker symbol; each value is whatever
# web.get_data_yahoo returns for that ticker (presumably a network download of
# price history — TODO confirm).
pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk))
                       for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))

In [None]:
# Display the Panel object as the cell output.
pdata

In [None]:
# Exchange the 'items' and 'minor' axes, then look up 'Adj Close' on the result.
# NOTE: this cell rebinds pdata, so running it a second time swaps the axes back.
pdata = pdata.swapaxes('items', 'minor')
pdata['Adj Close']

In [None]:
# Fix the middle axis at the label '6/1/2012' and keep everything along the
# other two axes.
# NOTE(review): the `.ix` indexer has been removed from current pandas; `.loc`
# is the label-based replacement — confirm before porting.
pdata.ix[:, '6/1/2012', :]

In [None]:
# Select 'Adj Close' on the first axis and slice the middle axis from the
# label '5/22/2012' onward.
# NOTE(review): relies on the removed `.ix` indexer — confirm before porting.
pdata.ix['Adj Close', '5/22/2012':, :]

In [None]:
# Slice the middle axis from '5/30/2012' onward, convert the result with
# to_frame(), and display it.
# NOTE(review): relies on the removed `.ix` indexer — confirm before porting.
stacked = pdata.ix[:, '5/30/2012':, :].to_frame()
stacked

In [None]:
# Convert the stacked frame back with to_panel() (the reverse of the
# to_frame() call used to build `stacked`).
stacked.to_panel()