In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

## 重新索引

In [4]:
# Float Series whose labels are given in the order d, b, a, c.
obj = pd.Series(data=[4.5, 7.2, -5.3, 3.6], index=list('dbac'))
print(obj)
# reindex returns a new Series laid out in the requested label order; a label
# the original does not have ('e') comes back as NaN.
obj2 = obj.reindex(list('abcde'))
print(obj2)
# fill_value is used instead of NaN for labels missing from the original.
print(obj.reindex(list('abcde'), fill_value=0))

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64


In [6]:
# String-valued Series labelled 0, 2 and 4.
obj3 = pd.Series(data=['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
# Fill the gaps while reindexing onto 0..5: 'ffill' carries the previous value
# forward, 'bfill' takes the following value instead.
for fill_method in ('ffill', 'bfill'):
    print(obj3.reindex(range(6), method=fill_method))

0      blue
2    purple
4    yellow
dtype: object
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object


In [8]:
# 3x3 integer frame with rows labelled a, c, d and one column per state.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
print(frame)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8


In [9]:
# Passing a single sequence to DataFrame.reindex re-labels the rows; the
# label 'b' is new, so its row is filled with NaN.
frame2 = frame.reindex(index=list('abcd'))
print(frame2)

   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0


In [10]:
# Reindex along the column axis; 'Utah' is not an existing column and is
# therefore filled with NaN.
states = 'Texas Utah California'.split()
print(frame.reindex(columns=states))

   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8


In [14]:
print(frame)
# Reindex rows and columns in one call, then forward-fill; the fill runs
# along axis 0 (down the rows), so the all-NaN 'Utah' column stays empty.
print(frame.reindex(index=list('abcd'), columns=states).ffill(axis=0))

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
   Texas  Utah  California
a    1.0   NaN         2.0
b    1.0   NaN         2.0
c    4.0   NaN         5.0
d    7.0   NaN         8.0


## 丢弃指定轴上的项

In [23]:
obj = Series(np.arange(5), index=['a','b','c','d','e'])
print(obj)
# drop returns a new object; the source Series is not modified.
new_obj = obj.drop('c')
print(new_obj)
# A list of labels removes several entries at once.
print(obj.drop(['d','c']))
# Print the source once more to show it still holds all five entries
# (this is the fourth Series in the recorded output of this cell).
print(obj)

a    0
b    1
c    2
d    3
e    4
dtype: int32
a    0
b    1
d    3
e    4
dtype: int32
a    0
b    1
e    4
dtype: int32
a    0
b    1
c    2
d    3
e    4
dtype: int32


In [24]:
# On a DataFrame, drop can remove labels from either axis:
# axis is {0 or 'index', 1 or 'columns'}.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print(data)
# axis=0 (the default) looks the labels up in the row index.
print(data.drop(labels=['Colorado', 'Ohio'], axis=0))

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15


In [25]:
# axis='columns' is the named form of axis=1: drop column labels, not rows.
print(data.drop('two', axis='columns'))
print(data.drop(['two', 'four'], axis='columns'))

          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15
          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14


## 索引、选取和过滤

In [27]:
obj = Series(np.arange(4), index=['a','b','c','d'])
print(obj)
# Lookup by label.
print(obj['b'])
# Lookup by position: use .iloc explicitly. Writing obj[1] on a
# string-labelled Series relies on a positional fallback that pandas 2.1
# deprecated (integer keys are to be treated as labels in the future).
print(obj.iloc[1])


a    0
b    1
c    2
d    3
dtype: int32
1
1


In [28]:
# Positional slice (end excluded) ...
print(obj.iloc[2:4])
# ... and selection by a list of labels, returned in the listed order.
print(obj.loc[['b', 'a', 'd']])

c    2
d    3
dtype: int32
b    1
a    0
d    3
dtype: int32


In [29]:
# Select positions 1 and 3. .iloc makes the positional intent explicit;
# obj[[1, 3]] on a string-labelled Series depends on the positional fallback
# deprecated in pandas 2.1.
print(obj.iloc[[1, 3]])
# A boolean mask keeps the entries for which the condition is True.
print(obj[obj < 2])

b    1
d    3
dtype: int32
a    0
b    1
dtype: int32


In [30]:
# A slice by label includes its end point.
print(obj.loc['b':'c'])

b    1
c    2
dtype: int32


In [31]:
# Assignment through a label slice overwrites, in place, every element the
# slice covers ('b' and 'c' here).
obj.loc['b':'c'] = 5
print(obj)

a    0
b    5
c    5
d    3
dtype: int32


In [2]:
# 4x4 integer frame (values 0..15) used by the DataFrame indexing examples.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print(data)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [3]:
# One column, returned as a Series.
print(data.two)
# A list of column labels returns a DataFrame with the columns in the
# listed order.
print(data[['three', 'one']])

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12


In [4]:
# A slice inside [] selects rows (here the first two).
print(data[0:2])
# A boolean Series inside [] keeps the rows where it is True.
print(data[data.three > 5])


          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [5]:
# Element-wise comparison yields a boolean DataFrame of the same shape.
print(data.lt(5))
# Assigning through that boolean DataFrame overwrites, in place, every cell
# whose value is below 5.
data[data.lt(5)] = 0
print(data)

            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False
          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [20]:
# .loc selects by label: a list of row labels, and the columns from 'two'
# through 'three' (label slices include both ends).
print(data.loc[['Colorado'], 'two':'three'])
# NOTE(review): the disabled call below passes integer positions to .loc on
# string-labelled columns; it presumably needs .iloc instead — verify.
# print(data.loc[['Colorado','Utah'],[3,0,1]])

          two  three
Colorado    5      6


In [39]:
# Row at integer position 2, returned as a Series.
print(data.iloc[2, :])
# Rows from 'Ohio' up to and including 'Utah', restricted to column 'two'.
print(data.loc['Ohio':'Utah', 'two'])
# NOTE(review): disabled call kept from the original; .iloc presumably does
# not accept a boolean Series as a row mask — verify before re-enabling.
#print(data.iloc[data.three>5,:3])

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32
Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32


In [17]:
# xs returns a single row (default axis) or a single column by its label.
print(data.xs(key='Ohio'))
print(data.xs(key='two', axis='columns'))

one      0
two      0
three    0
four     0
Name: Ohio, dtype: int32
Ohio         0
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32


## 算术运算和数据对齐

In [22]:
# Two Series whose labels only partly overlap ('a', 'c' and 'e' are shared).
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=list('acde'))
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=list('acefg'))
print(s1)
print(s2)

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64


In [23]:
# Addition aligns on labels; a label present in only one operand yields NaN.
print(s1.add(s2))

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64


In [25]:
# Two frames that overlap only partly in both their row and column labels.
df1 = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)
df2 = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
print(df1)
print(df2)

          b  c  d
Ohio      0  1  2
Texas     3  4  5
Colorado  6  7  8
        b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11


In [26]:
# Alignment happens on rows and columns; only cells present in both frames
# get a value, everything else is NaN.
print(df1.add(df2))

            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


In [27]:
# Frames with default integer row labels; df2 has one extra row and an
# extra column 'e'.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.arange(20).reshape(4, 5), columns=['a', 'b', 'c', 'd', 'e'])
print(df1)
print(df2)

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19


In [28]:
# Plain addition: row 3 and column 'e' exist only in df2, so they are NaN.
print(df1.add(df2))

      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0  11.0  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN


In [29]:
# Cells that df1 lacks are treated as 0 before the addition.
print(df1.add(other=df2, fill_value=0))


      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0  11.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0


In [30]:
# A fill value can also be supplied when reindexing: the new column 'e' is
# filled with 0 instead of NaN.
print(df1.reindex(columns=list(df2.columns), fill_value=0))


   a  b   c   d  e
0  0  1   2   3  0
1  4  5   6   7  0
2  8  9  10  11  0


In [31]:
# 3x4 integer array and its first row.
arr = np.arange(12).reshape(3, 4)
print(arr)
print(arr[0, :])

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[0 1 2 3]


In [32]:
# Broadcasting: the first row is subtracted from every row of the array.
print(np.subtract(arr, arr[0]))

[[0 0 0 0]
 [4 4 4 4]
 [8 8 8 8]]


In [33]:

frame = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
# First row as a Series; its labels are the frame's column names.
series = frame.iloc[0, :]
print(frame)
print(series)

        b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11
b    0
d    1
e    2
Name: Utah, dtype: int32


In [34]:
# DataFrame-with-Series arithmetic: the Series labels are matched against the
# frame's columns and the operation is repeated for every row.
print(frame.sub(series))

        b  d  e
Utah    0  0  0
Ohio    3  3  3
Texas   6  6  6
Oregon  9  9  9


In [35]:
# Broadcasting over the rows: labels are aligned first, so 'd' (frame only)
# and 'f' (Series only) end up as NaN columns.
series2 = pd.Series(range(3), index=list('bef'))
print(frame.add(series2))

          b   d     e   f
Utah    0.0 NaN   3.0 NaN
Ohio    3.0 NaN   6.0 NaN
Texas   6.0 NaN   9.0 NaN
Oregon  9.0 NaN  12.0 NaN


In [36]:
# Broadcasting over the columns: match the Series on the row index by
# passing axis=0 ('index') to the arithmetic method.
series3 = frame.loc[:, 'd']
print(series3)
print(frame.sub(series3, axis='index'))

Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: int32
        b  d  e
Utah   -1  0  1
Ohio   -1  0  1
Texas  -1  0  1
Oregon -1  0  1


## 函数应用和映射

In [40]:
# NumPy ufuncs (element-wise array functions) also operate on pandas objects.
# No seed is set in this cell, so the sampled values differ on every run.
frame = pd.DataFrame(
    np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
print(frame)
print(np.absolute(frame))

               b         d         e
Utah   -1.757670  2.397240 -0.522192
Ohio   -0.248123 -1.151036 -0.630171
Texas  -0.734262  0.245614  0.075138
Oregon  1.894403 -0.068159 -0.503022
               b         d         e
Utah    1.757670  2.397240  0.522192
Ohio    0.248123  1.151036  0.630171
Texas   0.734262  0.245614  0.075138
Oregon  1.894403  0.068159  0.503022


In [41]:
# apply calls the function once per column (default) or once per row.
def f(x):
    """Return the spread of x: its maximum minus its minimum."""
    return x.max() - x.min()
print(frame.apply(f))
print(frame.apply(f, axis='columns'))

b    3.652073
d    3.548276
e    0.705310
dtype: float64
Utah      4.154910
Ohio      0.902913
Texas     0.979876
Oregon    2.397425
dtype: float64


In [42]:
# The function handed to apply may also return a Series of several values;
# the results are assembled into a DataFrame.
def f(x):
    """Summarise x as a two-entry Series labelled 'min' and 'max'."""
    extremes = [x.min(), x.max()]
    return Series(extremes, index=['min', 'max'])
print(frame.apply(f))

            b         d         e
min -1.757670 -1.151036 -0.630171
max  1.894403  2.397240  0.075138


In [46]:
# Apply a function to every element of a DataFrame. DataFrame.applymap is
# deprecated since pandas 2.1 in favour of DataFrame.map, so use map when the
# installed pandas provides it and fall back to applymap otherwise.
# formatt renders a number as a string with two decimal places.
formatt = lambda x: '%.2f' % x
print(frame.map(formatt) if hasattr(frame, 'map') else frame.applymap(formatt))

            b      d      e
Utah    -1.76   2.40  -0.52
Ohio    -0.25  -1.15  -0.63
Texas   -0.73   0.25   0.08
Oregon   1.89  -0.07  -0.50


In [47]:
# Series.map applies a function element-wise to a single column.
print(frame.loc[:, 'e'].map(formatt))

Utah      -0.52
Ohio      -0.63
Texas      0.08
Oregon    -0.50
Name: e, dtype: object
