In [212]:
import pandas as pd
import numpy as np

## 数据结构

### Series

In [213]:
# A Series built from a bare list gets the default integer index.
obj = pd.Series(data=[4, 7, -5, 3])
# Printed in order: the Series itself (index on the left, values on the
# right), its underlying values array, and its index object.
print(obj, obj.values, obj.index, sep='\n')

# The same values, this time labelled with an explicit string index.
obj2 = pd.Series(
    data=[4, 7, -5, 3],
    index=['d', 'b', 'a', 'c'],
)
print(obj2, obj2.index, sep='\n')

# Read one value by label, overwrite another by label, then select several
# labels at once by passing a list of labels.
print(obj2['a'])
obj2['d'] = 6
print(obj2[['c', 'a', 'd']])

0    4
1    7
2   -5
3    3
dtype: int64
[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)
d    4
b    7
a   -5
c    3
dtype: int64
Index(['d', 'b', 'a', 'c'], dtype='object')
-5
c    3
a   -5
d    6
dtype: int64


In [214]:
# Filtering with a boolean mask, multiplying by a scalar and applying a NumPy
# function all return results that keep the index labels.
print(obj2[obj2 > 0], obj2 * 2, np.exp(obj2), sep='\n')

# `in` tests membership against the index labels.
print('b' in obj2)

# Building a Series from a dict: the keys become the index.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(sdata)
print(obj3)

d    6
b    7
c    3
dtype: int64
d    12
b    14
a   -10
c     6
dtype: int64
d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64
True
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64


In [215]:
# Build a Series from the dict but with an explicit index: labels that are not
# dict keys ('California') come out as NaN.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(data=sdata, index=states)
# Printed in order: the Series, its missing-value mask, its non-missing mask,
# and the label-aligned sum with obj3.
print(obj4, pd.isnull(obj4), pd.notnull(obj4), obj3 + obj4, sep='\n')

# Name the Series itself and its index; both names appear in the repr.
obj4.name, obj4.index.name = 'population', 'state'
print(obj4)

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


In [216]:

# Replace the index labels in place by assigning a new sequence to .index.
obj.index = [
    'Bob',
    'Steve',
    'Jeff',
    'Ryan',
]
print(obj)

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64


### DataFrame

In [217]:
# Column-oriented input: every dict key becomes a column and every list
# supplies that column's values.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
# Printed in order: the whole frame, its first five rows (head's default),
# and its last three rows.
print(frame, frame.head(), frame.tail(n=3), sep='\n')

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2
    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
    state  year  pop
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [218]:
# `columns=` fixes the column order of the resulting frame.
print(pd.DataFrame(data, columns=['year', 'state', 'pop']))

# Requesting a column that is not in the dict ('debt') yields a column of NaN;
# `index=` supplies the row labels.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
# The labelled frame, then the column labels of the original `frame`.
print(frame2, frame.columns, sep='\n')

   year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9
5  2003  Nevada  3.2
       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN
Index(['state', 'year', 'pop'], dtype='object')


In [219]:
# Printed in order: a column selected with dict-style access, a column
# selected with attribute access, and a row selected by label through .loc.
print(
    frame2['state'],
    frame2.year,
    frame2.loc['three'],
    sep='\n',
)

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object
one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object


In [220]:
# Assigning a scalar to a column broadcasts it to every row.
frame2['debt'] = 16.5
print(frame2)

# Assigning an array replaces the column element by element (0.0 .. 5.0 here).
frame2['debt'] = np.arange(6.0)
print(frame2)

       year   state  pop  debt
one    2000    Ohio  1.5  16.5
two    2001    Ohio  1.7  16.5
three  2002    Ohio  3.6  16.5
four   2001  Nevada  2.4  16.5
five   2002  Nevada  2.9  16.5
six    2003  Nevada  3.2  16.5
       year   state  pop  debt
one    2000    Ohio  1.5   0.0
two    2001    Ohio  1.7   1.0
three  2002    Ohio  3.6   2.0
four   2001  Nevada  2.4   3.0
five   2002  Nevada  2.9   4.0
six    2003  Nevada  3.2   5.0


In [221]:
# Assigning a Series aligns it on the frame's row labels; rows without a
# matching label in `val` become NaN.
val = pd.Series(
    data=[-1.2, -1.5, -1.7],
    index=['two', 'four', 'five'],
)
frame2['debt'] = val
print(frame2)

# Assigning to a column name that does not exist yet creates the column
# (here a boolean column computed from `state`).
frame2['eastern'] = (frame2.state == 'Ohio')
print(frame2)

# `del` removes the column again; the column labels confirm it is gone.
del frame2['eastern']
print(frame2.columns)

       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7
six    2003  Nevada  3.2   NaN
       year   state  pop  debt  eastern
one    2000    Ohio  1.5   NaN     True
two    2001    Ohio  1.7  -1.2     True
three  2002    Ohio  3.6   NaN     True
four   2001  Nevada  2.4  -1.5    False
five   2002  Nevada  2.9  -1.7    False
six    2003  Nevada  3.2   NaN    False
Index(['year', 'state', 'pop', 'debt'], dtype='object')


In [222]:
# Nested dicts: the outer keys become columns, the inner keys the row index.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(pop)
# The frame, followed by its transpose.
print(frame3, frame3.T, sep='\n')

# A dict of Series is also valid input; each column here is a slice of an
# existing column of frame3.
pdata = {
    'Ohio': frame3['Ohio'][:-1],
    'Nevada': frame3['Nevada'][:2],
}
print(pd.DataFrame(pdata))

# Name the row index and the column index; both names show up in the repr.
frame3.index.name, frame3.columns.name = 'year', 'state'
print(frame3)

# .values exposes the data as a 2-D ndarray. For the all-float frame3 the
# dtype is float64; for frame2, whose columns have different types, the
# array falls back to object dtype.
print(frame3.values, frame3.values.dtype, sep='\n')
print(frame2.values, frame2.values.dtype, sep='\n')

      Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2000     NaN   1.5
        2001  2002  2000
Nevada   2.4   2.9   NaN
Ohio     1.7   3.6   1.5
      Ohio  Nevada
2001   1.7     2.4
2002   3.6     2.9
state  Nevada  Ohio
year               
2001      2.4   1.7
2002      2.9   3.6
2000      NaN   1.5
[[2.4 1.7]
 [2.9 3.6]
 [nan 1.5]]
float64
[[2000 'Ohio' 1.5 nan]
 [2001 'Ohio' 1.7 -1.2]
 [2002 'Ohio' 3.6 nan]
 [2001 'Nevada' 2.4 -1.5]
 [2002 'Nevada' 2.9 -1.7]
 [2003 'Nevada' 3.2 nan]]
object


DataFrame构造函数的有效输入可以有：

- 2D ndarray：数据的矩阵，行和列的标签是可选参数
- 数组、列表和元组构成的字典：每个序列成为DataFrame的一列，所有序列必须长度相等
- NumPy结构化/记录化数组：与数组构成的字典一致
- Series构成的字典：每个值成为一列，每个Series的索引联合起来形成结果的行索引，也可以显式地传递行索引
- 字典构成的字典：每个内部字典成为一列，键联合起来形成结果的行索引
- 字典或Series构成的列表：列表中的一个元素形成DataFrame的一行，字典键或Series索引联合起来形成DataFrame的列标签
- 列表或元组构成的列表：与二维数组的情况一致
- 其他DataFrame：如果不显式传递索引，则会使用原DataFrame的索引
- NumPy MaskedArray：和二维数组的情况类似，但被掩码的隐藏值会在结果中成为缺失值

### 索引对象

In [223]:
obj = pd.Series(data=range(3), index=['a', 'b', 'c'])
index = obj.index
# Printed in order: the Series, its Index object, a slice of the Index
# (still an Index), and a single label taken from it.
print(obj, index, index[1:], index[1], sep='\n')

# Index objects cannot be modified: `index[1] = 'd'` is not allowed.
# That immutability makes it safer to share one Index between objects.

# An Index can be built directly and handed to a constructor.
# NOTE: the name is spelled `lables` (sic); it is kept because a later cell
# deletes it under this name.
lables = pd.Index(np.arange(3))
print(lables)
obj2 = pd.Series(data=[1.5, -2.5, 0], index=lables)
# The Series, then an identity check showing that obj2's index is the very
# same object as `lables`.
print(obj2, obj2.index is lables, sep='\n')

a    0
b    1
c    2
dtype: int64
Index(['a', 'b', 'c'], dtype='object')
Index(['b', 'c'], dtype='object')
b
Index([0, 1, 2], dtype='int64')
0    1.5
1   -2.5
2    0.0
dtype: float64
True


In [224]:
# Besides behaving like an array, an Index also behaves like a fixed-size set.
print(frame3)
# Membership tests against the column labels and against the row labels.
print('Ohio' in frame3.columns, 2003 in frame3.index, sep='\n')

# Unlike a Python set, an Index may hold duplicate labels.
dup_lables = pd.Index(['foo', 'foo', 'bar', 'bar'])
print(dup_lables)

state  Nevada  Ohio
year               
2001      2.4   1.7
2002      2.9   3.6
2000      NaN   1.5
True
False
Index(['foo', 'foo', 'bar', 'bar'], dtype='object')


In [225]:
# Drop every name created by the cells above (presumably so the next section
# starts from a clean namespace).
del (
    obj, obj2, obj3, obj4,
    frame, frame2, frame3,
    data, sdata, states, pop, pdata, val,
    index, lables, dup_lables
)

## 基本功能

### 重建索引

在Series中的reindex()函数：

In [226]:
obj = pd.Series(
    data=[4.5, 7.2, -5.3, 3.6],
    index=['d', 'b', 'a', 'c'],
)
print(obj)

# reindex rearranges the data to follow the new labels; a label that was not
# present before ('e') is filled with NaN.
obj2 = obj.reindex(index=['a', 'b', 'c', 'd', 'e'])
print(obj2)

# Same reindex, but newly introduced labels get 0 instead of NaN.
obj3 = obj.reindex(index=['a', 'b', 'c', 'd', 'e'], fill_value=0)
print(obj3)

# With method='ffill' each gap takes the last preceding value (forward fill).
obj4 = pd.Series(
    data=['blue', 'purple', 'yellow'],
    index=[0, 2, 4],
)
print(obj4)
obj4 = obj4.reindex(index=range(6), method='ffill')
print(obj4)

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64
0      blue
2    purple
4    yellow
dtype: object
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object


在DataFrame中：

In [227]:
frame = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
print(frame)

# Passing only a sequence reindexes the rows; the new row 'b' is all NaN.
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)

# `columns=` reindexes the columns instead; the new column 'Utah' is all NaN.
states = ['Texas', 'Utah', 'California']
frame3 = frame.reindex(columns=states)
print(frame3)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0
   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8


reindex方法的参数：
- index：用作新索引的序列，可以是Index实例或任意其他序列型Python数据结构；如果传入的是Index实例，则会直接使用而无需复制
- method：插值方式：'ffill'为前向填充，'bfill'是后向填充
- fill_value：通过重新索引引入缺失数据时使用的替代值
- limit：当前向或后向填充时，所需填充的最大尺寸间隙（以元素数量）
- tolerance：当前向或后向填充时，所需填充的不精确匹配下的最大尺寸间隙（以绝对数字距离）
- level：匹配MultiIndex级别的简单索引，否则选择子集
- copy：如果为True，即使新索引等于旧索引，也总是复制底层数据；如果是False，则在索引相同时不复制数据

### 轴向上删除条目

In [228]:
obj = pd.Series(
    np.arange(5.),
    index=['a', 'b', 'c', 'd', 'e'],
)
print(obj)

# drop returns a new object without the given label; `obj` is left untouched.
new_obj = obj.drop('c')
print(new_obj)
# A list drops several labels at once. NOTE: the result of this call is
# discarded, so it neither changes `obj` nor prints anything.
obj.drop(['d', 'c'])

data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print(data)

# With no axis given, the labels are looked up among the rows.
data2 = data.drop(['Colorado', 'Ohio'])
print(data2)
# axis=1 and axis='columns' both drop from the columns instead.
print(
    data.drop('two', axis=1),
    data.drop(['two', 'four'], axis='columns'),
    sep='\n',
)

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15
          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15
          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14


In [229]:
# With inplace=True the label is removed from `obj` itself and no new object
# is returned, so the original data is modified by this call.
obj.drop(labels='c', inplace=True)
print(obj)

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64


### 索引、选择与过滤

In [230]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj)

# Label-based selection with plain [].
print(obj['b'])

# Position-based selection goes through .iloc. Using integer keys with plain
# [] on a Series whose index holds strings (obj[1], obj[[1, 3]]) relies on a
# positional fallback that recent pandas versions deprecate, so the explicit
# accessor is used for the single position, the slice and the position list.
print(obj.iloc[1])
print(obj.iloc[2:4])

# Selection by a list of labels, then by a list of positions.
print(obj[['b', 'a', 'd']])
print(obj.iloc[[1, 3]])

# A boolean mask can be used both to select values and to assign to them.
print(obj[obj < 2])
obj[obj < 2] = 0
print(obj)

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
1.0
1.0
c    2.0
d    3.0
dtype: float64
b    1.0
a    0.0
d    3.0
dtype: float64
b    1.0
d    3.0
dtype: float64
a    0.0
b    1.0
dtype: float64
a    0.0
b    0.0
c    2.0
d    3.0
dtype: float64


In [231]:
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
# Printed in order: the whole frame, one column (a Series), two columns in
# the requested order (a DataFrame), and the first two rows via a slice.
print(
    data,
    data['two'],
    data[['three', 'one']],
    data[:2],
    sep='\n',
)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7


In [232]:
# A boolean Series selects rows; comparing the whole frame with a scalar
# yields a boolean DataFrame of the same shape.
print(data[data['three'] > 5], data < 5, sep='\n')

# Assigning through a boolean DataFrame overwrites exactly the cells where
# the mask is True.
data[data < 5] = 0
print(data)

          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False
          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


使用loc和iloc选择数据

In [233]:
# Printed in order: one row and two columns picked by label (.loc), one row
# and three columns picked by integer position (.iloc), and a whole row
# picked by integer position.
print(
    data.loc['Colorado', ['two', 'three']],
    data.iloc[2, [3, 0, 1]],
    data.iloc[2],
    sep='\n',
)

two      5
three    6
Name: Colorado, dtype: int64
four    11
one      8
two      9
Name: Utah, dtype: int64
one       8
two       9
three    10
four     11
Name: Utah, dtype: int64


### 算术和数据对齐

两个索引不同的对象进行算术运算时，会先按索引对齐数据；没有同时出现在两个对象中的标签，其结果为缺失值。

使用特殊值填充：

In [234]:
# Two float frames of different shapes: df1 is 3x4 (columns a-d), df2 is 4x5
# (columns a-e).
df1 = pd.DataFrame(
    np.arange(12.).reshape((3, 4)),
    columns=list('abcd'),
)
df2 = pd.DataFrame(
    np.arange(20.).reshape((4, 5)),
    columns=list('abcde'),
)
# Put one missing value into df2 (row label 1, column 'b').
df2.loc[1, 'b'] = np.nan
print(df1, df2, sep='\n')

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0


直接相加：

In [235]:
# The plain `+` operator aligns on both rows and columns; every cell that is
# missing from either operand comes out as NaN.
aligned_sum = df1 + df2
print(aligned_sum)

      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0   NaN  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN


用add方法：

In [236]:
# The add method with fill_value=0 substitutes 0 for a value that is missing
# from one operand before adding, so fewer cells end up as NaN.
filled_sum = df1.add(df2, fill_value=0)
print(filled_sum)

      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0   5.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0


In [237]:
# The operator form and the reversed-argument method rdiv give the same
# result: 1 divided by each element of df1.
print(1 / df1, df1.rdiv(1), sep='\n')

       a         b         c         d
0    inf  1.000000  0.500000  0.333333
1  0.250  0.200000  0.166667  0.142857
2  0.125  0.111111  0.100000  0.090909
       a         b         c         d
0    inf  1.000000  0.500000  0.333333
1  0.250  0.200000  0.166667  0.142857
2  0.125  0.111111  0.100000  0.090909


reindex也有类似的：

In [238]:
# reindex accepts fill_value too: give df1 the columns of df2 and fill the
# newly introduced column 'e' with 0.
df1_widened = df1.reindex(columns=df2.columns, fill_value=0)
print(df1_widened)

     a    b     c     d  e
0  0.0  1.0   2.0   3.0  0
1  4.0  5.0   6.0   7.0  0
2  8.0  9.0  10.0  11.0  0


算术方法有：
- add, radd：加法
- sub, rsub：减法
- div, rdiv：除法
- floordiv, rfloordiv：整除
- mul, rmul：乘法
- pow, rpow：幂次方

广播机制：

In [239]:
# NumPy broadcasting: subtracting the first row from a 3x4 array applies the
# subtraction to every row.
arr = np.arange(12.).reshape((3, 4))
print(arr, arr[0], arr - arr[0], sep='\n')

[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]
[0. 1. 2. 3.]
[[0. 0. 0. 0.]
 [4. 4. 4. 4.]
 [8. 8. 8. 8.]]


在DataFrame和Series间进行操作：

In [240]:
# NOTE: the row label 'Texa' (sic) is kept exactly as written, because the
# recorded output below uses the same label.
frame = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texa', 'Oregon'],
)
# The first row as a Series; its index is the frame's column labels.
series = frame.iloc[0]
# DataFrame - Series matches the Series index against the frame's columns
# and broadcasts the subtraction down the rows.
print(frame, series, frame - series, sep='\n')

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texa    6.0   7.0   8.0
Oregon  9.0  10.0  11.0
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64
          b    d    e
Utah    0.0  0.0  0.0
Ohio    3.0  3.0  3.0
Texa    6.0  6.0  6.0
Oregon  9.0  9.0  9.0


在列上广播：

In [241]:
# One column as a Series; its index is the frame's row labels.
series2 = frame['d']
# To broadcast across the columns, use the arithmetic method and name the
# axis to match on: axis='index' aligns series2 with the row labels.
print(
    frame,
    series2,
    frame.sub(series2, axis='index'),
    sep='\n',
)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texa    6.0   7.0   8.0
Oregon  9.0  10.0  11.0
Utah       1.0
Ohio       4.0
Texa       7.0
Oregon    10.0
Name: d, dtype: float64
          b    d    e
Utah   -1.0  0.0  1.0
Ohio   -1.0  0.0  1.0
Texa   -1.0  0.0  1.0
Oregon -1.0  0.0  1.0
