In [10]:
import numpy as np
import pandas as pd

## 索引和选择数据

Pandas现在支持三种类型的多轴索引：\[\]、.iloc 和 .loc。

.loc主要是基于标签的，但也可以与布尔数组一起使用。

当索引的标签不存在时，.loc会raise KeyError。

.loc接受以下形式的输入：

1.单个标签，例如5或'a'（注意，当使用5时，.loc会在index的标签中查询而不是在序号中查询。）。

2.列表或标签数组。\['a', 'b', 'c'\]

3.带有标签的切片对象'a':'f'（注意，与通常的python切片相反，包括起始标签和结束标签）。

4.布尔数组



.iloc主要基于整数位置（从0到该轴的length-1），但也可以与布尔数组一起使用。如果请求的索引器超出范围，.iloc会引发IndexError；但切片索引器除外，它允许越界（这符合Python / NumPy的切片语义）。允许的输入是：

1.一个整数，例如5。

2.整数列表或数组。\[4, 3, 0\]

3.带有整数的切片对象1:7。

4.布尔数组。

5.一个callable函数：它接受一个参数（调用它的Series、DataFrame或Panel），并返回可用于索引的有效输出（上述形式之一）。

版本0.18.1中的新功能。

有关详细信息，请参阅按位置选择， 高级索引和高级层次结构。

.loc，.iloc以及[]索引也可以接受一个callable索引器。在Select By Callable中查看更多信息。

### \[  \]

使用\[   \]选取数据是最简单的一种方式，不过功能很有限

In [11]:
# Eight consecutive calendar days starting on 2000-01-01.
dates = pd.date_range(start='2000-01-01', periods=8, freq='D')
dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# Seed NumPy's global RNG once, before the first random draw, so that the
# np.random.randn calls produce the same values on every in-order
# "Restart Kernel -> Run All".
np.random.seed(0)
# 8x4 frame of standard-normal values: one row per date, columns 'A'..'D'.
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])

In [13]:
df

Unnamed: 0,A,B,C,D
2000-01-01,0.049446,0.055185,0.833972,-0.279279
2000-01-02,0.185712,0.936396,0.743699,-0.595543
2000-01-03,0.930213,-0.200924,-0.008163,0.693663
2000-01-04,-0.488064,-0.211705,-0.332538,1.430802
2000-01-05,0.478782,0.641101,-1.219808,-0.281926
2000-01-06,-0.056485,-0.158214,-0.613825,-1.847829
2000-01-07,0.348791,-1.550245,0.117302,2.394435
2000-01-08,-1.526473,-1.384228,-0.459844,0.641801


In [14]:
# Select column 'A' as a Series. The displayed result (Name: A) and the scalar
# lookups that follow (s[0], s[time]) are Series operations; binding the whole
# DataFrame here would make s[0] a column lookup that fails with KeyError.
s = df['A']

In [15]:
s

2000-01-01    0.049446
2000-01-02    0.185712
2000-01-03    0.930213
2000-01-04   -0.488064
2000-01-05    0.478782
2000-01-06   -0.056485
2000-01-07    0.348791
2000-01-08   -1.526473
Freq: D, Name: A, dtype: float64

In [16]:
# NOTE(review): the index holds timestamps, so the integer key falls back to a
# positional lookup. pandas >= 2.1 deprecates that fallback; s.iloc[0] is the
# explicit positional form.
s[0]

0.04944628918035592

In [17]:
time = s.index[0]

In [18]:
time

Timestamp('2000-01-01 00:00:00', freq='D')

In [67]:
s[time]

-1.3141621092540765

\[  \]的功能不够强大，只是为了使用方便而创造出来的索引方式。新手更推荐多使用iloc和loc，熟悉索引后，再仔细研究\[  \]

### loc

.loc是平时使用最多的选取方法。

.loc主要是基于标签的，但也可以与布尔数组一起使用。

当索引的标签不存在时，.loc会raise KeyError。

.loc接受以下形式的输入：

1.单个标签，例如5或'a'（注意，当使用5时，.loc会在index的标签中查询而不是在序号中查询。）。

2.列表或标签数组。['a', 'b', 'c']

3.带有标签的切片对象'a':'f'（注意，与通常的python切片相反，包括起始标签和结束标签）。

4.布尔数组


In [20]:
# Six standard-normal draws labelled 'a'..'f'.
s1 = pd.Series(data=np.random.randn(6), index=['a', 'b', 'c', 'd', 'e', 'f'])

In [21]:
s1

a   -0.433983
b   -0.785480
c   -0.632942
d   -0.288765
e    0.087444
f   -0.175651
dtype: float64

In [22]:
s1.loc['c']

-0.6329422926934796

In [23]:
s1["c"]

-0.6329422926934796

In [29]:
s1.loc[['c']] # 注意与上面的对比

c   -0.632942
dtype: float64

In [31]:
c = [1 ,2 ,3, 5 ,6]

In [32]:
c[0:3]

[1, 2, 3]

In [None]:
# A built-in list accepts an int or a slice as a key, but not a list of
# positions, so this lookup raises TypeError (presumably shown for contrast
# with s1.loc[['c', 'd']]). Catch and print the error so that
# "Restart Kernel -> Run All" does not stop at this cell.
try:
    c[[0, 1, 2]]
except TypeError as err:
    print("TypeError:", err)

In [27]:
s1.loc[['c', 'd']]

c   -0.632942
d   -0.288765
dtype: float64

In [35]:
s1.loc['c': 'e']

c   -0.632942
d   -0.288765
e    0.087444
dtype: float64

In [37]:
s1

a   -0.433983
b   -0.785480
c   -0.632942
d   -0.288765
e    0.087444
f   -0.175651
dtype: float64

In [38]:
s1.mean()

-0.3715629964070574

In [39]:
bol1 = s1 > s1.mean()

In [45]:
bol1

a    False
b    False
c    False
d     True
e     True
f     True
dtype: bool

In [46]:
s1.loc[bol1]

d   -0.288765
e    0.087444
f   -0.175651
dtype: float64

注意bool索引也是和标签相关的  
顺序打乱不会影响选取结果

In [48]:
bol1

a    False
b    False
c    False
d     True
e     True
f     True
dtype: bool

In [43]:
bol2 = bol1.sort_index(ascending=False)

In [78]:
bol2

f    False
e     True
d    False
c     True
b    False
a    False
dtype: bool

In [49]:
s1.loc[bol2]

d   -0.288765
e    0.087444
f   -0.175651
dtype: float64

理解了Series的loc，就很容易理解DataFrame的loc了

In [51]:
# 6x4 frame of standard-normal values: rows labelled 'a'..'f', columns 'A'..'D'.
df1 = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=['a', 'b', 'c', 'd', 'e', 'f'],
    columns=['A', 'B', 'C', 'D'],
)

In [52]:
df1

Unnamed: 0,A,B,C,D
a,1.519875,0.868721,1.274596,1.602051
b,-0.103598,-0.32742,-2.32395,0.321233
c,-0.082128,0.01588,0.828641,-0.204232
d,-0.257375,1.50816,1.304481,1.577285
e,0.236737,-1.394788,0.067812,0.105454
f,-1.143006,-1.103428,-1.039567,-0.507735


In [82]:
df1.loc["a", :]

A    0.499998
B    0.493820
C   -0.701505
D    0.249116
Name: a, dtype: float64

In [53]:
df1.loc[:, "A"]

a    1.519875
b   -0.103598
c   -0.082128
d   -0.257375
e    0.236737
f   -1.143006
Name: A, dtype: float64

In [54]:
df1.loc["a", "A"]

1.5198745443883988

In [85]:
df1.loc[["a"], :]

Unnamed: 0,A,B,C,D
a,0.499998,0.49382,-0.701505,0.249116


In [55]:
df1.loc[:, ["A"]]

Unnamed: 0,A
a,1.519875
b,-0.103598
c,-0.082128
d,-0.257375
e,0.236737
f,-1.143006


In [58]:
df1.loc[["a", "b"], :]

Unnamed: 0,A,B,C,D
a,1.519875,0.868721,1.274596,1.602051
b,-0.103598,-0.32742,-2.32395,0.321233


DataFrame也有bool索引，并且这在数据分析中经常会用到

In [60]:
bool1 = df1.loc[:, "A"] > df1.loc[:, "A"].mean()

In [61]:
bool1

a     True
b    False
c    False
d    False
e     True
f    False
Name: A, dtype: bool

In [62]:
bool2 = df1.loc["a", :] > df1.loc["a", :].mean()

In [64]:
bool2

A     True
B    False
C    False
D     True
Name: a, dtype: bool

In [68]:
df1.loc[bool1, ["A"]]

Unnamed: 0,A
a,1.519875
e,0.236737


### iloc

了解了loc之后，iloc的理解就非常简单了，loc是基于标签进行选取，而iloc是基于序号进行选取

In [69]:
df1

Unnamed: 0,A,B,C,D
a,1.519875,0.868721,1.274596,1.602051
b,-0.103598,-0.32742,-2.32395,0.321233
c,-0.082128,0.01588,0.828641,-0.204232
d,-0.257375,1.50816,1.304481,1.577285
e,0.236737,-1.394788,0.067812,0.105454
f,-1.143006,-1.103428,-1.039567,-0.507735


In [70]:
df1.iloc[0, :]

A    1.519875
B    0.868721
C    1.274596
D    1.602051
Name: a, dtype: float64

In [71]:
df1.iloc[:, 0]

a    1.519875
b   -0.103598
c   -0.082128
d   -0.257375
e    0.236737
f   -1.143006
Name: A, dtype: float64

In [72]:
df1.iloc[:, 0:2]

Unnamed: 0,A,B
a,1.519875,0.868721
b,-0.103598,-0.32742
c,-0.082128,0.01588
d,-0.257375,1.50816
e,0.236737,-1.394788
f,-1.143006,-1.103428


In [73]:
df1.iloc[0, 0]

1.5198745443883988

In [74]:
df1.iloc[[0], :]

Unnamed: 0,A,B,C,D
a,1.519875,0.868721,1.274596,1.602051


但是需要注意，iloc无法使用带标签的布尔索引（Series）

In [77]:
bol1 = s1 > s1.mean()

In [78]:
bol1

a    False
b    False
c    False
d     True
e     True
f     True
dtype: bool

In [79]:
# .iloc is positional and rejects a boolean Series as a mask, raising
# ValueError. The error is the point of this cell, so catch and print it
# instead of letting it abort a full "Restart Kernel -> Run All".
try:
    df1.iloc[bol1, :]
except ValueError as err:
    print("ValueError:", err)

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

In [81]:
bolarray = bol1.values

In [104]:
bolarray

array([False, False,  True, False,  True, False])

In [82]:
df1.iloc[bolarray, :]

Unnamed: 0,A,B,C,D
d,-0.257375,1.50816,1.304481,1.577285
e,0.236737,-1.394788,0.067812,0.105454
f,-1.143006,-1.103428,-1.039567,-0.507735


## 数据对齐与运算

DataFrame对象之间的运算会自动在列标签和行标签上对齐。生成的新DataFrame的列标签和行标签将是二者的并集。

In [84]:
# 10x4 frame of standard-normal values with the default integer row index.
df = pd.DataFrame(data=np.random.randn(10, 4), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,-1.541367,-0.109087,1.015989,-0.603026
1,1.31352,1.438782,-0.985779,-0.131689
2,-0.35171,-0.547716,-0.818778,-1.663416
3,0.61442,-1.393642,1.441936,-0.689385
4,0.259786,0.770876,0.795853,-0.116128
5,0.799246,-0.361762,-0.77177,0.561859
6,-0.240567,-0.119981,0.078381,-0.376686
7,-0.753237,1.531919,-0.786221,0.759912
8,-1.49617,-1.044384,-0.194498,-0.965226
9,0.026301,-0.341067,0.117487,-1.28234


In [85]:
# Smaller 7x3 frame (columns 'A'..'C' only); it covers fewer rows and columns
# than the 10x4 frame it is added to below.
df2 = pd.DataFrame(data=np.random.randn(7, 3), columns=list('ABC'))
df2

Unnamed: 0,A,B,C
0,-0.244056,0.183225,-1.006056
1,-1.714378,0.649396,1.275225
2,0.385334,-0.679136,-0.498023
3,-1.186581,-0.143131,-0.169422
4,-0.299551,0.970257,0.108058
5,-1.084299,2.118636,-2.805058
6,0.10256,0.8253,0.733707


In [109]:
df + df2

Unnamed: 0,A,B,C,D
0,0.27731,1.198301,-0.691736,
1,-0.221725,-2.265799,2.664189,
2,-0.402637,0.952688,-2.197345,
3,1.434542,0.845121,-0.591081,
4,0.470982,-1.094138,0.689015,
5,1.280501,-1.156402,1.278614,
6,-1.041181,1.091867,-1.423891,
7,,,,
8,,,,
9,,,,


在DataFrame和Series之间执行操作时，默认行为是在DataFrame列上对齐Series索引，从而按行进行广播。例如

In [87]:
df

Unnamed: 0,A,B,C,D
0,-1.541367,-0.109087,1.015989,-0.603026
1,1.31352,1.438782,-0.985779,-0.131689
2,-0.35171,-0.547716,-0.818778,-1.663416
3,0.61442,-1.393642,1.441936,-0.689385
4,0.259786,0.770876,0.795853,-0.116128
5,0.799246,-0.361762,-0.77177,0.561859
6,-0.240567,-0.119981,0.078381,-0.376686
7,-0.753237,1.531919,-0.786221,0.759912
8,-1.49617,-1.044384,-0.194498,-0.965226
9,0.026301,-0.341067,0.117487,-1.28234


In [89]:
# First row of df as a Series; its index is df's column labels.
s = df.iloc[0]
s

A   -1.541367
B   -0.109087
C    1.015989
D   -0.603026
Name: 0, dtype: float64

In [90]:
df - s

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,2.854887,1.547868,-2.001768,0.471337
2,1.189657,-0.438629,-1.834768,-1.060391
3,2.155788,-1.284555,0.425947,-0.086359
4,1.801153,0.879963,-0.220137,0.486897
5,2.340614,-0.252675,-1.787759,1.164885
6,1.3008,-0.010894,-0.937608,0.22634
7,0.78813,1.641006,-1.80221,1.362938
8,0.045197,-0.935298,-1.210487,-0.3622
9,1.567668,-0.23198,-0.898503,-0.679314


In [None]:
# To broadcast along the columns instead (aligning the Series with the row
# index), the operation needs special handling.

In [91]:
s1 = df.iloc[:, 0]

In [92]:
s1

0   -1.541367
1    1.313520
2   -0.351710
3    0.614420
4    0.259786
5    0.799246
6   -0.240567
7   -0.753237
8   -1.496170
9    0.026301
Name: A, dtype: float64

In [117]:
# Default alignment matches s1's index labels (0..9) against df's column
# labels (A..D). None coincide, so the result has the union of both label
# sets as columns and every value is NaN.
df - s1

Unnamed: 0,A,B,C,D,0,1,2,3,4,5,6,7,8,9
0,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,
6,,,,,,,,,,,,,,
7,,,,,,,,,,,,,,
8,,,,,,,,,,,,,,
9,,,,,,,,,,,,,,


In [93]:
# Align s1 with the row index and subtract it from every column.
df.sub(s1, axis='index')

Unnamed: 0,A,B,C,D
0,0.0,1.432281,2.557357,0.938342
1,0.0,0.125262,-2.299298,-1.445209
2,0.0,-0.196006,-0.467068,-1.311706
3,0.0,-2.008062,0.827516,-1.303805
4,0.0,0.51109,0.536067,-0.375914
5,0.0,-1.161008,-1.571016,-0.237387
6,0.0,0.120587,0.318949,-0.136119
7,0.0,2.285156,-0.032984,1.513149
8,0.0,0.451786,1.301672,0.530944
9,0.0,-0.367367,0.091186,-1.30864


注意：df - s1等价于df.sub(s1, axis=1)，即把Series的索引与DataFrame的列标签对齐，所以上面得到的结果全是NaN；想按行索引对齐，需要像上面那样显式指定axis=0。

或者直接用转置方法

In [95]:
(df.T - s1).T

Unnamed: 0,A,B,C,D
0,0.0,1.432281,2.557357,0.938342
1,0.0,0.125262,-2.299298,-1.445209
2,0.0,-0.196006,-0.467068,-1.311706
3,0.0,-2.008062,0.827516,-1.303805
4,0.0,0.51109,0.536067,-0.375914
5,0.0,-1.161008,-1.571016,-0.237387
6,0.0,0.120587,0.318949,-0.136119
7,0.0,2.285156,-0.032984,1.513149
8,0.0,0.451786,1.301672,0.530944
9,0.0,-0.367367,0.091186,-1.30864


与数字的运算非常简单

In [125]:
df * 5 + 2

Unnamed: 0,A,B,C,D
0,0.664407,-0.655793,5.244824,4.06167
1,8.064871,5.605038,-3.723393,15.916775
2,-5.008027,7.165503,-1.186762,-7.318808
3,9.351294,6.311297,-1.336165,0.765196
4,1.568739,-8.841091,-7.07005,-7.587445
5,-1.280836,6.867401,-1.32936,5.991779
6,2.452031,9.518283,-3.189749,2.826979
7,8.10014,5.916776,8.570249,-2.335523
8,6.062898,-5.966837,-0.917501,0.339212
9,1.6573,10.074381,3.538775,-6.067947


In [126]:
1 / df

Unnamed: 0,A,B,C,D
0,-3.743656,-1.882677,1.540916,2.425219
1,0.82442,1.386948,-0.873608,0.359279
2,-0.713468,0.96796,-1.568991,-0.536549
3,0.680152,1.159744,-1.498727,-4.049225
4,-11.593898,-0.461208,-0.551265,-0.521515
5,-1.524002,1.027242,-1.50179,1.252574
6,11.061198,0.665045,-0.963438,6.0461
7,0.819653,1.27656,0.761006,-1.153263
8,1.230649,-0.627602,-1.713796,-3.010619
9,-14.590008,0.619243,3.249338,-0.619736


In [96]:
df ** 4

Unnamed: 0,A,B,C,D
0,5.644489,0.000142,1.065507,0.132234
1,2.976776,4.285284,0.944317,0.000301
2,0.01530171,0.089996,0.449434,7.656035
3,0.1425155,3.772285,4.322985,0.225864
4,0.004554728,0.353134,0.401172,0.000182
5,0.4080585,0.017127,0.354774,0.099658
6,0.003349237,0.000207,3.8e-05,0.020133
7,0.3219044,5.507356,0.382101,0.333467
8,5.010995,1.189711,0.001431,0.867992
9,4.784806e-07,0.013532,0.000191,2.704036


In [98]:
# Boolean frame written with literal True/False values.
df1 = pd.DataFrame({
    'a': [True, False, True],
    'b': [False, True, True],
})
df1

Unnamed: 0,a,b
0,True,False
1,False,True
2,True,True


In [99]:
# Second boolean frame with the same shape and labels, used as the other
# operand of the logical operators below.
df2 = pd.DataFrame({
    'a': [False, True, True],
    'b': [True, True, False],
})
df2

Unnamed: 0,a,b
0,False,True
1,True,True
2,True,False


In [133]:
df1 & df2  

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [134]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [138]:
# Element-wise logical NOT. `~` is the inversion operator for boolean data and
# sits alongside `&` and `|` above; unary minus is an arithmetic operator that
# NumPy rejects on boolean arrays.
~df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False
