### Pandas中的数据操作

### 通用函数操作

In [2]:
import pandas as pd
import numpy as np

In [12]:
rng  = np.random.RandomState(21)
series = pd.Series( rng.randint(0,400,4) )
series

0    207
1    312
2    260
3     48
dtype: int32

In [27]:
columns = ['one','two','three','four','five']
index   = [1,3,5]
df      = pd.DataFrame(rng.randint(0,200,(3,5)),index = index, columns = columns)
df 

Unnamed: 0,one,two,three,four,five
1,94,108,10,151,43
3,167,94,101,140,35
5,90,31,48,146,114


In [26]:
np.random.randint(2,41,(3,2))

array([[34, 31],
       [14, 38],
       [33, 32]])

In [28]:
np.exp(series)

0     7.924242e+89
1    3.161392e+135
2    8.252115e+112
3     7.016736e+20
dtype: float64

In [30]:
df

Unnamed: 0,one,two,three,four,five
1,94,108,10,151,43
3,167,94,101,140,35
5,90,31,48,146,114


In [29]:
np.sin(df*np.pi/4)

Unnamed: 0,one,two,three,four,five
1,-1.0,6.85926e-15,1.0,-0.7071068,0.707107
3,-0.707107,-1.0,-0.7071068,7.838977e-15,0.707107
5,1.0,-0.7071068,-1.469576e-15,1.0,1.0


### Index 的对齐

### Series 中的index 对齐

In [45]:
area       = pd.Series({'Alaska': 1723337, 
                  'Texas': 695662,
                  'California': 423967},
                 name='area')

population = pd.Series({'California': 38332521, 
                        'Texas': 26448193,
                        'New York': 19651127}, 
                       name='population')

In [46]:
population/area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [48]:
area.index|population.index

Index([u'Alaska', u'California', u'New York', u'Texas'], dtype='object')

### 只有对应索引相加，其他索引位置无法直接计算

In [56]:
A = pd.Series([2,4,5,2],index = [0,1,2,4])
B = pd.Series([1,3,4,2],index = [3,5,6,0])
A + B  # 按照索引链接起来形成长链

0    4.0
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
dtype: float64

In [62]:
A.add(B,fill_value = 1) # 没有匹配的索引值 则只保持原来的值，后自动添加+1 = fill_value

0    4.0
1    5.0
2    6.0
3    2.0
4    3.0
5    4.0
6    5.0
dtype: float64

### DataFrame 中的Index 对齐

In [71]:
materiA = pd.DataFrame(rng.randint(100,200,(4,2)),index = {'A','B','C','D'} ,columns =  list('AB') )
materiA

Unnamed: 0,A,B
A,122,164
C,117,101
B,169,154
D,186,121


In [73]:
materiB = pd.DataFrame(rng.randint(100,200,(5,3)),index = {'A','B','C','D','E'} ,columns =  list('BCS') )
materiB

Unnamed: 0,B,C,S
A,154,156,146
C,126,166,101
B,188,175,185
E,130,175,154
D,138,133,198


In [74]:
materiB + materiA

Unnamed: 0,A,B,C,S
A,,318.0,,
B,,342.0,,
C,,227.0,,
D,,259.0,,
E,,,,


In [76]:
materiA.stack()

A  A    122
   B    164
C  A    117
   B    101
B  A    169
   B    154
D  A    186
   B    121
dtype: int32

In [80]:
materiA['A']

A    122
C    117
B    169
D    186
Name: A, dtype: int32

In [79]:
materiA.stack().mean()  #

141.75

In [81]:
materiA['B']

A    164
C    101
B    154
D    121
Name: B, dtype: int32

In [99]:
np.sum(materiA['B']+ materiA['A'])/8

141

In [103]:
fill  = materiA.stack().mean()
materiA.add(materiB,fill_value = fill)  # B列得到materiA和materiB的计算

Unnamed: 0,A,B,C,S
A,263.75,318.0,297.75,287.75
B,310.75,342.0,316.75,326.75
C,258.75,227.0,307.75,242.75
D,327.75,259.0,274.75,339.75
E,,271.75,316.75,295.75


In [101]:
materiA

Unnamed: 0,A,B
A,122,164
C,117,101
B,169,154
D,186,121


In [102]:
materiB

Unnamed: 0,B,C,S
A,154,156,146
C,126,166,101
B,188,175,185
E,130,175,154
D,138,133,198


### DataFrame 和Series 之间的运算。

In [108]:
A = rng.randint(10,size = (3,5)) 
A

array([[6, 5, 7, 2, 9],
       [9, 1, 5, 7, 7],
       [5, 2, 3, 5, 7]])

In [109]:
A - A[0]

array([[ 0,  0,  0,  0,  0],
       [ 3, -4, -2,  5, -2],
       [-1, -3, -4,  3, -2]])

In [133]:
df = pd.DataFrame(A, columns = list('HEXLO'))
df

Unnamed: 0,H,E,X,L,O
0,6,5,7,2,9
1,9,1,5,7,7
2,5,2,3,5,7


In [134]:
df.iloc[0]

H    6
E    5
X    7
L    2
O    9
Name: 0, dtype: int32

### 对应行数相减

In [135]:
df - df.iloc[0]  

Unnamed: 0,H,E,X,L,O
0,0,0,0,0,0
1,3,-4,-2,5,-2
2,-1,-3,-4,3,-2


### 按照列数计算 

### 每一列均减去指定的一列数据值

In [136]:
df.subtract(df['E'],axis = 0)  

Unnamed: 0,H,E,X,L,O
0,1,0,2,-3,4
1,8,0,4,6,6
2,3,0,1,3,5


In [137]:
df

Unnamed: 0,H,E,X,L,O
0,6,5,7,2,9
1,9,1,5,7,7
2,5,2,3,5,7


### 对部分数据进行计算

In [138]:
halfrow = df.iloc[0,::2] # 第1行，列数步长 = 2 取值
halfrow

H    6
X    7
O    9
Name: 0, dtype: int32

In [139]:
df - halfrow # 

Unnamed: 0,E,H,L,O,X
0,,0.0,,0.0,0.0
1,,3.0,,-2.0,-2.0
2,,-1.0,,-2.0,-4.0


In [140]:
df

Unnamed: 0,H,E,X,L,O
0,6,5,7,2,9
1,9,1,5,7,7
2,5,2,3,5,7
