# Ch03.5 Ufuncs in Pandas

## Ufuncs: Index Preservation

In [60]:
import pandas as pd
import numpy as np

In [61]:
pd.__version__

'1.5.0'

In [62]:
# Local, seeded generator so the sampled integers are reproducible.
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [63]:
# Same draw through NumPy's legacy global generator: seeding it with 42
# yields the same integers as a RandomState(42) instance.
np.random.seed(42)
ser = pd.Series(np.random.randint(low=0, high=10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [64]:
ser.values

array([6, 3, 7, 4])

In [65]:
# Continue drawing from the `rng` stream to build a 3x4 integer frame.
df = pd.DataFrame(rng.randint(0, 10, size=(3, 4)), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [66]:
df.values

array([[6, 9, 2, 6],
       [7, 4, 3, 7],
       [7, 2, 5, 4]])

In [67]:
# A NumPy ufunc applied to a Series returns a Series that keeps the same index.
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [68]:
np.exp(df)

Unnamed: 0,A,B,C,D
0,403.428793,8103.083928,7.389056,403.428793
1,1096.633158,54.59815,20.085537,1096.633158
2,1096.633158,7.389056,148.413159,54.59815


In [69]:
np.exp(df['A'])

0     403.428793
1    1096.633158
2    1096.633158
Name: A, dtype: float64

In [70]:
np.exp(df[['B','D']])

Unnamed: 0,B,D
0,8103.083928,403.428793
1,54.59815,1096.633158
2,7.389056,54.59815


In [71]:
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [72]:
# Not recommended: slicing rows with df[0:2] works, but prefer .loc or .iloc
# for explicit row selection.
np.exp(df[0:2])

Unnamed: 0,A,B,C,D
0,403.428793,8103.083928,7.389056,403.428793
1,1096.633158,54.59815,20.085537,1096.633158


In [73]:
np.exp(df.iloc[1])

A    1096.633158
B      54.598150
C      20.085537
D    1096.633158
Name: 1, dtype: float64

In [74]:
df.iloc[1].index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [75]:
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


### Review NumPy ufunc

In [76]:
# Small integer array used to review plain NumPy ufunc behaviour.
arr = np.arange(5)
arr

array([0, 1, 2, 3, 4])

In [77]:
np.exp(arr)

array([ 1.        ,  2.71828183,  7.3890561 , 20.08553692, 54.59815003])

In [78]:
# Compute exp only where the condition holds; every other position keeps
# the zero that `ar2` was initialised with.
ar2 = np.zeros(5)
np.exp(arr, out=ar2, where=arr > 3)

array([ 0.        ,  0.        ,  0.        ,  0.        , 54.59815003])

In [79]:
ar2

array([ 0.        ,  0.        ,  0.        ,  0.        , 54.59815003])

## UFuncs: Index Alignment

### Index alignment in Series

In [80]:
# Two Series keyed by state name; their indexes only partly overlap.
area = pd.Series(
    data={'Alaska': 1723337, 'Texas': 695662, 'California': 423967},
    name='area',
)
population = pd.Series(
    data={'California': 38332521, 'Texas': 26448193, 'New York': 19651127},
    name='population',
)

In [81]:
area

Alaska        1723337
Texas          695662
California     423967
Name: area, dtype: int64

In [82]:
population

California    38332521
Texas         26448193
New York      19651127
Name: population, dtype: int64

In [83]:
# The operands are aligned on their index labels before dividing.
# The result's index is the union of both indexes; a label present in only
# one operand gets NaN by default.
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [84]:
# Two integer Series whose indexes overlap only on labels 1 and 2.
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])

In [85]:
A

0    2
1    4
2    6
dtype: int64

In [86]:
B

1    1
2    3
3    5
dtype: int64

In [87]:
# Labels are aligned first: the result index is the union of both indexes,
# with NaN wherever only one operand has a value.
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

Calling ``A.add(B)`` is equivalent to ``A + B``, but it allows explicit specification of the fill value for any elements in ``A`` or ``B`` that might be missing:

## 處理空值的方法
##### 運算前先處理
##### dropna()，fillna()

##### 運算中處理
##### fill_value 參數 in 各種function

##### 透過機器學習模型預測空值的實際值
##### scikit-learn 套件，`sklearn.impute` 的 `SimpleImputer`（舊版的 `preprocessing.Imputer` 已於 0.22 版移除）

In [88]:
# Where a label is missing from one operand, substitute 0 for it before adding.
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

### Index alignment in DataFrame

In [89]:
# Re-seed so this 2x2 frame is reproducible on its own.
rng = np.random.RandomState(42)
A = pd.DataFrame(rng.randint(0, 20, size=(2, 2)), columns=['A', 'B'])
A

Unnamed: 0,A,B
0,6,19
1,14,10


In [90]:
# Re-seed, then build a 3x3 frame whose column order (B, A, C) differs from `A`.
rng = np.random.RandomState(42)
B = pd.DataFrame(rng.randint(0, 10, size=(3, 3)), columns=['B', 'A', 'C'])
B

Unnamed: 0,B,A,C
0,6,3,7
1,4,6,9
2,2,6,7


In [91]:
# Alignment happens on both row and column labels, regardless of their order
# in the operands; cells present in only one frame become NaN.
A + B

Unnamed: 0,A,B,C
0,9.0,25.0,
1,20.0,14.0,
2,,,


In [96]:
%timeit A+B

890 µs ± 18.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [92]:
A.add(B)

Unnamed: 0,A,B,C
0,9.0,25.0,
1,20.0,14.0,
2,,,


In [97]:
%timeit A.add(B)

890 µs ± 16.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [93]:
B.add(A, fill_value = 0)

Unnamed: 0,A,B,C
0,9.0,25.0,7.0
1,20.0,14.0,9.0
2,6.0,2.0,7.0


In [94]:
A.add(B,fill_value=0)

Unnamed: 0,A,B,C
0,9.0,25.0,7.0
1,20.0,14.0,9.0
2,6.0,2.0,7.0


For example, we can fill with the mean of all values in ``A`` (computed by first stacking the rows of ``A``):

In [95]:
A

Unnamed: 0,A,B
0,6,19
1,14,10


In [44]:
# DataFrame.mean() averages down each column by default, giving one value per column.
A.mean()

A    10.0
B    14.5
dtype: float64

In [46]:
A.stack()  # move the column labels into the inner level of the row index

0  A     6
   B    19
1  A    14
   B    10
dtype: int64

In [55]:
A

Unnamed: 0,A,B
0,6,19
1,14,10


In [98]:
A.unstack()  # move the row labels under the column labels: a Series indexed by (column, row)

A  0     6
   1    14
B  0    19
   1    10
dtype: int64

In [99]:
fill = A.stack().mean()
fill

12.25

In [52]:
%timeit A.stack().mean()

170 µs ± 1.49 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [50]:
fill1 = np.mean(A.values)
fill1

12.25

In [51]:
%timeit np.mean(A.values)

5.5 µs ± 32 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [35]:
fill = A.stack().mean()
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,9.0,25.0,19.25
1,20.0,14.0,21.25
2,18.25,14.25,19.25


### Review NumPy Operation

In [36]:
# Two-dimensional array: 2 rows by 3 columns.
ar = np.array([[1, 2, 3],
               [10, 20, 30]])
ar

array([[ 1,  2,  3],
       [10, 20, 30]])

In [37]:
# 1d array
ar[0]

array([1, 2, 3])

Subtraction between a two-dimensional array and one of its rows is applied row-wise:

In [38]:
ar-ar[0]

array([[ 0,  0,  0],
       [ 9, 18, 27]])

### Pandas Operation

In Pandas, the convention similarly operates row-wise by default:

In [39]:
# Re-seed and draw a 3x4 array of integers in [0, 10).
# Note: from here on `A` is a NumPy array, not the DataFrame used earlier.
rng = np.random.RandomState(42)
A = rng.randint(0, 10, size=(3, 4))
A

array([[6, 3, 7, 4],
       [6, 9, 2, 6],
       [7, 4, 3, 7]])

In [40]:
# Wrap the array in a DataFrame with column labels Q, R, S, T.
df = pd.DataFrame(A, columns=['Q', 'R', 'S', 'T'])
df

Unnamed: 0,Q,R,S,T
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [41]:
df.iloc[0]

Q    6
R    3
S    7
T    4
Name: 0, dtype: int32

In [42]:
df[0:1]

Unnamed: 0,Q,R,S,T
0,6,3,7,4


In [43]:
# Subtract the first row from every row: the Series labels (Q, R, S, T) are
# matched against the column labels.
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,0,6,-5,2
2,1,1,-4,3


In [44]:
# Method form of the same row-wise subtraction; axis=1 matches the Series
# labels against the column labels.
df.subtract(df.iloc[0], axis=1)

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,0,6,-5,2
2,1,1,-4,3
