# Mod11 Ufuncs in Pandas

## Ufuncs: Index Preservation

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Display the installed pandas version.
pd.__version__

'1.1.4'

In [3]:
# Display the installed NumPy version.
np.__version__

'1.19.4'

In [5]:
# Seeded generator so the example values are reproducible.
rng = np.random.RandomState(seed=42)
# Four random integers drawn from [0, 10), wrapped in a Series that gets the
# default integer index.
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [6]:
ser.values # the Series' underlying data as a NumPy array

array([6, 3, 7, 4])

In [7]:
# Re-seed so this DataFrame example is reproducible on its own.
rng = np.random.RandomState(seed=777)
# 3x4 table of random integers drawn from [0, 10), columns labelled A to D.
df = pd.DataFrame(rng.randint(low=0, high=10, size=(3, 4)), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,7,6,7,1
1,7,4,7,9
2,8,7,2,0


In [9]:
# Underlying data of the DataFrame as a 2-D NumPy array.
df.values

array([[7, 6, 7, 1],
       [7, 4, 7, 9],
       [8, 7, 2, 0]])

In [10]:
# Apply a NumPy ufunc directly to the Series: the result is again a Series
# carrying the same index.
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [11]:
# np.exp applied to a whole DataFrame returns a DataFrame with the row and
# column labels preserved.
np.exp(df)

Unnamed: 0,A,B,C,D
0,1096.633158,403.428793,1096.633158,2.718282
1,1096.633158,54.59815,1096.633158,8103.083928
2,2980.957987,1096.633158,7.389056,1.0


In [12]:
# Ufunc on a single column (a Series); the column name is kept as the
# result's name.
np.exp(df['A'])

0    1096.633158
1    1096.633158
2    2980.957987
Name: A, dtype: float64

In [13]:
# Ufunc on a two-column selection; the result contains only columns B and D.
np.exp(df[['B','D']])

Unnamed: 0,B,D
0,403.428793,2.718282
1,54.59815,8103.083928
2,1096.633158,1.0


In [20]:
%%timeit
np.exp(df[0:2])  # plain [] slicing with integers is positional: the end row is excluded

70 µs ± 308 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [21]:
%%timeit
np.exp(df.loc[0:2]) # .loc slices by label: the end label is included

75.9 µs ± 272 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [19]:
%%timeit
np.exp(df.iloc[0:2]) # .iloc slices by integer position: the end row is excluded

68.5 µs ± 966 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [15]:
# Ufunc on a single row selected by position; the result is a Series indexed
# by the column labels.
np.exp(df.iloc[1])

A    1096.633158
B      54.598150
C    1096.633158
D    8103.083928
Name: 1, dtype: float64

In [16]:
# Arithmetic and a ufunc combined: the result is still a DataFrame with the
# labels of df.
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-0.7071068,-1.0,-0.707107,0.707107
1,-0.7071068,1.224647e-16,-0.707107,0.707107
2,-2.449294e-16,-0.7071068,1.0,0.0


## Ufuncs: Index Alignment

### Index alignment in Series

In [13]:
# Three-element integer array.
ar1 = np.array([11, 22, 33])
ar1

array([11, 22, 33])

In [15]:
# Odd numbers 3, 5, 7 (the stop value 8 is excluded).
ar2 = np.arange(3, 8, 2)
ar2

array([3, 5, 7])

In [16]:
# Element-wise division: both arrays have three elements, so the shapes match.
ar1/ar2


array([3.66666667, 4.4       , 4.71428571])

In [18]:
# Odd numbers 3, 5, 7, 9 (the stop value 10 is excluded): four elements.
ar3 = np.arange(3, 10, 2)
ar3

array([3, 5, 7, 9])

In [19]:
# ar1 has shape (3,) and ar3 has shape (4,): same number of dimensions, but
# the lengths differ and neither is 1, so NumPy cannot broadcast them.
# The error is the point of this cell, so it is caught and shown rather than
# left to abort a "Restart Kernel -> Run All".
try:
    ar1 / ar3
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: operands could not be broadcast together with shapes (3,) (4,) 

In [20]:
# Area values labelled by state name.
area = pd.Series(
    [1723337, 695662, 423967],
    index=['Alaska', 'Texas', 'California'],
    name='area',
)
# Population values labelled by state name; only Texas and California also
# appear in `area`.
population = pd.Series(
    [38332521, 26448193, 19651127],
    index=['California', 'Texas', 'New York'],
    name='population',
)

In [21]:
# Display the area Series (indexed by state name).
area

Alaska        1723337
Texas          695662
California     423967
Name: area, dtype: int64

In [22]:
# Display the population Series (indexed by state name).
population

California    38332521
Texas         26448193
New York      19651127
Name: population, dtype: int64

In [23]:
population / area
# The resulting Series is indexed by the union of the two input indices;
# labels present in only one input are filled with NaN by default.

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [24]:
# The division returns a new Series; it is bound to `density` for the
# null check that follows.
density = population / area
# End on the name so the cell actually displays the result.
density

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [25]:
# Check whether the result contains any missing (NaN) values.
density.isnull().any()

True

In [26]:
# A gets the default integer index (0, 1, 2); B is given the explicit index
# (1, 2, 3), so the two Series share only the labels 1 and 2.
A = pd.Series(data=[2, 4, 6])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])

In [27]:
A # the index (0, 1, 2) was assigned automatically

0    2
1    4
2    6
dtype: int64

In [28]:
# B carries the explicitly supplied index 1, 2, 3.
B

1    1
2    3
3    5
dtype: int64

In [29]:
A + B  # a label missing from either operand yields NaN, and the result becomes float

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

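If NaN is not the desired result, use the method form of the operator: calling ``A.add(B)`` is equivalent to ``A + B``, but allows explicit specification of the fill value for any elements in ``A`` or ``B`` that might be missing: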

In [32]:
# Treat a label that is missing from one operand as fill_value instead of
# producing NaN.
A.add(B, fill_value=0)
# The result dtype is float64 even though both inputs are int64 -- presumably
# because alignment introduces NaN before the fill value is applied.

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [35]:
# if no NaN, data type remain same
C = pd.Series([2, 4, 6])
D = pd.Series([1, 3, 5])
C+D

0     3
1     7
2    11
dtype: int64

### Index alignment in DataFrame

In [36]:
# Re-seed so the frame is reproducible; note that this rebinds `A`, which
# earlier in the notebook held a Series.
rng = np.random.RandomState(seed=42)
# 2x2 frame of random integers drawn from [0, 20), columns A and B.
A = pd.DataFrame(rng.randint(low=0, high=20, size=(2, 2)), columns=['A', 'B'])
A

Unnamed: 0,A,B
0,6,19
1,14,10


In [37]:
# Re-seed so the frame is reproducible; note that this rebinds `B`, which
# earlier in the notebook held a Series.
rng = np.random.RandomState(seed=42)
# 3x3 frame of random integers drawn from [0, 10); the column order B, A, C
# is deliberately different from alphabetical order.
B = pd.DataFrame(rng.randint(low=0, high=10, size=(3, 3)), columns=['B', 'A', 'C'])
B

Unnamed: 0,B,A,C
0,6,3,7
1,4,6,9
2,2,6,7


In [39]:
# Alignment happens on both rows and columns: the result spans the union of
# the labels, with NaN wherever either frame has no value.
A + B

Unnamed: 0,A,B,C
0,9.0,25.0,
1,20.0,14.0,
2,,,


In [45]:
# With fill_value=0 every position in the result is covered by at least one
# of the two frames here, so no NaN remains and the float result can be cast
# to an integer dtype.
A.add(B, fill_value=0).astype('int32')

Unnamed: 0,A,B,C
0,9,25,7
1,20,14,9
2,6,2,7


The fill value does not have to be a constant. Here we fill with the mean of all values in ``A`` (computed by first stacking the rows of ``A``):

In [46]:
# DataFrame.mean() only averages each column separately (one value per column).
A.mean()

A    10.0
B    14.5
dtype: float64

In [47]:
# stack() flattens the frame into a single Series with a (row, column) MultiIndex.
A.stack()

0  A     6
   B    19
1  A    14
   B    10
dtype: int64

In [53]:
# Flatten first to collect every value in A, then take the mean of all of
# them (rather than one mean per column).
fill = A.stack().mean()
fill

12.25

In [54]:
# A position that is missing from one of the two frames is treated as `fill`
# before the addition.
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,9.0,25.0,19.25
1,20.0,14.0,21.25
2,18.25,14.25,19.25
