## Session 13

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Sample data kept in three parallel containers: position i in each one
# refers to the same person.
names = [
    'hamid',
    'sara',
    'ali',
    'reza',
]
heights = np.array((173, 156, 161, 168))
weights = np.array((76, 55, 81, 78))

In [6]:
# A NumPy array is addressed by position only: index 2 is the third element.
heights[2]

161

In [9]:
# A Series pairs every value with a label: the names become the index, so a
# height can be looked up by person instead of by position.
heights_series = pd.Series(data=[173, 156, 161, 168], index=names)
heights_series

hamid    173
sara     156
ali      161
reza     168
dtype: int64

In [10]:
# .loc selects by index label rather than by integer position.
heights_series.loc['ali']

161

In [12]:
# Weights labelled with the same names, so both Series share one index.
weights_series = pd.Series(data=[76, 55, 81, 78], index=names)
weights_series

hamid    76
sara     55
ali      81
reza     78
dtype: int64

### index preservation

In [13]:
# NumPy-style element-wise operations preserve the index when applied to a pandas Series or DataFrame

In [14]:
# The scalar is added to every element; the result keeps the original labels.
heights_series + 10

hamid    183
sara     166
ali      171
reza     178
dtype: int64

In [15]:
# calculating BMI

In [17]:
# BMI = weight / height**2 with the height in metres; the factor 0.01 converts
# the heights from centimetres (presumably the unit of the raw values -- confirm).
# The two Series are combined element by element, matched on their labels.
BMI = weights_series / ((heights_series*0.01)**2)
BMI

hamid    25.393431
sara     22.600263
ali      31.248794
reza     27.636054
dtype: float64

In [18]:
# Arithmetic between two Series yields another Series, not a bare NumPy array.
type(BMI)

pandas.core.series.Series

### index alignment

In [19]:
# The same calculation on plain NumPy arrays, with one weight left out
# (3 weights against 4 heights).
heights = np.array([173, 156, 161, 168])
weights = np.array([76, 55, 78])
# Raises ValueError: NumPy pairs elements by position and cannot combine
# arrays of shapes (3,) and (4,).
weights / ((heights*0.01)**2)

ValueError: operands could not be broadcast together with shapes (3,) (4,) 

In [21]:
names = ['hamid', 'sara', 'ali', 'reza']
# Build each Series from a label -> value mapping; 'ali' has a height but no
# weight entry, so the two indexes no longer match.
heights_series = pd.Series({'hamid': 173, 'sara': 156, 'ali': 161, 'reza': 168})
weights_series = pd.Series({'hamid': 76, 'sara': 55, 'reza': 78})

In [22]:
heights_series

hamid    173
sara     156
ali      161
reza     168
dtype: int64

In [23]:
weights_series

hamid    76
sara     55
reza     78
dtype: int64

In [24]:
# pandas matches elements by index label, not by position: 'ali' has a height
# but no weight, so his result is NaN instead of an error. The result carries
# the labels of both operands (displayed here in sorted order).
weights_series / ((heights_series*0.01)**2)

ali            NaN
hamid    25.393431
reza     27.636054
sara     22.600263
dtype: float64

### missing values

In [26]:
# there are 2 different aspects of this problem:
# first, storing and displaying the missing values
# second, handling the missing values (replacing or estimating them)

In [29]:
# there are 2 methods for first aspect (storing) :
# mask
# sentinel

In [31]:
# mask method: a separate boolean list marks which positions are missing;
# the value stored at a masked position (the first 0 here) is only a placeholder
temp = [32, 11, 0, 48, 0]
missing = [False, False, True, False, False]

In [35]:
# sentinel method: a reserved value stored inside the data itself marks a missing entry
temp = [32, 11, None, 48, 0]
# note that for an integer array the sentinel must itself be an integer (for example: -128);
# None can be used as the sentinel only when the elements are generic objects

In [43]:
# pandas uses the sentinel method
# note that the sentinel value depends on the array's data type
# for float --> NaN (unlike None, it can be used in calculations, but the result will be NaN)
# for object --> None
# for int --> no sentinel value (error if a value is missing); pandas' nullable 'Int64' dtype is the alternative
# for String --> None (a string is also stored as an object)

In [38]:
# float64 has NaN available, so the None in temp is stored as NaN
a = pd.Series(data=temp, dtype='float64')
a

0    32.0
1    11.0
2     NaN
3    48.0
4     0.0
dtype: float64

In [39]:
# with the object dtype the elements stay Python objects, so None is kept as it is
b = pd.Series(data=temp, dtype='object')
b

0      32
1      11
2    None
3      48
4       0
dtype: object

In [41]:
# demonstration of the failure case: the int dtype has no value that can stand
# in for "missing", so converting the None in temp raises TypeError
c = pd.Series(data=temp, dtype='int')
c

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [46]:
# Strings are stored with the generic object dtype, so the missing entry is the
# None object itself -- not the four-character text 'None', which would be an
# ordinary (non-missing) string that merely prints the same way.
# dtype='object' is spelled out so the sentinel stays None and the printed
# dtype stays "object" regardless of how pandas infers a dtype for strings.
d = pd.Series(data=['aaa', 'bbb', None, 'ccc'], dtype='object')
d

0     aaa
1     bbb
2    None
3     ccc
dtype: object

In [48]:
# with no dtype given, pandas infers one: because of the None, the integers
# are upcast to float64 so that NaN can mark the missing entry
e = pd.Series([32, 11, None, 48, 0])
e

0    32.0
1    11.0
2     NaN
3    48.0
4     0.0
dtype: float64

In [50]:
# but casting int to float is not always desirable: integer arithmetic is exact,
# whereas binary floating point cannot represent every decimal fraction exactly
# (and float64 cannot represent every integer above 2**53 either).
# the calculation below shows such a rounding error:
0.2 + 0.1

0.30000000000000004

In [51]:
# small integers stored as generic Python objects
x = pd.Series([1, 3, 2, 5], dtype=object)
x

0    1
1    3
2    2
3    5
dtype: object

In [52]:
# the same integers stored with the native integer dtype
y = pd.Series([1, 3, 2, 5], dtype=int)
y

0    1
1    3
2    2
3    5
dtype: int64

In [None]:
# if int does not support sentinel, why don't we use objects instead of ints ?

In [54]:
# arithmetic also works on an object Series; note that the result is still dtype object
x + 1

0    2
1    4
2    3
3    6
dtype: object

In [63]:
# the reason is performance: int is a processor-native type, so operations on it are
# very fast, whereas objects need much more time and space to operate on.
# let's see an example of that:

In [57]:
# 10,000 integers stored as generic Python objects (the slow case for the timing below)
x = pd.Series(range(10_000), dtype=object)
x

0          0
1          1
2          2
3          3
4          4
        ... 
9995    9995
9996    9996
9997    9997
9998    9998
9999    9999
Length: 10000, dtype: object

In [58]:
# the same 10,000 integers stored with the native integer dtype (the fast case)
y = pd.Series(range(10_000), dtype=int)
y

0          0
1          1
2          2
3          3
4          4
        ... 
9995    9995
9996    9996
9997    9997
9998    9998
9999    9999
Length: 10000, dtype: int64

In [61]:
%timeit x.sum()

890 µs ± 20.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [62]:
%timeit y.sum()

62.4 µs ± 4.7 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [64]:
# as you can see, the calculation on the int data type runs much faster (roughly 14x in this run)

In [65]:
# second aspect of the missing values problem:
# handling the missing values (replacing or estimating them)

In [66]:
names = ['hamid', 'sara', 'ali', 'reza', 'nahid', 'saeed']
# one height (reza) and one weight (sara) are missing; both Series end up float64
heights = pd.Series([173, 156, 161, np.nan, 168, 180], index=names, dtype='float64')
weights = pd.Series([76, np.nan, 81, 78, 49, 80], index=names)

In [67]:
heights

hamid    173.0
sara     156.0
ali      161.0
reza       NaN
nahid    168.0
saeed    180.0
dtype: float64

In [68]:
weights

hamid    76.0
sara      NaN
ali      81.0
reza     78.0
nahid    49.0
saeed    80.0
dtype: float64

In [69]:
# combine both Series into one table: each Series becomes a column and the
# shared name index becomes the row labels
df = pd.DataFrame(data={'height': heights, 'weight': weights})
df

Unnamed: 0,height,weight
hamid,173.0,76.0
sara,156.0,
ali,161.0,81.0
reza,,78.0
nahid,168.0,49.0
saeed,180.0,80.0


In [70]:
# NaN propagates through arithmetic: the result is NaN wherever the height or
# the weight is missing
weights / ((heights*0.01)**2)

hamid    25.393431
sara           NaN
ali      31.248794
reza           NaN
nahid    17.361111
saeed    24.691358
dtype: float64

In [76]:
heights.ffill() # forward fill (replaces each NaN with the last valid value before it)
                # although it's not a good solution here

hamid    173.0
sara     156.0
ali      161.0
reza     161.0
nahid    168.0
saeed    180.0
dtype: float64

In [78]:
heights.bfill() # backward fill (replaces each NaN with the next valid value after it)
                # although it's not a good solution here

hamid    173.0
sara     156.0
ali      161.0
reza     168.0
nahid    168.0
saeed    180.0
dtype: float64

In [79]:
heights.fillna(170) # fills every missing value with the provided value

hamid    173.0
sara     156.0
ali      161.0
reza     170.0
nahid    168.0
saeed    180.0
dtype: float64

In [82]:
df

Unnamed: 0,height,weight
hamid,173.0,76.0
sara,156.0,
ali,161.0,81.0
reza,,78.0
nahid,168.0,49.0
saeed,180.0,80.0


In [80]:
from sklearn.impute import KNNImputer

In [85]:
# KNNImputer estimates each missing value as the average of that column over
# the K nearest rows (K = n_neighbors = 2 here); nearness is measured with a
# euclidean distance computed on the values that are present.
# fit_transform returns a plain NumPy array, so the row and column labels of df are lost.
imputer = KNNImputer(n_neighbors=2)
imputer.fit_transform(df)

array([[173. ,  76. ],
       [156. ,  65. ],
       [161. ,  81. ],
       [176.5,  78. ],
       [168. ,  49. ],
       [180. ,  80. ]])