In [1]:
import numpy as np
import pandas as pd

None: Pythonic missing data

In [2]:
# Mixing None with integers leaves NumPy no native numeric type to choose,
# so the array is stored with the generic object dtype.
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

In [3]:
# Time the sum of one million elements twice: once stored as generic Python
# objects and once as native integers, to show the cost of the object dtype.
for dtype in ['object','int']:
    print('dtype =', dtype)
    %timeit np.arange(1E6, dtype = dtype).sum()
    print()

dtype = object
82.6 ms ± 3.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype = int
2.96 ms ± 285 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)



In [4]:
# Summing an object array adds the elements one by one, and int + None is
# undefined. The error is the point of this cell, so catch and display it
# instead of letting it halt a Restart & Run All.
try:
    vals1.sum()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [5]:
# Finding the minimum of an object array compares the elements pairwise, and
# an int cannot be ordered against None. The error is the demonstration, so
# catch and display it instead of letting it halt a Restart & Run All.
try:
    vals1.min()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: '<=' not supported between instances of 'int' and 'NoneType'

NaN: Missing numerical data

In [7]:
# NaN is itself a floating-point value, so the array keeps a native numeric
# dtype instead of falling back to object.
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype

dtype('float64')

In [8]:
1+np.nan

nan

In [9]:
0*np.nan

nan

In [10]:
# Ordinary aggregates propagate NaN: a single missing entry makes every result NaN.
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [12]:
# The nan-prefixed aggregates ignore missing entries instead of propagating NaN.
np.nansum(vals2),np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

NaN and None in Pandas

In [13]:
pd.Series([1,np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [14]:
# Start from a plain integer Series so the effect of inserting a null value
# can be observed in the next cell.
x = pd.Series(range(2), dtype=int)
x

0    0
1    1
dtype: int32

In [15]:
# Assigning None into the integer Series upcasts it to float64 and stores the
# missing entry as NaN (see the output below).
x[0] = None
x

0    NaN
1    1.0
dtype: float64

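| Typeclass | Conversion when storing NAs | NA sentinel value   |
|-----------|-----------------------------|---------------------|
| floating  | No change                   | `np.nan`            |
| object    | No change                   | `None` or `np.nan`  |
| integer   | Cast to `float64`           | `np.nan`            |
| boolean   | Cast to `object`            | `None` or `np.nan`  |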

Operating on Null Values

Detecting null values

In [16]:
# A mixed-type Series holding both kinds of null marker: np.nan and None.
data = pd.Series([1, np.nan, "hello", None])

In [17]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [21]:
data[data.notnull()]

0        1
2    hello
dtype: object

Dropping null values

In [22]:
data.dropna()

0        1
2    hello
dtype: object

In [23]:
# Small 3x3 frame with one missing value in the first row and one in the last;
# only the middle row is complete.
df = pd.DataFrame([
    [1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6],
])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [24]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [29]:
df.dropna(axis=0)

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [30]:
# Drop every column that contains at least one null value; only column 2 survives.
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [31]:
df.dropna(axis='rows')

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [32]:
# Add a column made up entirely of NaN; the how= and thresh= examples that follow rely on it.
df[3] = np.nan

In [33]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [34]:
# how='all' drops only the columns whose values are all null, i.e. column 3 here.
df.dropna(axis='columns', how = 'all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [43]:
df.dropna(axis='rows', thresh = 3)
# thresh=3 keeps only rows that have at least three non-null values.
# The first and last rows are dropped because each has only two.



Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


Filling null values

In [44]:
# Labelled numeric Series with two gaps (at 'b' and 'd') used by the fill examples.
data = pd.Series(
    [1, np.nan, 2, None, 3],
    index=["a", "b", "c", "d", "e"],
)
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [45]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [46]:
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [47]:
# Forward fill: each gap takes the last valid value that precedes it.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [48]:
# Back fill: each gap takes the next valid value that follows it.
# Series.bfill() replaces fillna(method='bfill'), which is deprecated since pandas 2.1.
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [49]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [50]:
# Forward fill along each row (axis=1): a null takes the value from the column
# to its left, so a null in the first column has nothing to copy and stays NaN.
# DataFrame.ffill() replaces fillna(method='ffill'), deprecated since pandas 2.1.
df.ffill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


In [51]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [52]:
# Forward fill down each column (axis=0): a null takes the value from the row
# above, so a null in the first row (or an all-null column) stays NaN.
# DataFrame.ffill() replaces fillna(method='ffill'), deprecated since pandas 2.1.
df.ffill(axis=0)

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,2.0,4.0,6,
