## Handling Missing Data

In [1]:
import pandas as pd
import numpy as np

### Prepare the data frame

In [2]:
# Row index: six consecutive calendar days beginning 2017-01-01.
dates = pd.date_range(start="20170101", periods=6)

# Body: the integers 0..23 laid out row by row in a 6x4 grid, one column per letter.
df = pd.DataFrame(
    data=np.arange(6 * 4).reshape(6, 4),
    index=dates,
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
2017-01-01,0,1,2,3
2017-01-02,4,5,6,7
2017-01-03,8,9,10,11
2017-01-04,12,13,14,15
2017-01-05,16,17,18,19
2017-01-06,20,21,22,23


### Set some NaN data

In [3]:
# NaN is a float value, so the two target columns are converted to float
# explicitly before the assignment. Writing NaN straight into an integer column
# relies on pandas silently upcasting it, which newer pandas releases deprecate.
df = df.astype({"C": "float64", "D": "float64"})

df.iloc[2, 2] = np.nan  # third row (2017-01-03), column C
df.iloc[5, 3] = np.nan  # sixth row (2017-01-06), column D
df

Unnamed: 0,A,B,C,D
2017-01-01,0,1,2.0,3.0
2017-01-02,4,5,6.0,7.0
2017-01-03,8,9,,11.0
2017-01-04,12,13,14.0,15.0
2017-01-05,16,17,18.0,19.0
2017-01-06,20,21,22.0,


### Drop rows that contain NaNs
- Drop a row if any of its values is NaN : df.dropna( axis = 0 , how = 'any' )
- Drop a row only if all of its values are NaN : df.dropna( axis = 0 , how = 'all' )

In [4]:
# Keep only the rows that are completely populated: a single NaN anywhere in a
# row is enough to discard it. dropna returns a new frame; df is left untouched.
df1 = df.dropna(axis="index", how="any")
df1

Unnamed: 0,A,B,C,D
2017-01-01,0,1,2.0,3.0
2017-01-02,4,5,6.0,7.0
2017-01-04,12,13,14.0,15.0
2017-01-05,16,17,18.0,19.0


In [5]:
# Discard a row only when every one of its values is NaN. No row of df is
# entirely NaN, so the result has the same rows as df.
df2 = df.dropna(axis="index", how="all")
df2

Unnamed: 0,A,B,C,D
2017-01-01,0,1,2.0,3.0
2017-01-02,4,5,6.0,7.0
2017-01-03,8,9,,11.0
2017-01-04,12,13,14.0,15.0
2017-01-05,16,17,18.0,19.0
2017-01-06,20,21,22.0,


### Fill the NaNs with a value
- df.fillna( value = fill_value ) 

In [6]:
# Substitute 0 for every NaN cell; the result is a new frame and df keeps its NaNs.
df3 = df.fillna(0)
df3

Unnamed: 0,A,B,C,D
2017-01-01,0,1,2.0,3.0
2017-01-02,4,5,6.0,7.0
2017-01-03,8,9,0.0,11.0
2017-01-04,12,13,14.0,15.0
2017-01-05,16,17,18.0,19.0
2017-01-06,20,21,22.0,0.0


### Check whether the data frame contains NaN values
- df.isnull()

In [7]:
# Boolean mask with the same shape as df: True marks a NaN cell.
# (isna and isnull are two names for the same method.)
df.isna()

Unnamed: 0,A,B,C,D
2017-01-01,False,False,False,False
2017-01-02,False,False,False,False
2017-01-03,False,False,True,False
2017-01-04,False,False,False,False
2017-01-05,False,False,False,False
2017-01-06,False,False,False,True


In [8]:
# Use numpy to detect whether df holds at least one NaN.
# .values exposes the mask as a plain ndarray, so .any() reduces over both axes
# to one scalar; the result is already boolean, so no `== True` comparison is needed.
df.isnull().values.any()

True