In [1]:
import numpy as np
from pandas import Series,DataFrame
import pandas as pd

In [3]:
# Build a small Series in which one entry (position 2) is missing (NaN)
missing_data = Series(data=[1, 2, np.nan, 4])
missing_data

0    1.0
1    2.0
2    NaN
3    4.0
dtype: float64

In [5]:
# Boolean mask: True wherever the value is missing (isna is an alias of isnull)
missing_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
# Handling missing data, option 1: discard the entries that are NaN.
# dropna() returns a new Series; missing_data itself is not modified.
missing_data.dropna(axis=0)

0    1.0
1    2.0
3    4.0
dtype: float64

In [8]:
# Build a 4x3 DataFrame with NaNs scattered through it;
# the last row is entirely NaN.
dframe = DataFrame(
    data=[
        [1, 2, 3],
        [np.nan, 5, 6],
        [7, np.nan, 9],
        [np.nan, np.nan, np.nan],
    ]
)
dframe

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [9]:
# Default dropna behaviour, spelled out: remove every row (axis=0)
# that contains at least one NaN (how='any').
rowdrop_dframe = dframe.dropna(axis=0, how='any')
rowdrop_dframe

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In the above case, every row that contained at least one NA value was dropped.

In [10]:
# Drop a row only when every one of its columns is NaN
dframe.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0


In [11]:
# Switch the axis to drop columns (instead of rows) that contain missing data.
# Every column of dframe holds at least one NaN, so all columns are dropped
# and only the index remains.
dframe.dropna(axis='columns', how='any')

0
1
2
3


In [12]:
# Threshold-based dropping, e.g. drop rows with fewer than x data points.
# First build a 4x4 frame whose rows have differing numbers of non-NaN values.
dframe2 = DataFrame(
    data=[
        [1, 2, 3, np.nan],
        [2, np.nan, 5, 6],
        [np.nan, 7, np.nan, 9],
        [1, np.nan, np.nan, np.nan],
    ]
)
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [14]:
# Drop any row that does not have at least 3 non-NaN data points
dframe2.dropna(axis=0, thresh=3)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0


In [16]:
# Drop any column that does not have at least 3 non-NaN data points
dframe2.dropna(axis='columns', thresh=3)

Unnamed: 0,0
0,1.0
1,2.0
2,
3,1.0


In [17]:
# Replace every NaN in the DataFrame with a single default value
dframe2.fillna(value=1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,1.0
1,2.0,1.0,5.0,6.0
2,1.0,7.0,1.0,9.0
3,1.0,1.0,1.0,1.0


In [18]:
# A dict maps column label -> fill value, so each column can get its own default
dframe2.fillna(value={0: 0, 1: 1, 2: 2, 3: 3})

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,3.0
1,2.0,1.0,5.0,6.0
2,0.0,7.0,2.0,9.0
3,1.0,1.0,2.0,3.0


One thing to note is that the original DataFrame on which the `fillna` and `dropna` operations are carried out is not actually modified unless we pass the `inplace=True` parameter.

In [20]:
# Display dframe2 again: the earlier dropna/fillna calls returned new objects,
# so the original frame still contains its NaN values.
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [21]:
# To modify the existing object itself (rather than get a new one back),
# pass inplace=True; the call then returns None and dframe2 is changed.
dframe2.fillna(value=0, inplace=True)
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,0.0
1,2.0,0.0,5.0,6.0
2,0.0,7.0,0.0,9.0
3,1.0,0.0,0.0,0.0
