In [1]:
import pandas as pd
import numpy as np

In [3]:
#Build a small demo frame: columns A and B, seven rows, with NaN marking the missing entries
#(rows 3 and 5 are missing in both columns; row 1 is missing in B only)
my_df = pd.DataFrame(
    data = {
        "A" : [1, 2, 4, np.nan, 5, np.nan, 7],
        "B" : [4, np.nan, 7, np.nan, 1, np.nan, 2],
    }
)

In [4]:
#Finding missing values with pandas
#isna() gives back a frame of the same shape holding booleans:
#True - the value is missing, False - the value is not missing
my_df.isna()

Unnamed: 0,A,B
0,False,False
1,False,True
2,False,False
3,True,True
4,False,False
5,True,True
6,False,False


In [5]:
#Counting missing values with pandas
#Summing the boolean mask counts the Trues, i.e. the number of missing values in each column
my_df.isna().sum()


A    2
B    3
dtype: int64

In [6]:
#Dropping missing values with pandas
#Rows containing a missing value are removed from the displayed result;
#the result is not assigned, so later cells still see every row of my_df
my_df.dropna()

Unnamed: 0,A,B
0,1.0,4.0
2,4.0,7.0
4,5.0,1.0
6,7.0,2.0


In [7]:
#dropna takes a how parameter:
#"any" - drop a row if any of its values is missing
#"all" - drop a row only if every one of its values is missing

my_df.dropna(how = "any")

Unnamed: 0,A,B
0,1.0,4.0
2,4.0,7.0
4,5.0,1.0
6,7.0,2.0


In [8]:
#Rows with any missing values have been dropped - same result as the previous run,
#because "any" is the default value of how

In [10]:
#how = "all" drops a row only when every value in it is missing
my_df.dropna(how = "all")

Unnamed: 0,A,B
0,1.0,4.0
1,2.0,
2,4.0,7.0
4,5.0,1.0
6,7.0,2.0


In [11]:
#Only rows with all values missing have been dropped

In [12]:
#subset limits the missing-value check to the listed columns,
#so a row whose only missing value is in B (row 1) is kept
my_df.dropna(how = "any", subset = ["A"]) #Only look at the columns listed here - only A here

Unnamed: 0,A,B
0,1.0,4.0
1,2.0,
2,4.0,7.0
4,5.0,1.0
6,7.0,2.0


In [14]:
#Include inplace=True if we want to commit the changes to the data frame itself
#(this cell displays no result; the next cell shows my_df with the rows removed)
my_df.dropna(how = "any", inplace=True) 

In [15]:
my_df #The changes are now committed to my_df itself

Unnamed: 0,A,B
0,1.0,4.0
2,4.0,7.0
4,5.0,1.0
6,7.0,2.0


In [16]:
#Filling missing values with pandas
#Dropping rows is recommended over imputing,
#but imputing can be useful when we don't have a lot of data
#Rebuild the demo frame first, because the earlier in-place dropna removed its rows with missing values
my_df = pd.DataFrame(
    data = {
        "A" : [1, 2, 4, np.nan, 5, np.nan, 7],
        "B" : [4, np.nan, 7, np.nan, 1, np.nan, 2],
    }
)

In [17]:
my_df.fillna(value = 100)
#Every missing value has been filled with 100

Unnamed: 0,A,B
0,1.0,4.0
1,2.0,100.0
2,4.0,7.0
3,100.0,100.0
4,5.0,1.0
5,100.0,100.0
6,7.0,2.0


In [18]:
#We need a mean value for column A to use as the fill value
#(mean() leaves the missing entries out of the average)
mean_value = my_df["A"].mean()

In [19]:
#Display the mean of column A
mean_value

3.8

In [20]:
#Fill the missing values in column A with the column's mean (the result is displayed, not stored)
my_df["A"].fillna(value = mean_value)

0    1.0
1    2.0
2    4.0
3    3.8
4    5.0
5    3.8
6    7.0
Name: A, dtype: float64

In [21]:
#Now 3.8 has been populated for the missing rows in column A

In [22]:
#We need to commit this to the entire data set
#my_df.mean() gives one mean per column, so each column's missing values are filled with that column's own mean
#Assign the result back to my_df to commit it, rather than mutating the frame with inplace=True
my_df = my_df.fillna(value = my_df.mean())

In [23]:
#Display my_df to confirm the filled values were committed
my_df

Unnamed: 0,A,B
0,1.0,4.0
1,2.0,3.5
2,4.0,7.0
3,3.8,3.5
4,5.0,1.0
5,3.8,3.5
6,7.0,2.0


In [None]:
#Missing values in column A have been filled with 3.8 (the mean of A), and those in column B with 3.5 (the mean of B)