In [1]:
import numpy as np
import pandas as pd

In [6]:
df = pd.DataFrame(
    {'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [10, 20, 30, 40]})

In [7]:
# Strategy 1: Leave it as missing
df

Unnamed: 0,A,B,C
0,1.0,5.0,10
1,2.0,,20
2,,,30
3,4.0,8.0,40


In [8]:
# Strategy 2: Remove missing value
# drop by axis. Default axis=0 (rows)
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,10
3,4.0,8.0,40


In [9]:
# drop by axis. axis=1 (column)
df.dropna(axis=1)

Unnamed: 0,C
0,10
1,20
2,30
3,40


In [11]:
# setting up based on threshold
# all column satisfy the threshold
# max non-null values are more than 2
df.dropna(axis=1, thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,10
1,2.0,,20
2,,,30
3,4.0,8.0,40


In [12]:
# column B doesn't satisfy the threshold
df.dropna(axis=1, thresh=3)

Unnamed: 0,A,C
0,1.0,10
1,2.0,20
2,,30
3,4.0,40


In [13]:
# Strategy 3: Fill missing values
df.fillna(value='FILL VALUE')

Unnamed: 0,A,B,C
0,1,5,10
1,2,FILL VALUE,20
2,FILL VALUE,FILL VALUE,30
3,4,8,40


In [14]:
df.fillna(value='0')

Unnamed: 0,A,B,C
0,1,5,10
1,2,0,20
2,0,0,30
3,4,8,40


In [16]:
# fill missing value, based on column
df['A'] = df['A'].fillna(value=0)

In [17]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,10
1,2.0,,20
2,0.0,,30
3,4.0,8.0,40


In [18]:
# fill missing value with mean
# noted. It is a clever strategy!
df['B'].mean()

6.5

In [19]:
df['B'].fillna(value=df['B'].mean())

0    5.0
1    6.5
2    6.5
3    8.0
Name: B, dtype: float64

In [20]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,10
1,2.0,,20
2,0.0,,30
3,4.0,8.0,40


In [21]:
df.fillna(df.mean())

Unnamed: 0,A,B,C
0,1.0,5.0,10
1,2.0,6.5,20
2,0.0,6.5,30
3,4.0,8.0,40
