In [1]:
import numpy as np
import pandas as pd

# Sample score table: one row per student, one column per subject.
# np.nan marks a missing score; the final row ('???') has no scores at all.
df = pd.DataFrame(
    data=[
        [100, 95, np.nan, 70, 80],
        [83, 84, np.nan, 91, 65],
        [88, np.nan, np.nan, 75, 98],
        [82, 81, 93, 84, 100],
        [np.nan] * 4 + [60],
        [np.nan] * 5,
    ],
    index=['小夫', '胖虎', '小名', '靜香', '大熊', '???'],
    columns=['英文', '國文', '數學', '社會', '理化'],
)
df

Unnamed: 0,英文,國文,數學,社會,理化
小夫,100.0,95.0,,70.0,80.0
胖虎,83.0,84.0,,91.0,65.0
小名,88.0,,,75.0,98.0
靜香,82.0,81.0,93.0,84.0,100.0
大熊,,,,,60.0
???,,,,,


# 清除遺漏值

## 預設 dropna(axis=0, how='any', thresh=None, subset=None, inplace=False):只要該列有任何一個遺漏值就丟棄整列

In [2]:
# Drop every row that contains at least one missing value.
# axis=0 / how='any' are the defaults, spelled out here for clarity; df itself is not modified.
df.dropna(axis=0, how='any')

Unnamed: 0,英文,國文,數學,社會,理化
靜香,82.0,81.0,93.0,84.0,100.0


## 將所有都是na的列丟棄

In [3]:
# Drop only the rows in which every value is missing; rows with at least one score are kept.
df.dropna(axis=0, how='all')

Unnamed: 0,英文,國文,數學,社會,理化
小夫,100.0,95.0,,70.0,80.0
胖虎,83.0,84.0,,91.0,65.0
小名,88.0,,,75.0,98.0
靜香,82.0,81.0,93.0,84.0,100.0
大熊,,,,,60.0


## 只考慮選定欄

In [4]:
# Only the columns listed in `subset` are inspected: a row is dropped when it is
# missing a value in either of them; missing values in other columns are ignored.
df.dropna(
    axis=0,
    how='any',
    subset=['英文', '理化'],
)

Unnamed: 0,英文,國文,數學,社會,理化
小夫,100.0,95.0,,70.0,80.0
胖虎,83.0,84.0,,91.0,65.0
小名,88.0,,,75.0,98.0
靜香,82.0,81.0,93.0,84.0,100.0


## 只保留至少有4個非na值的列(thresh=4),其餘丟棄

In [5]:
# thresh=4 keeps only the rows that have at least 4 non-missing values.
# With 5 columns, that means any row missing 2 or more scores is dropped.
df.dropna(axis=0, thresh=4)

Unnamed: 0,英文,國文,數學,社會,理化
小夫,100.0,95.0,,70.0,80.0
胖虎,83.0,84.0,,91.0,65.0
靜香,82.0,81.0,93.0,84.0,100.0


# 將缺漏值補值

In [6]:
# Redisplay the original frame: the dropna calls above were neither assigned back
# nor run with inplace=True, so df still contains all of its missing values.
df

Unnamed: 0,英文,國文,數學,社會,理化
小夫,100.0,95.0,,70.0,80.0
胖虎,83.0,84.0,,91.0,65.0
小名,88.0,,,75.0,98.0
靜香,82.0,81.0,93.0,84.0,100.0
大熊,,,,,60.0
???,,,,,


## 缺漏值補0

In [7]:
# Replace every missing value with 0; returns a new frame and leaves df unchanged.
df.fillna(value=0)

Unnamed: 0,英文,國文,數學,社會,理化
小夫,100.0,95.0,0.0,70.0,80.0
胖虎,83.0,84.0,0.0,91.0,65.0
小名,88.0,0.0,0.0,75.0,98.0
靜香,82.0,81.0,93.0,84.0,100.0
大熊,0.0,0.0,0.0,0.0,60.0
???,0.0,0.0,0.0,0.0,0.0


## 將缺漏補上平均值

In [8]:
# Fill the missing scores in the '英文' column with that column's mean.
# DataFrame.mean() is used instead of np.mean(DataFrame): NumPy forwards axis=None
# to pandas, and pandas >= 2.0 interprets that as a reduction over both axes
# (a single scalar), so what np.mean(df) returns depends on the installed pandas version.
# DataFrame.mean() always returns one mean per column (NaN skipped by default),
# and fillna matches that Series to the columns by label.
df[['英文']].fillna(df[['英文']].mean())

Unnamed: 0,英文
小夫,100.0
胖虎,83.0
小名,88.0
靜香,82.0
大熊,88.25
???,88.25


In [9]:
# Fill each column's missing values with that column's own mean.
# df.mean() returns a Series of per-column means (NaN skipped by default) and
# fillna matches it to the columns by label.
# np.mean(df) must not be used here: NumPy forwards axis=None to pandas, and on
# pandas >= 2.0 that yields a single grand mean over all cells, which would fill
# every gap with the same value instead of the per-column means.
df.fillna(df.mean())

Unnamed: 0,英文,國文,數學,社會,理化
小夫,100.0,95.0,93.0,70.0,80.0
胖虎,83.0,84.0,93.0,91.0,65.0
小名,88.0,86.666667,93.0,75.0,98.0
靜香,82.0,81.0,93.0,84.0,100.0
大熊,88.25,86.666667,93.0,80.0,60.0
???,88.25,86.666667,93.0,80.0,80.6
