# 建立一個有缺失值的DataFrame

In [1]:
import pandas as pd
import numpy as np

In [17]:
df = pd.DataFrame([\
['frank', 'M', np.nan], \
['mary', np.nan, np.nan], \
['tom' , 'M', 35], \
['ted' , 'M', 33], \
['jean' , np.nan, 21], \
['lisa', 'F', 20]])
df.columns = ['name', 'gender', 'age']
df

Unnamed: 0,name,gender,age
0,frank,M,
1,mary,,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


## 檢察缺失值

In [4]:
df["name"].isnull().sum()

0

In [5]:
df["name"].notnull().sum()

6

In [7]:
df.gender.isnull().values.any() #檢查是否還有缺失值

True

# 處理缺失值

### 所有缺失值補上"-"

In [8]:
df.fillna("-")

Unnamed: 0,name,gender,age
0,frank,M,-
1,mary,-,-
2,tom,M,35.0
3,ted,M,33.0
4,jean,-,21.0
5,lisa,F,20.0


### 捨棄缺失值(占比很低時)

In [10]:
#有缺失值的資料都丟掉
df.dropna()

Unnamed: 0,name,gender,age
2,tom,M,35.0
3,ted,M,33.0
5,lisa,F,20.0


In [11]:
#捨棄所有欄位都有缺失值的行

In [12]:
df.dropna(how="all")

Unnamed: 0,name,gender,age
0,frank,M,
1,mary,,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


In [13]:
#捨棄超過兩欄有缺失值的
df.dropna(thresh=2)

Unnamed: 0,name,gender,age
0,frank,M,
2,tom,M,35.0
3,ted,M,33.0
4,jean,,21.0
5,lisa,F,20.0


In [19]:
df["employee"] = np.nan
df

Unnamed: 0,name,gender,age,employee
0,frank,M,,
1,mary,,,
2,tom,M,35.0,
3,ted,M,33.0,
4,jean,,21.0,
5,lisa,F,20.0,


# 利用統計填補缺失值

In [20]:
#以0填補
df["age"].fillna(0)

0     0.0
1     0.0
2    35.0
3    33.0
4    21.0
5    20.0
Name: age, dtype: float64

In [21]:
#以平均數填補
df["age"].fillna(df["age"].mean())

0    27.25
1    27.25
2    35.00
3    33.00
4    21.00
5    20.00
Name: age, dtype: float64

### 以分組平均來填補缺失值

In [27]:
#依據性別分組計"平均年齡"
df.groupby("gender")["age"].mean()


gender
F    20.0
M    34.0
Name: age, dtype: float64

In [30]:
df["age"].fillna(df.groupby("gender")["age"].transform("mean"))   
df

Unnamed: 0,name,gender,age,employee
0,frank,M,,
1,mary,,,
2,tom,M,35.0,
3,ted,M,33.0,
4,jean,,21.0,
5,lisa,F,20.0,


## 如果數據有趨勢，使用內插法

In [34]:
#.interpolate()
df2 = pd.DataFrame([[1, 870],\
[2, 900],\
[np.nan, np.nan],\
[4, 950],\
[5,1080],\
[6,1200]])
df2.columns = ['time','val']
df2

Unnamed: 0,time,val
0,1.0,870.0
1,2.0,900.0
2,,
3,4.0,950.0
4,5.0,1080.0
5,6.0,1200.0


In [33]:
df2.interpolate()


Unnamed: 0,time,val
0,1.0,870.0
1,2.0,900.0
2,3.0,925.0
3,4.0,950.0
4,5.0,1080.0
5,6.0,1200.0
