In [1]:
import pandas as pd
import numpy as np

## 处理缺失值

In [4]:
# A small string Series with one NaN entry, used to show how missing values are detected.
words = ['aardvark', 'artichoke', np.nan, 'avocado']
string_data = pd.Series(words)
print(string_data, string_data.isnull(), sep="\n")

# Overwrite the first entry with Python's None; isnull() reports it as missing too.
string_data[0] = None
print(string_data.isnull())

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
0    False
1    False
2     True
3    False
dtype: bool
0     True
1    False
2     True
3    False
dtype: bool


### 过滤缺失值

In [6]:
from numpy import nan as NA  # short alias for NaN

# Two equivalent ways to drop missing entries from a Series:
# dropna() and boolean indexing with notnull().
data = pd.Series([1, NA, 3.5, NA, 7])
non_missing = data.notnull()
print(data, data.dropna(), data[non_missing], sep="\n")

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64
0    1.0
2    3.5
4    7.0
dtype: float64
0    1.0
2    3.5
4    7.0
dtype: float64


In [8]:
# Frame with a complete row, partially missing rows and one all-NA row.
rows = [
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
]
data = pd.DataFrame(rows)
cleaned = data.dropna()  # default: drop every row containing any NA
print(data, cleaned, sep="\n")
print(data.dropna(how='all'))  # drop only the rows where every value is NA

# Add an all-NA column labelled 4, then drop the columns that are entirely NA.
data[4] = NA
print(data)
print(data.dropna(axis='columns', how='all'))

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
     0    1    2
0  1.0  6.5  3.0
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


In [10]:
# 7x3 frame of random normal draws; blank out the first four rows of
# column 1 and the first two rows of column 2.
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA

# Show the frame, the rows with no NA at all, and the rows that have
# at least two non-NA values (thresh=2).
for view in (df, df.dropna(), df.dropna(thresh=2)):
    print(view)

          0         1         2
0  0.924823       NaN       NaN
1 -1.001581       NaN       NaN
2  0.070636       NaN  0.601011
3 -0.851477       NaN -0.443560
4 -0.302486 -0.883002  0.306087
5 -0.778317 -1.343997 -1.104546
6  2.009433 -1.070997 -0.467373
          0         1         2
4 -0.302486 -0.883002  0.306087
5 -0.778317 -1.343997 -1.104546
6  2.009433 -1.070997 -0.467373
          0         1         2
2  0.070636       NaN  0.601011
3 -0.851477       NaN -0.443560
4 -0.302486 -0.883002  0.306087
5 -0.778317 -1.343997 -1.104546
6  2.009433 -1.070997 -0.467373


### 补全缺失值

In [11]:
print(df.fillna(value=0))  # replace every NA value with 0
column_defaults = {1: 0.5, 2: 0}  # column label -> fill value
print(df.fillna(value=column_defaults))  # fill different columns with different values

          0         1         2
0  0.924823  0.000000  0.000000
1 -1.001581  0.000000  0.000000
2  0.070636  0.000000  0.601011
3 -0.851477  0.000000 -0.443560
4 -0.302486 -0.883002  0.306087
5 -0.778317 -1.343997 -1.104546
6  2.009433 -1.070997 -0.467373
          0         1         2
0  0.924823  0.500000  0.000000
1 -1.001581  0.500000  0.000000
2  0.070636  0.500000  0.601011
3 -0.851477  0.500000 -0.443560
4 -0.302486 -0.883002  0.306087
5 -0.778317 -1.343997 -1.104546
6  2.009433 -1.070997 -0.467373


In [12]:
# With inplace=True the fill is applied to df itself; the call's return
# value is bound to `_` and otherwise ignored.
_ = df.fillna(value=0, inplace=True)
print(df)  # df was modified in place

          0         1         2
0  0.924823  0.000000  0.000000
1 -1.001581  0.000000  0.000000
2  0.070636  0.000000  0.601011
3 -0.851477  0.000000 -0.443560
4 -0.302486 -0.883002  0.306087
5 -0.778317 -1.343997 -1.104546
6  2.009433 -1.070997 -0.467373


In [13]:
# 6x3 frame of random normal draws with trailing gaps: column 1 is NA from
# row 2 onward, column 2 is NA from row 4 onward.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
print(df)

# DataFrame.ffill() is used instead of fillna(method='ffill'): the `method`
# keyword of fillna is deprecated and emits a FutureWarning on pandas 2.1+.
print(df.ffill())  # forward fill: carry the last valid value downward
print(df.ffill(limit=2))  # fill at most two consecutive NA values

data = pd.Series([1., NA, 3.5, NA, 7])
print(data.fillna(data.mean()))  # fill with the mean of the non-missing values

          0         1         2
0 -0.153056 -0.053037  1.582065
1 -0.005950  0.908369  0.288584
2  0.666006       NaN  0.088508
3  0.946138       NaN -0.979656
4  0.409500       NaN       NaN
5  0.019259       NaN       NaN
          0         1         2
0 -0.153056 -0.053037  1.582065
1 -0.005950  0.908369  0.288584
2  0.666006  0.908369  0.088508
3  0.946138  0.908369 -0.979656
4  0.409500  0.908369 -0.979656
5  0.019259  0.908369 -0.979656
          0         1         2
0 -0.153056 -0.053037  1.582065
1 -0.005950  0.908369  0.288584
2  0.666006  0.908369  0.088508
3  0.946138  0.908369 -0.979656
4  0.409500       NaN -0.979656
5  0.019259       NaN -0.979656
0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64


  print(df.fillna(method='ffill')) # 向前填充
  print(df.fillna(method='ffill', limit=2)) # 最多填充两个
