### 处理缺失值

### 缺失值的表示形式
#### None: Python 的表示方法

In [10]:
import numpy as np
import pandas as pd
vsls1 = np.array([1,None,3,4,5])
vsls1

array([1, None, 3, 4, 5], dtype=object)

### NaN: NumPy 的浮点型缺失值表示

In [20]:
vsls2 = np.array([1,2,3,np.nan,4])
vsls2

array([ 1.,  2.,  3., nan,  4.])

In [21]:
vsls2.dtype

dtype('float64')

In [22]:
vsls2.sum()

nan

In [25]:
1 + np.nan + 2

nan

In [26]:
np.sum(vsls2)

nan

In [31]:
np.nansum(vsls2)

10.0

### Pandas 中的 NaN 和 None

In [32]:
pd.Series([1,np.nan,2,None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [37]:
x = pd.Series(range(5), dtype = int)
x

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [41]:
x[0] = np.nan

In [42]:
x

0    NaN
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [44]:
x[0] = None
x

0    NaN
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

### 缺失值的操作

### 判断缺失值: isnull() 与 notnull()

In [52]:
data = pd.Series([1,np.nan, 'Hello',None])
data.isnull() # None and np.nan are both treated as missing values

0    False
1     True
2    False
3     True
dtype: bool

In [53]:
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [54]:
data[data.notnull()]

0        1
2    Hello
dtype: object

In [55]:
data[data.isnull()]

1     NaN
3    None
dtype: object

## 剔除缺失值: dropna()

In [58]:
data.dropna()

0        1
2    Hello
dtype: object

In [73]:
df = pd.DataFrame([[1,2,np.nan],
                 [3,4,'HELLO',"Workd"]])

In [74]:
df

Unnamed: 0,0,1,2,3
0,1,2,,
1,3,4,HELLO,Workd


### dropna() - 默认去掉含有 NaN/None 的行

In [75]:
df.dropna()

Unnamed: 0,0,1,2,3
1,3,4,HELLO,Workd


### dropna(axis = 'columns') - 去掉含有 NaN/None 的列

In [77]:
df.dropna(axis = 'columns')

Unnamed: 0,0,1
0,1,2
1,3,4


In [83]:
df[4] = np.nan 
df

Unnamed: 0,0,1,2,3,4
0,1,2,,,
1,3,4,HELLO,Workd,


In [84]:
df.dropna(axis = 'columns',how = 'all') # how='all' drops only the columns whose values are all NaN

Unnamed: 0,0,1,2,3
0,1,2,,
1,3,4,HELLO,Workd


In [85]:
df.dropna(axis = 'columns',how  = 'any') # how defaults to 'any': every column containing at least one NaN is removed

Unnamed: 0,0,1
0,1,2
1,3,4


## 填补缺失值 - fillna()

In [88]:
data = pd.Series( [1,np.nan,2,None,3], index = list('abcde') )
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [89]:
data.fillna("New")

a      1
b    New
c      2
d    New
e      3
dtype: object

In [91]:
data.fillna(110)

a      1.0
b    110.0
c      2.0
d    110.0
e      3.0
dtype: float64

In [93]:
data.fillna(method = 'ffill')  # method='ffill' (forward fill): fill each gap with the previous valid value

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [94]:
data.fillna(method = 'bfill')  # method='bfill' (backward fill): fill each gap with the next valid value

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [95]:
df

Unnamed: 0,0,1,2,3,4
0,1,2,,,
1,3,4,HELLO,Workd,


In [97]:
df.fillna(method = 'ffill', axis = 1)

Unnamed: 0,0,1,2,3,4
0,1,2,2,2,2
1,3,4,HELLO,Workd,Workd


In [100]:
df.fillna(method = 'bfill', axis = 1)

Unnamed: 0,0,1,2,3,4
0,1,2,,,
1,3,4,HELLO,Workd,


In [106]:
df[4] = 1000
df

Unnamed: 0,0,1,2,3,4
0,1,2,,,1000
1,3,4,HELLO,Workd,1000


In [107]:
df.fillna(method = 'bfill', axis = 1) # back-fill along each row: the 1000 in column 4 replaces the None/NaN to its left

Unnamed: 0,0,1,2,3,4
0,1,2,1000,1000,1000
1,3,4,HELLO,Workd,1000
