# 过滤缺失值

dropna()

## 1. 过滤缺失值(Series)

In [2]:
import numpy as np
import pandas as pd

In [2]:
from numpy import nan as NA

In [3]:
# Float Series whose entries at positions 1 and 3 are missing (NaN).
data = pd.Series([1.0, NA, 3.5, NA, 7.0])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [6]:
# Keep only the non-missing entries; the surviving rows keep their original
# index labels (0, 2, 4).
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
# Boolean-mask form of the same filter: select the entries that are not NaN.
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

## 2. 过滤缺失值(DataFrame)

In [10]:
# 4x3 frame mixing complete, partially missing and fully missing rows.
df = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)

In [11]:
# Display the frame: rows 1-3 each contain at least one missing value.
df 

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
# Drop every row that has at least one NaN (dropna's defaults, spelled out).
cleaned = df.dropna(axis=0, how='any')

In [13]:
# Only row 0 has no missing value, so it is the only row left.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [14]:
# how='all' drops only the rows in which every value is NaN (row 2 here).
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [18]:
# Add a new column labelled 4 that is NaN in every row.
df[4] = NA

In [19]:
# Display the frame again: column 4 now exists and is entirely NaN.
df

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [21]:
# axis=1 applies the rule to columns: drop the columns that are entirely NaN
# (column 4 here); rows are left untouched.
df.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


## 使用 thresh 参数过滤缺失值

In [24]:
# 7x3 frame of standard-normal random samples (values differ on every run).
df2 = pd.DataFrame(np.random.standard_normal((7, 3)))

In [27]:
# Blank out the first four rows (positions 0-3) of column 1.
df2.iloc[:4, 1] = NA

In [28]:
# Blank out the first two rows (positions 0-1) of column 2.
df2.iloc[:2, 2] = NA

In [30]:
# Blank out the first row (position 0) of column 0, making row 0 all-NaN.
df2.iloc[:1, 0] = NA

In [35]:
# Display the frame: rows 0-3 have 3, 2, 1 and 1 missing values respectively.
df2

Unnamed: 0,0,1,2
0,,,
1,-1.431415,,
2,-0.490505,,-0.972433
3,-0.010548,,-0.851187
4,0.740992,0.507384,-0.312183
5,-1.043333,-0.404996,0.870679
6,-0.392122,0.139708,0.877711


In [32]:
# Default dropna(): keep only the rows that contain no NaN at all.
df2.dropna()

Unnamed: 0,0,1,2
4,0.740992,0.507384,-0.312183
5,-1.043333,-0.404996,0.870679
6,-0.392122,0.139708,0.877711


In [34]:
# thresh=2 keeps the rows that have at least 2 non-NaN values.
df2.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.490505,,-0.972433
3,-0.010548,,-0.851187
4,0.740992,0.507384,-0.312183
5,-1.043333,-0.404996,0.870679
6,-0.392122,0.139708,0.877711


In [36]:
# thresh=3 requires at least 3 non-NaN values; with only 3 columns this gives
# the same result as the default dropna().
df2.dropna(thresh=3)

Unnamed: 0,0,1,2
4,0.740992,0.507384,-0.312183
5,-1.043333,-0.404996,0.870679
6,-0.392122,0.139708,0.877711


In [37]:
# thresh=1 keeps every row with at least one non-NaN value, so only the
# all-NaN row 0 is dropped.
df2.dropna(thresh=1)

Unnamed: 0,0,1,2
1,-1.431415,,
2,-0.490505,,-0.972433
3,-0.010548,,-0.851187
4,0.740992,0.507384,-0.312183
5,-1.043333,-0.404996,0.870679
6,-0.392122,0.139708,0.877711


<br>

## 过滤异常值

In [3]:
# 1000 rows x 4 columns of standard-normal random samples.
data2 = pd.DataFrame(np.random.standard_normal((1000, 4)))

In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data2.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.023257,-0.004367,-0.048793,0.048669
std,1.002324,0.975056,0.962531,0.998814
min,-2.975897,-2.515078,-2.77894,-2.767317
25%,-0.600241,-0.687939,-0.72862,-0.638818
50%,0.01782,-0.064037,-0.023435,0.037827
75%,0.695162,0.695309,0.583707,0.735671
max,3.285162,3.467619,3.495247,3.30591


## 找出某一列中绝对值大于 3 的值

In [10]:
# Take column 0 and keep only its entries whose absolute value exceeds 3.
col = data2[0]
col[col.abs() > 3]

247    3.285162
497    3.087342
Name: 0, dtype: float64

## 找出至少有一个值大于 3 或小于 -3 的行

In [13]:
# Select every row that contains at least one value with |x| > 3.
# The axis is passed by keyword: the positional form `.any(1)` was deprecated
# in pandas 1.x and is rejected with a TypeError by pandas 2.x.
data2[(np.abs(data2) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
108,0.46818,3.147153,-0.188581,-1.270411
247,3.285162,-0.434279,-1.695953,-0.258822
409,-1.27941,0.537989,0.483499,3.088037
459,1.190925,3.467619,1.739601,-1.108778
463,-1.730079,-1.471566,3.495247,2.892337
497,3.087342,-0.297694,0.496733,1.663982
757,-0.614587,-1.904853,0.57104,3.30591
923,-1.062985,3.001474,0.308118,0.998864


## 把绝对值大于 3 的数截断为 3 或 -3(保留符号)

In [17]:
# Cap outliers in place: every value with |x| > 3 becomes 3 with its original
# sign (np.sign gives the sign, multiplied by the limit 3).
data2[data2.abs() > 3] = np.sign(data2) * 3

In [18]:
# Re-check the summary after capping: no column maximum exceeds 3 any more.
data2.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.022885,-0.004983,-0.049288,0.048275
std,1.001172,0.97303,0.960831,0.997597
min,-2.975897,-2.515078,-2.77894,-2.767317
25%,-0.600241,-0.687939,-0.72862,-0.638818
50%,0.01782,-0.064037,-0.023435,0.037827
75%,0.695162,0.695309,0.583707,0.735671
max,3.0,3.0,3.0,3.0


In [19]:
# Element-wise sign of each value, shown for the first five rows.
np.sign(data2).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,1.0,-1.0
1,1.0,-1.0,1.0,1.0
2,1.0,1.0,1.0,-1.0
3,1.0,1.0,1.0,-1.0
4,-1.0,1.0,1.0,1.0


In [20]:
# First five rows of the capped data, for comparison with the signs above.
data2.head()

Unnamed: 0,0,1,2,3
0,0.757615,0.72609,0.593284,-0.18637
1,1.003538,-0.758928,0.702394,0.109721
2,1.468725,0.2153,0.018005,-0.182333
3,0.575547,1.176035,0.091834,-0.577326
4,-0.656265,0.227388,0.183606,1.629515


In [25]:
# Mask everything except the entries whose absolute value is exactly 3, then
# look at row 923: only its capped entry (column 1) is shown, the rest is NaN.
data2[np.abs(data2) == 3].iloc[923]

0    NaN
1    3.0
2    NaN
3    NaN
Name: 923, dtype: float64