In [1]:
# -*- coding:utf-8 -*- #

### 6.2.1  缺失值處理

In [2]:
import pandas as pd
import numpy as np

dic = {   
     'state': ['Ohio', 'Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
     'year': [2000, 2000, 2001, 2002, 2003, 3456],
     'score': [1.5, 1.5, 1.7, np.nan, np.nan, 8.3],
     'desc': [np.nan, np.nan, np.nan, np.nan, np.nan, 3],
     'val1': [1, 1, 0, '+', 0, 1],
}
data = pd.DataFrame(dic)

print(data['desc'].nunique()) # 不同取值個數
print(data['desc'].unique()) # 不同取值列表
print(data['year'].value_counts()) # 不同取值出現次數

1
[nan  3.]
2000    2
3456    1
2003    1
2002    1
2001    1
Name: year, dtype: int64


In [3]:
print(data['desc'].isnull()) # 是否缺失
print(data['desc'].isnull().any()) # 是否含有任意缺失
print(data['desc'].isnull().all()) # 是否全部缺失
print(data['desc'].isnull().sum(), len(data)) # 空值個數與記錄個數
print(data.dropna(axis=1, how='all'))
print(data['score'].fillna(data['score'].mean()))
print(data['score'].fillna(method='ffill', limit=1))

print(data.interpolate(mdthod='polynomial', order=2)) # 二次多項式插值
print(data.interpolate(mdthod='spline', order=3)) # 三次樣條插值

0     True
1     True
2     True
3     True
4     True
5    False
Name: desc, dtype: bool
True
False
5 6
    state  year  score  desc val1
0    Ohio  2000    1.5   NaN    1
1    Ohio  2000    1.5   NaN    1
2    Ohio  2001    1.7   NaN    0
3    Ohio  2002    NaN   NaN    +
4  Nevada  2003    NaN   NaN    0
5  Nevada  3456    8.3   3.0    1
0    1.50
1    1.50
2    1.70
3    3.25
4    3.25
5    8.30
Name: score, dtype: float64
0    1.5
1    1.5
2    1.7
3    1.7
4    NaN
5    8.3
Name: score, dtype: float64
    state  year  score  desc val1
0    Ohio  2000    1.5   NaN    1
1    Ohio  2000    1.5   NaN    1
2    Ohio  2001    1.7   NaN    0
3    Ohio  2002    3.9   NaN    +
4  Nevada  2003    6.1   NaN    0
5  Nevada  3456    8.3   3.0    1
    state  year  score  desc val1
0    Ohio  2000    1.5   NaN    1
1    Ohio  2000    1.5   NaN    1
2    Ohio  2001    1.7   NaN    0
3    Ohio  2002    3.9   NaN    +
4  Nevada  2003    6.1   NaN    0
5  Nevada  3456    8.3   3.0    1


In [4]:
from sklearn.preprocessing import Imputer
imp =Imputer(missing_values="NaN", strategy="most_frequent",axis=0 )
data["score"]=imp.fit_transform(data[["score"]])

print(data['score'].fillna(-1))

0    1.5
1    1.5
2    1.7
3    1.5
4    1.5
5    8.3
Name: score, dtype: float64




### 6.2.2  異常值處理

In [5]:
print(data.query('year<2050'))
print(data[data['year']<2050])

data['val1'] = data['val1'].apply(lambda x: 1 if x == '+' else x)

    state  year  score  desc val1
0    Ohio  2000    1.5   NaN    1
1    Ohio  2000    1.5   NaN    1
2    Ohio  2001    1.7   NaN    0
3    Ohio  2002    1.5   NaN    +
4  Nevada  2003    1.5   NaN    0
    state  year  score  desc val1
0    Ohio  2000    1.5   NaN    1
1    Ohio  2000    1.5   NaN    1
2    Ohio  2001    1.7   NaN    0
3    Ohio  2002    1.5   NaN    +
4  Nevada  2003    1.5   NaN    0


### 6.2.3  去重處理

In [6]:
print(data.drop_duplicates(keep='last'))
print(data.drop_duplicates(keep='last', subset='year'))

    state  year  score  desc  val1
1    Ohio  2000    1.5   NaN     1
2    Ohio  2001    1.7   NaN     0
3    Ohio  2002    1.5   NaN     1
4  Nevada  2003    1.5   NaN     0
5  Nevada  3456    8.3   3.0     1
    state  year  score  desc  val1
1    Ohio  2000    1.5   NaN     1
2    Ohio  2001    1.7   NaN     0
3    Ohio  2002    1.5   NaN     1
4  Nevada  2003    1.5   NaN     0
5  Nevada  3456    8.3   3.0     1
