## Missing data handling

In [43]:
import numpy as np
import pandas as pd
from sklearn import datasets
from io import StringIO

In [44]:
# Small in-memory CSV used to demonstrate missing-value handling.
# Blank fields are deliberate: row 0 has no D, row 1 has no C, and row 3 has
# neither B nor E. The data rows are indented inside the literal, so that
# whitespace is part of the string that gets parsed.
csv_data = '''A,B,C,D,E
            5.0,2.0,3.0,,6
            1.0,6.0,,8.0,5
            0.0,11.0,12.0,4.0,5
            3.0,,3.0,5.0,
            5.0,1.0,4.0,2.0,4
           '''

In [45]:
# Wrap the CSV text in a file-like buffer so pandas can parse it as if it
# were a file on disk.
csv_buffer = StringIO(csv_data)
data = pd.read_csv(csv_buffer)

In [46]:
# Show the parsed frame; the blank CSV fields appear as missing values (NaN).
data

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [48]:
# Drop every row that contains at least one missing value (the default,
# how='any'). With how='all' a row is dropped only when all of its values
# are missing. The result is displayed only; `data` is not modified.
data.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D,E
2,0.0,11.0,12.0,4.0,5.0
4,5.0,1.0,4.0,2.0,4.0


In [50]:
# Restrict the missing-value check to the listed column(s): only rows whose
# D value is missing are dropped.
data.dropna(axis='index', subset=['D'])

Unnamed: 0,A,B,C,D,E
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [55]:
# Replace every missing value in the frame with the constant x.
# Only the returned copy is displayed; `data` itself keeps its NaNs.
x = 5
data.fillna(value=x)

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,5.0,6.0
1,1.0,6.0,5.0,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,5.0
4,5.0,1.0,4.0,2.0,4.0


In [58]:
# Impute the missing values of column B with the column mean and write the
# result back into `data`.
# Other common statistics to impute with: mode() (most frequent value),
# median(), min().
b_mean = data['B'].mean()
data['B'] = data['B'].fillna(b_mean)
data

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


## Categorical Data

In [79]:
# Toy product table: `color` is a nominal feature, `size` an ordinal one,
# `price` is numeric and `classlabel` is the target.
df2 = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 1],
        ['red', 'L', 13.5, 2],
        ['blue', 'XL', 15.3, 1],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

In [80]:
# Encode the ordinal `size` feature as integers that preserve the ordering
# M < L < XL.
# Series.map yields NaN for values that are not keys of the mapping, so
# running this cell a second time (on the already-mapped integers) would
# turn the whole column into NaN.
size_mapping = dict(M=1, L=2, XL=3)
df2['size'] = df2['size'].map(size_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,2
2,blue,3,15.3,1


In [81]:
# Preview the one-hot encoding of the nominal `color` column: one indicator
# column per distinct value, named after the raw value.
pd.get_dummies(data=df2['color'])

Unnamed: 0,blue,green,red
0,0,1,0
1,0,0,1
2,1,0,0


In [82]:
# One-hot encode the nominal `color` column (indicator columns are prefixed
# with "color_"), put the indicators in front of the existing columns, then
# remove the original string column.
onehot_encoding = pd.get_dummies(df2['color'], prefix='color')
df2 = pd.concat([onehot_encoding, df2], axis=1)
# Name the axis via `columns=`: passing it positionally (`drop('color', 1)`)
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df2 = df2.drop(columns='color')

In [83]:
# Show the frame after encoding: `color` has been replaced by the color_*
# indicator columns.
df2

Unnamed: 0,color_blue,color_green,color_red,size,price,classlabel
0,0,1,0,1,10.1,1
1,0,0,1,2,13.5,2
2,1,0,0,3,15.3,1


## feature scaling

### normalization  0-1縮放

In [84]:
from IPython.display import Math

In [85]:
# Render the min-max normalization formula: subtract the feature minimum and
# divide by the feature range (max - min).
Math(r'x^{(i)}_{norm}=\frac{x^{(i)}-x_{min}}{x_{max}-x_{min}}')

<IPython.core.display.Math object>

In [88]:
# Load the iris dataset and assemble features and target into one frame.
iris = datasets.load_iris()
x = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# NOTE: despite its name, the 'target_names' column holds the values of
# iris['target'], not the entries of iris['target_names'].
y = pd.DataFrame({'target_names': iris.target})
iris_data = pd.concat(objs=[x, y], axis='columns')
iris_data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target_names
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [90]:
# Min-max normalize sepal length in place: (value - min) / (max - min).
sepal_length = iris_data['sepal length (cm)']
length_min = sepal_length.min()
length_max = sepal_length.max()
iris_data['sepal length (cm)'] = (sepal_length - length_min) / (length_max - length_min)

In [92]:
# Peek at the first three normalized sepal-length values.
iris_data['sepal length (cm)'].head(n=3)

0    0.222222
1    0.166667
2    0.111111
Name: sepal length (cm), dtype: float64

### Standardization
#### 不易受OUTLIER影響
After standardization the feature has mean = 0 and std = 1.

In [93]:
# Render the standardization (z-score) formula: subtract the feature mean and
# divide by the feature standard deviation.
Math(r'x^{(i)}_{std}=\frac{x^{(i)}-\mu_{x}}{\sigma_{x}}')

<IPython.core.display.Math object>

In [94]:
# Standardize sepal width in place: (value - mean) / std.
# pandas' Series.std() uses the sample standard deviation (ddof=1) by default.
sepal_width = iris_data['sepal width (cm)']
width_mean = sepal_width.mean()
width_std = sepal_width.std()
iris_data['sepal width (cm)'] = (sepal_width - width_mean) / width_std

In [96]:
# Peek at the first three standardized sepal-width values.
iris_data['sepal width (cm)'].head(n=3)

0    1.028611
1   -0.124540
2    0.336720
Name: sepal width (cm), dtype: float64