<a href="https://colab.research.google.com/github/aluqbnle/ml-sandbox/blob/master/12_treatment_of_missing_value.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy
import pandas
from sklearn.impute import SimpleImputer

In [0]:
# Toy roster with deliberately missing entries. Both None and numpy.nan are
# used so that both spellings of "missing" appear in the raw input.
# (numpy.nan is used instead of the numpy.NaN alias, which NumPy 2.0 removed.)
data = {'name': ['Ryo', 'Kaori', 'Hideyuki', 'Hayato', 'Miki', 'Saeko'],  # name
        'gender': ['M', 'F', 'M', 'M', 'F', 'F'],  # gender
        'height': [186, 168, 175, 210, 160, 163],  # height
        'weight': [72, 47, 62, 90, None, numpy.nan],  # weight
        'age': [30, 20, None, numpy.nan, 23, 25],  # age
        'size': ['L', 'M', 'L', 'XL', None, 'S']  # clothing size
        }

In [3]:
# Build the frame from `data`, fixing the column order explicitly
# (the positional column indices used in later cells rely on this order).
df = pandas.DataFrame(
    data,
    columns=['name', 'gender', 'age', 'height', 'weight', 'size'],
)
df

Unnamed: 0,name,gender,age,height,weight,size
0,Ryo,M,30.0,186,72.0,L
1,Kaori,F,20.0,168,47.0,M
2,Hideyuki,M,,175,62.0,L
3,Hayato,M,,210,90.0,XL
4,Miki,F,23.0,160,,
5,Saeko,F,25.0,163,,S


In [4]:
# Show the dtype pandas inferred for each column. In the output below, the
# numeric columns that contain missing values (age, weight) are float64,
# while height, which has none, is int64.
df.dtypes

name       object
gender     object
age       float64
height      int64
weight    float64
size       object
dtype: object

In [5]:
# Boolean mask of the frame: True wherever a cell is missing (None or NaN)
df.isna()

Unnamed: 0,name,gender,age,height,weight,size
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,True,False,False,False
3,False,False,True,False,False,False
4,False,False,False,False,True,True
5,False,False,False,False,True,False


In [6]:
# Count the missing cells in each column (True counts as 1 when summed)
df.isna().sum()

name      0
gender    0
age       2
height    0
weight    2
size      1
dtype: int64

In [7]:
# Drop every row that has at least one missing value (returns a new frame;
# `df` itself is left unchanged)
df.dropna(axis='index', how='any')

Unnamed: 0,name,gender,age,height,weight,size
0,Ryo,M,30.0,186,72.0,L
1,Kaori,F,20.0,168,47.0,M


In [8]:
# Drop every column that has at least one missing value (returns a new frame;
# `df` itself is left unchanged)
df.dropna(axis='columns')

Unnamed: 0,name,gender,height
0,Ryo,M,186
1,Kaori,F,168
2,Hideyuki,M,175
3,Hayato,M,210
4,Miki,F,160
5,Saeko,F,163


In [0]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22. Use the legacy class where it still exists; otherwise
# expose an adapter with the same call signature built on `SimpleImputer`,
# so that `Imputer(...)` calls keep working on current scikit-learn releases.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    from sklearn.impute import SimpleImputer

    def Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True):
        """Create a `SimpleImputer` from legacy `Imputer` arguments.

        Parameters
        ----------
        missing_values : placeholder treated as missing. The legacy string
            'NaN' is translated to a float NaN, which `SimpleImputer` expects.
        strategy : imputation strategy, passed through unchanged.
        axis : only 0 (impute along columns) is supported, because
            `SimpleImputer` always works column-wise.
        verbose : accepted for signature compatibility and ignored.
        copy : passed through unchanged.

        Returns
        -------
        A configured, unfitted `SimpleImputer`.

        Raises
        ------
        ValueError
            If `axis` is not 0.
        """
        if axis != 0:
            raise ValueError(
                "SimpleImputer only imputes along columns (axis=0); got axis=%r" % (axis,)
            )
        if isinstance(missing_values, str) and missing_values == 'NaN':
            missing_values = float('nan')
        return SimpleImputer(missing_values=missing_values, strategy=strategy, copy=copy)

In [10]:
# Imputer for numeric columns: replaces each NaN with the mean of its column.
# `SimpleImputer` supersedes the removed `sklearn.preprocessing.Imputer`; it
# always imputes column-wise (so there is no `axis` argument) and takes
# numpy.nan itself, not the string 'NaN', as the missing-value marker.
imp_num = SimpleImputer(missing_values=numpy.nan, strategy='mean')



In [11]:
# Extract the columns at positions 2 and 4 as a numpy.array.
# With the column order given when `df` was built these are `age` and
# `weight`. `df.values` of this mixed-type frame is an object array, hence
# the dtype=object in the output below.
df.values[:, [2, 4]]

array([[30.0, 72.0],
       [20.0, 47.0],
       [nan, 62.0],
       [nan, 90.0],
       [23.0, nan],
       [25.0, nan]], dtype=object)

In [12]:
# Fit the mean imputer on the numeric columns that contain missing values and
# replace each NaN with its column mean. The columns are selected by name
# rather than by position into `df.values`: that avoids coercing the whole
# mixed-type frame to an object array and does not silently pick the wrong
# columns if the column order ever changes.
imputed_data = imp_num.fit_transform(df[['age', 'weight']])
imputed_data

array([[30.  , 72.  ],
       [20.  , 47.  ],
       [24.5 , 62.  ],
       [24.5 , 90.  ],
       [23.  , 67.75],
       [25.  , 67.75]])

In [13]:
# size2int ("size to int") gives each clothing size an integer code
size2int = dict(S=1, M=2, L=3, XL=4)

# Replace the size labels with their integer codes; missing sizes stay NaN
df['size'] = df['size'].map(size2int)
df

Unnamed: 0,name,gender,age,height,weight,size
0,Ryo,M,30.0,186,72.0,3.0
1,Kaori,F,20.0,168,47.0,2.0
2,Hideyuki,M,,175,62.0,3.0
3,Hayato,M,,210,90.0,4.0
4,Miki,F,23.0,160,,
5,Saeko,F,25.0,163,,1.0


In [14]:
# Imputer for the encoded categorical column: replaces each NaN with the most
# frequent value of its column. `SimpleImputer` supersedes the removed
# `sklearn.preprocessing.Imputer`; it always imputes column-wise (no `axis`
# argument) and takes numpy.nan itself, not the string 'NaN', as the marker.
imp_cat = SimpleImputer(missing_values=numpy.nan, strategy='most_frequent')



In [15]:
# Fill the missing integer-encoded size with the most frequent code.
# The column is selected by name (as a one-column frame, since the imputer
# expects 2-D input) instead of by position into the object-coerced
# `df.values`. The result is only displayed here; `df` is not modified.
imp_cat.fit_transform(df[['size']])

array([[3.],
       [2.],
       [3.],
       [4.],
       [3.],
       [1.]])

In [16]:
# int2size ("int to size") is the inverse lookup of size2int
int2size = dict(zip(size2int.values(), size2int.keys()))

# Turn the integer codes back into size labels; missing sizes stay NaN
df['size'] = df['size'].map(int2size)
df

Unnamed: 0,name,gender,age,height,weight,size
0,Ryo,M,30.0,186,72.0,L
1,Kaori,F,20.0,168,47.0,M
2,Hideyuki,M,,175,62.0,L
3,Hayato,M,,210,90.0,XL
4,Miki,F,23.0,160,,
5,Saeko,F,25.0,163,,S


In [0]:
# Re-type `size` as an ordered categorical with the order S < M < L < XL
df['size'] = pandas.Categorical(
    values=df['size'],
    categories=['S', 'M', 'L', 'XL'],
    ordered=True,
)

In [18]:
# One-hot encode `size`; dummy_na=True adds a `size_nan` indicator column so
# that rows with a missing size are flagged instead of being all zeros
pandas.get_dummies(data=df, columns=['size'], dummy_na=True)

Unnamed: 0,name,gender,age,height,weight,size_S,size_M,size_L,size_XL,size_nan
0,Ryo,M,30.0,186,72.0,0,0,1,0,0
1,Kaori,F,20.0,168,47.0,0,1,0,0,0
2,Hideyuki,M,,175,62.0,0,0,1,0,0
3,Hayato,M,,210,90.0,0,0,0,1,0
4,Miki,F,23.0,160,,0,0,0,0,1
5,Saeko,F,25.0,163,,1,0,0,0,0
