<a href="https://colab.research.google.com/github/brew-brew-com/ML-Prep/blob/main/31_Preprocessing_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing：one-hot エンコーディング・欠損値処理

## データの意味づけ、不要な特徴量の削除、one-hot encoding、欠損値補完

In [1]:
# Mount Google Drive at /content/drive so later cells can read files from it by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Load the loan dataset and preview its first rows.

import pandas as pd

# Location of the input CSV under the mounted /content/drive directory.
LOAN_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/input_data/loan_data.csv"

loan_data = pd.read_csv(LOAN_CSV_PATH)
display(loan_data.head(5))

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# Assign roles to the columns of the loan data.

# Every column except the last one is treated as a feature (X).
X_01 = loan_data.iloc[:, :-1]
# The last column is the label (y); the list indexer [-1] keeps it a one-column DataFrame.
y_01 = loan_data.iloc[:, [-1]]

X_02, y_02 = pd.DataFrame(X_01), pd.DataFrame(y_01)

# Preview features and label side by side.
display(X_02.join(y_02).head(5))

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Remove features that are of no use for modelling.

# Loan_ID is only a record identifier, so it is excluded from the feature set.
X_03 = X_02.drop(columns=['Loan_ID'])
y_03 = y_02.copy()

display(X_03.join(y_03).head(5))

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
# Encode the label so that rejected loans ('N') are the positive class (1)
# and approved loans ('Y') are the negative class (0).

class_mapping = {'N': 1, 'Y': 0}

X_04 = X_03.copy()
y_04 = y_03.copy()
# Replace the whole column rather than writing through `.loc[:, 'Loan_Status']`.
# Recent pandas versions set values in place with `.loc`, which keeps the
# original object dtype of the 'Y'/'N' column; plain column assignment lets the
# column take the numeric dtype of the mapped values, which scikit-learn
# classifiers require for y.
y_04['Loan_Status'] = y_04['Loan_Status'].map(class_mapping)

display(X_04.join(y_04).head(5))

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,0
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,1
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,0
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,0
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,0


GenderやMarriedのようなカテゴリ変数を0/1のバイナリ変数に変換することが<b>one-hotエンコーディング</b>、LoanAmountの1行目のような欠損値を平均値などで置き換えるのが<b>欠損値補完</b>。

In [6]:
# One-hot encode the categorical columns.
# dummy_na=True adds an extra `<column>_nan` indicator column per encoded
# feature, so a missing category is represented explicitly. Missing values in
# the numeric columns (e.g. LoanAmount) are NOT handled here.

ohe_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

# dtype='uint8' pins the indicator columns to 0/1 integers. It was the default
# in older pandas; newer pandas defaults to bool, which would display
# True/False instead of the 0/1 values this notebook shows.
X_05 = pd.get_dummies(X_04, dummy_na=True, columns=ohe_cols, dtype='uint8')
y_05 = y_04.copy()

display(X_05.join(y_05).head(5))

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,Married_nan,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan,Loan_Status
0,5849,0.0,,360.0,1.0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0
1,4583,1508.0,128.0,360.0,1.0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1
2,3000,0.0,66.0,360.0,1.0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0
3,2583,2358.0,120.0,360.0,1.0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0
4,6000,0.0,141.0,360.0,1.0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0


それでは連続変数の欠損値の平均値補完の実行です。sklearn.imputeモジュールからSimpleImputerを読み込みます（以前のsklearn.preprocessing.Imputerはscikit-learn 0.20でSimpleImputerに置き換えられました）。SimpleImputerクラスのメソッドtransformを適用することで、LoanAmountの欠損値（1行目など）を、NaNから平均値（146.412162）に置き換えることができます。

In [7]:
# sklearn.preprocessing.Imputer was replaced by sklearn.impute.SimpleImputer in
# scikit-learn 0.20 and has since been removed.
from sklearn.impute import SimpleImputer
import numpy as np

# Replace missing values (NaN) with the mean of each column.
# strategy='mean' means "fill with the mean". SimpleImputer always works per
# column; unlike the old Imputer it has no `axis` parameter. `fill_value` is
# only used with strategy='constant', so it is left at its default.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_05)

# Apply the fitted imputer to fill in the missing values.

X_05_cols = X_05.columns.values
# transform() returns a NumPy array, so wrap it in a DataFrame again.
# Column names and the row index of X_05 are carried over so the join with
# y_06 below aligns row by row.
X_06 = pd.DataFrame(imp.transform(X_05), columns=X_05_cols, index=X_05.index)
y_06 = y_05.copy()

display(X_06.join(y_06).head(5))

print('X shape: (%i, %i)' % X_06.shape)
print('y shape: (%i, %i)' % y_06.shape)
print()
print(y_06.groupby(['Loan_Status']).size())

# Note that LoanAmount in row 0 has been replaced by the column mean.
# After one-hot encoding the feature matrix has 26 columns.

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,Married_nan,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan,Loan_Status
0,5849.0,0.0,146.412162,360.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1
2,3000.0,0.0,66.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
3,2583.0,2358.0,120.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,6000.0,0.0,141.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0


X shape: (614, 26)
y shape: (614, 1)

Loan_Status
0    422
1    192
dtype: int64


# Preprocessing：次元圧縮（RFE&PCA)

## 後ほど記述（2018/07/07）

In [8]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination (RFE):
# - a RandomForestClassifier estimates the importance of each feature
# - 10 features are kept at the end
# - step=0.05 removes 5% of the features per elimination step

# Alternative: RFECV chooses the number of features to keep automatically,
# so n_features_to_select does not have to be fixed at 10.
# selector = RFECV(estimator=RandomForestClassifier(random_state=0), step=0.05)

selector = RFE(estimator=RandomForestClassifier(random_state=0), n_features_to_select=10, step=0.05)
# y_06 is a one-column DataFrame; flatten it to a 1-D array so that fit() does
# not emit a DataConversionWarning about a column-vector y.
selector.fit(X_06, y_06.values.ravel())

# support_ is a boolean mask over the input columns: True = selected, False = dropped.
print(selector.support_)

  y = column_or_1d(y, warn=True)


[ True  True  True  True  True False False False  True False False  True
 False False False False False  True False False False False  True  True
 False False]


In [9]:
# Keep only the columns chosen by the selector and restore their names.
selected_values = selector.transform(X_06)
selected_names = X_05_cols[selector.support_]
X_new_selected = pd.DataFrame(selected_values, columns=selected_names)

separator = '---------------------------------------'

print(separator)
print('X shape after RFE:', X_new_selected.shape)
print(separator)
print(X_new_selected.dtypes)
display(X_new_selected.head())


---------------------------------------
X shape after RFE: (614, 10)
---------------------------------------
ApplicantIncome            float64
CoapplicantIncome          float64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
Married_No                 float64
Dependents_0               float64
Education_Not Graduate     float64
Property_Area_Rural        float64
Property_Area_Semiurban    float64
dtype: object


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Married_No,Dependents_0,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban
0,5849.0,0.0,146.412162,360.0,1.0,1.0,1.0,0.0,0.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,0.0,0.0,1.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,0.0,1.0,1.0,0.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,1.0,1.0,0.0,0.0,0.0
