<a href="https://colab.research.google.com/github/brew-brew-com/ML-Prep/blob/main/32_Preprocessing_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# 01.Training Data (Scoring Data) Preparation Process

In [3]:
loan_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/input_data/loan_train_data.csv",
                 header=0,
                 dtype={'Dependents':object,
                        'Gender':object,
                        'Married':object,
                        'Education':object,
                        'Self_Employed':object,
                        'Property_Area':object
                       })

In [4]:
#------------------------------------------------------------#
# 説明変数と目的変数の分離
#------------------------------------------------------------#

X_train = loan_data.iloc[:, :-1]    # 最終列以前を説明変数とする
y_train = loan_data.iloc[:, [-1]]   # 最終列を目的変数とする

# display(X_train.head()) ; display(y_train.head()); display(X_train.shape) ; display(y_train.shape); 
# display(X_train.describe()); display(y_train.describe()); display(y_train.groupby(['Loan_Status']).size())

display(X_train.shape)
display(y_train.shape)
display(X_train.head())
display(y_train.head())
display(y_train.groupby(['Loan_Status']).size())

(614, 12)

(614, 1)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


Unnamed: 0,Loan_Status
0,Y
1,N
2,Y
3,Y
4,Y


Loan_Status
N    192
Y    422
dtype: int64

In [5]:
#------------------------------------------------------------#
# データ整形
#------------------------------------------------------------#

X_train = X_train.drop("Loan_ID", axis=1) # Loan_IDはID情報のため削除

class_map = {"N":1, "Y":0}    # 目的変数を数値化（ローン審査でNOとなったデータが1（正例））
y_temp = y_train.copy()
y_temp.loc[:,"Loan_Status"] = y_temp["Loan_Status"].map(class_map)
y_train = y_temp.copy()

display(X_train.shape)
display(y_train.shape)
display(X_train.head())
display(y_train.head())
display(y_train.groupby(['Loan_Status']).size())

(614, 11)

(614, 1)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


Unnamed: 0,Loan_Status
0,0
1,1
2,0
3,0
4,0


Loan_Status
0    422
1    192
dtype: int64

In [6]:
#------------------------------------------------------------#
# One-hot Encoding
#------------------------------------------------------------#

ohe_cols = ['Dependents', 'Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
X_train = pd.get_dummies(X_train, dummy_na=True, columns=ohe_cols)     # dummy_na=Trueで欠損値を補完
X_train_ohe = X_train.copy()   # OHE後のXを一時保存 => Concatination処理へ

display(X_train.head())
display(X_train.columns)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,Married_nan,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849,0.0,,360.0,1.0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0
1,4583,1508.0,128.0,360.0,1.0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0
3,2583,2358.0,120.0,360.0,1.0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Dependents_0', 'Dependents_1',
       'Dependents_2', 'Dependents_3+', 'Dependents_nan', 'Gender_Female',
       'Gender_Male', 'Gender_nan', 'Married_No', 'Married_Yes', 'Married_nan',
       'Education_Graduate', 'Education_Not Graduate', 'Education_nan',
       'Self_Employed_No', 'Self_Employed_Yes', 'Self_Employed_nan',
       'Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban',
       'Property_Area_nan'],
      dtype='object')

In [7]:
#------------------------------------------------------------#
# 欠損値を補完
#------------------------------------------------------------#

# imp = Imputer(missing_values='NaN', strategy='mean', axis=0) # strategy='mean'は平均値での置き換え、axis=0は行平均を計算
imp = SimpleImputer(missing_values=np.nan, strategy='mean', fill_value=None)
imp.fit(X_train)

X_train = pd.DataFrame(imp.transform(X_train), columns=X_train.columns) # transformで置き換え。arrayで返ってしまうため、もう一度データフレーム化

display(X_train.head())
display(X_train.columns)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,Married_nan,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849.0,0.0,146.412162,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Dependents_0', 'Dependents_1',
       'Dependents_2', 'Dependents_3+', 'Dependents_nan', 'Gender_Female',
       'Gender_Male', 'Gender_nan', 'Married_No', 'Married_Yes', 'Married_nan',
       'Education_Graduate', 'Education_Not Graduate', 'Education_nan',
       'Self_Employed_No', 'Self_Employed_Yes', 'Self_Employed_nan',
       'Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban',
       'Property_Area_nan'],
      dtype='object')

In [8]:
#------------------------------------------------------------#
# 特徴量選択（RFE: Recursive Feature Elimination） 再帰的特徴消去
#------------------------------------------------------------#

# rfe = RFE(estimator=RandomForestClassifier(random_state=0), n_features_to_select=10, step=0.05)
rfe = RFECV(estimator=RandomForestClassifier(random_state=0), step=0.05)
# rfe.fit(X_train, y_train.as_matrix().ravel())
rfe.fit(X_train, y_train)

# support_は採用した特徴量のプロパティ
display(rfe.support_) 

X_train = pd.DataFrame(rfe.transform(X_train), columns=X_train.columns[rfe.support_])  # transformで置き換え。arrayで返ってしまうため、もう一度データフレーム化

display(X_train.head())
display(X_train.columns)

  y = column_or_1d(y, warn=True)


array([ True,  True,  True,  True,  True,  True,  True, False, False,
       False, False,  True, False,  True, False, False,  True, False,
       False,  True, False, False,  True,  True, False, False])

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Gender_Male,Married_No,Education_Graduate,Self_Employed_No,Property_Area_Rural,Property_Area_Semiurban
0,5849.0,0.0,146.412162,360.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Dependents_0', 'Dependents_1',
       'Gender_Male', 'Married_No', 'Education_Graduate', 'Self_Employed_No',
       'Property_Area_Rural', 'Property_Area_Semiurban'],
      dtype='object')

In [9]:
#------------------------------------------------------------#
# 次元圧縮（PCA：Principal Component Aanalysis） 主成分分析
#------------------------------------------------------------#

# pca = PCA(n_components=10, random_state=1)
# pca = PCA(random_state=1)
# pca.fit(X_train, y_train.as_matrix().ravel())

# X_train = pd.DataFrame(pca.transform(X_train), columns=X_train.columns) 

# display(X_train.head())
# display(X_train.columns)

# 02.Test Data (Modeling Data) Preparation Process

In [10]:
loan_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/input_data/loan_test_data.csv",
                 header=0,
                 dtype={'Dependents':object,
                        'Gender':object,
                        'Married':object,
                        'Education':object,
                        'Self_Employed':object,
                        'Property_Area':object
                       })

In [11]:
#------------------------------------------------------------#
# 説明変数と目的変数の分離
#------------------------------------------------------------#

X_test = loan_data.iloc[:, :]    # 説明変数をXへ
# テストデータなので目的変数yは存在しない

display(X_test.shape)
display(X_test.head())

# display(X_test.head()) ; display(X_test.describe()); display(X_tests.shape)

(333, 12)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [12]:
#------------------------------------------------------------#
# データ整形
#------------------------------------------------------------#

X_test = X_test.drop("Loan_ID", axis=1) # Loan_IDはID情報のため削除

display(X_test.shape)
display(X_test.head())

(333, 11)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [13]:
#------------------------------------------------------------#
# One-hot Encoding
#------------------------------------------------------------#

cols_ohe = ['Dependents', 'Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
X_test = pd.get_dummies(X_test, dummy_na=True, columns=cols_ohe)     # dummy_na=Trueで欠損値を補完
X_test_ohe = X_test.copy()   # OHE後のXを一時保存 => Concatination処理へ

display(X_test.head())
display(X_test.columns)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_nan,Gender_Female,Gender_Male,Gender_Unknown,Gender_nan,Married_No,Married_Yes,Married_nan,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5720,0,110.0,360.0,1.0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0
1,3076,1500,126.0,360.0,1.0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0
2,5000,1800,208.0,360.0,1.0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0
3,2340,2546,100.0,360.0,,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0
4,3276,0,78.0,360.0,1.0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Dependents_0', 'Dependents_1',
       'Dependents_2', 'Dependents_nan', 'Gender_Female', 'Gender_Male',
       'Gender_Unknown', 'Gender_nan', 'Married_No', 'Married_Yes',
       'Married_nan', 'Education_Graduate', 'Education_Not Graduate',
       'Education_nan', 'Self_Employed_No', 'Self_Employed_Yes',
       'Self_Employed_nan', 'Property_Area_Rural', 'Property_Area_Semiurban',
       'Property_Area_Urban', 'Property_Area_nan'],
      dtype='object')

In [14]:
#------------------------------------------------------------#
# 特徴量の差分を確認
#------------------------------------------------------------#

diff1 = set(X_train_ohe.columns.values) - set(X_test_ohe.columns.values)   # 学習データのみに存在するカラム
diff2 = set(X_test_ohe.columns.values) - set(X_train_ohe.columns.values)   # テストデータのみに存在するカラム

display('学習データにのみ存在: %s' % diff1)
display('テストデータにのみ存在: %s' % diff2)

# 学習データにのみ存在する項目は、テストデータに追加...①
# テストデータにのみ存在する項目は、仕方がないので削除...②

"学習データにのみ存在: {'Dependents_3+'}"

"テストデータにのみ存在: {'Gender_Unknown'}"

In [15]:
#------------------------------------------------------------#
# ①必要なカラムを連結
#------------------------------------------------------------#

fillna_target  = set(X_train_ohe.columns) - set(X_test_ohe.columns)             # 連結するとデータがNaNになるので、0 Paddingするために項目を抽出しておく

cols_train_ohe = pd.DataFrame(None, columns=X_train_ohe.columns, dtype=object)   # Train Dataのヘッダーを用意
X_test_ohe = pd.concat([cols_train_ohe, X_test_ohe])                            # Train DataのヘッダーにTest DataのDetailを連結

X_test_ohe.loc[:, list(fillna_target)] = X_test_ohe.loc[:, list(fillna_target)].fillna(0, axis=1)   # 列方向で0 Padding

display(X_test_ohe.head())
display(X_test_ohe.columns)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,Married_nan,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan,Gender_Unknown
0,5720,0,110.0,360.0,1.0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0.0
1,3076,1500,126.0,360.0,1.0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0.0
2,5000,1800,208.0,360.0,1.0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0.0
3,2340,2546,100.0,360.0,,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0.0
4,3276,0,78.0,360.0,1.0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0.0


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Dependents_0', 'Dependents_1',
       'Dependents_2', 'Dependents_3+', 'Dependents_nan', 'Gender_Female',
       'Gender_Male', 'Gender_nan', 'Married_No', 'Married_Yes', 'Married_nan',
       'Education_Graduate', 'Education_Not Graduate', 'Education_nan',
       'Self_Employed_No', 'Self_Employed_Yes', 'Self_Employed_nan',
       'Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban',
       'Property_Area_nan', 'Gender_Unknown'],
      dtype='object')

In [16]:
#------------------------------------------------------------#
# ②不要なカラムを削除
#------------------------------------------------------------#

drop_target  = set(X_test_ohe.columns) - set(X_train_ohe.columns)   # テストデータのみに存在する項目を抽出

X_test_ohe = X_test_ohe.drop(list(drop_target), axis=1)   # 列方向で削除

display(X_test_ohe.head())
display(X_test_ohe.columns)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,Married_nan,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5720,0,110.0,360.0,1.0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0
1,3076,1500,126.0,360.0,1.0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0
2,5000,1800,208.0,360.0,1.0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0
3,2340,2546,100.0,360.0,,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0
4,3276,0,78.0,360.0,1.0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Dependents_0', 'Dependents_1',
       'Dependents_2', 'Dependents_3+', 'Dependents_nan', 'Gender_Female',
       'Gender_Male', 'Gender_nan', 'Married_No', 'Married_Yes', 'Married_nan',
       'Education_Graduate', 'Education_Not Graduate', 'Education_nan',
       'Self_Employed_No', 'Self_Employed_Yes', 'Self_Employed_nan',
       'Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban',
       'Property_Area_nan'],
      dtype='object')

In [17]:
#------------------------------------------------------------#
# カラム順を揃える
#------------------------------------------------------------#

X_test_ohe = X_test_ohe.reindex(X_train_ohe.columns, axis=1)   # テストデータの列の並びを学習データに揃える
X_test = X_test_ohe.copy()

display(X_test.head())
display(X_test.columns)
display(X_train.columns)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,Married_nan,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5720,0,110.0,360.0,1.0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0
1,3076,1500,126.0,360.0,1.0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0
2,5000,1800,208.0,360.0,1.0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0
3,2340,2546,100.0,360.0,,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0
4,3276,0,78.0,360.0,1.0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Dependents_0', 'Dependents_1',
       'Dependents_2', 'Dependents_3+', 'Dependents_nan', 'Gender_Female',
       'Gender_Male', 'Gender_nan', 'Married_No', 'Married_Yes', 'Married_nan',
       'Education_Graduate', 'Education_Not Graduate', 'Education_nan',
       'Self_Employed_No', 'Self_Employed_Yes', 'Self_Employed_nan',
       'Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban',
       'Property_Area_nan'],
      dtype='object')

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Dependents_0', 'Dependents_1',
       'Gender_Male', 'Married_No', 'Education_Graduate', 'Self_Employed_No',
       'Property_Area_Rural', 'Property_Area_Semiurban'],
      dtype='object')

In [18]:
#------------------------------------------------------------#
# 欠損値を補完
#------------------------------------------------------------#

# imp = Imputer(missing_values='NaN', strategy='mean', axis=0) # strategy='mean'は平均値での置き換え、axis=0は行方向での平均を計算
imp = SimpleImputer(missing_values=np.nan, strategy='mean', fill_value=None)
imp.fit(X_test)

X_test = pd.DataFrame(imp.transform(X_test), columns=X_test.columns) # transformで置き換え。arrayで返ってしまうため、もう一度データフレーム化
    
display(X_test.head())

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,Married_nan,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5720.0,0.0,110.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3076.0,1500.0,126.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,5000.0,1800.0,208.0,360.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2340.0,2546.0,100.0,360.0,0.831715,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3276.0,0.0,78.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
#------------------------------------------------------------#
# 特徴量選択（RFE: Recursive Feature Elimination） 再帰的特徴消去
#------------------------------------------------------------#

# support_は採用した特徴量のプロパティ
# display(rfe.support_) 

X_test = pd.DataFrame(rfe.transform(X_test), columns=X_test.columns[rfe.support_])    # RFEのモデルはTrainのものを適用する

display(X_test.head())
display(X_test.columns)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Gender_Male,Married_No,Education_Graduate,Self_Employed_No,Property_Area_Rural,Property_Area_Semiurban
0,5720.0,0.0,110.0,360.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1,3076.0,1500.0,126.0,360.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
2,5000.0,1800.0,208.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,2340.0,2546.0,100.0,360.0,0.831715,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4,3276.0,0.0,78.0,360.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Dependents_0', 'Dependents_1',
       'Gender_Male', 'Married_No', 'Education_Graduate', 'Self_Employed_No',
       'Property_Area_Rural', 'Property_Area_Semiurban'],
      dtype='object')

In [20]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

display(X_train.head())
display(y_train.head())
display(X_test.head())

# X_train.to_csv("/content/drive/MyDrive/Colab Notebooks/temp_data/X_train.csv")
# y_train.to_csv("/content/drive/MyDrive/Colab Notebooks/temp_data/temp_data/y_train.csv")
# X_test.to_csv("/content/drive/MyDrive/Colab Notebooks/temp_data/temp_data/X_test.csv")

(614, 13)
(614, 1)
(333, 13)


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Gender_Male,Married_No,Education_Graduate,Self_Employed_No,Property_Area_Rural,Property_Area_Semiurban
0,5849.0,0.0,146.412162,360.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0


Unnamed: 0,Loan_Status
0,0
1,1
2,0
3,0
4,0


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Gender_Male,Married_No,Education_Graduate,Self_Employed_No,Property_Area_Rural,Property_Area_Semiurban
0,5720.0,0.0,110.0,360.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1,3076.0,1500.0,126.0,360.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
2,5000.0,1800.0,208.0,360.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,2340.0,2546.0,100.0,360.0,0.831715,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4,3276.0,0.0,78.0,360.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0


In [21]:
#------------------------------------------------------------#
# 次元圧縮（PCA：Principal Component Aanalysis） 主成分分析
#------------------------------------------------------------#

# 次元圧縮はモデリングのパイプラインの中で

# X_test = pd.DataFrame(pca.transform(X_test), columns=X_test.columns)    # PCAのモデルはTrainのものを適用する

# display(X_train.head())
# display(X_train.columns)

# 03.Modeling & Prediction Process

In [22]:
#------------------------------------------------------------#
# Prediction（未完）
#------------------------------------------------------------#

pipe_knn = Pipeline([('scl', StandardScaler()), ('est', KNeighborsClassifier())])
pipe_logistic = Pipeline([('scl', StandardScaler()), ('est', LogisticRegression(random_state=1))])
pipe_rf = Pipeline([('scl', StandardScaler()), ('est', RandomForestClassifier(random_state=1))])
pipe_gb = Pipeline([('scl', StandardScaler()), ('est', GradientBoostingClassifier(random_state=1))])
pipe_mlp = Pipeline([('scl', StandardScaler()), ('est', MLPClassifier(hidden_layer_sizes=(5, 3), random_state=1))])

pipe_names = ['KNN', 'Logistic', 'Random Forest', 'Gradient Boosting', 'MLP']
pipe_lines = [pipe_knn, pipe_logistic, pipe_rf, pipe_gb, pipe_mlp]

# for (i, pipe) in enumerate(pipe_lines):
#    pipe.fit(X_train, y_train)
#    print('%s: %.3f'%(pipe_names[i],accuracy_score(y_test, pipe.predict(X_test))))