<a href="https://colab.research.google.com/github/brew-brew-com/ML-Prep/blob/main/43_Parameter_Tuning_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 不均衡データへの対応

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE



In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
loan_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/input_data/loan_train_data.csv",
                 header=0,
                 dtype={'Dependents':object,
                        'Gender':object,
                        'Married':object,
                        'Education':object,
                        'Self_Employed':object,
                        'Property_Area':object
                       })

In [7]:
#------------------------------------------------------------#
# 説明変数と目的変数の分離
#------------------------------------------------------------#

X_train = loan_data.iloc[:, :-1]    # 最終列以前を説明変数とする
y_train = loan_data.iloc[:, [-1]]   # 最終列を目的変数とする

# display(X_train.head()) ; display(y_train.head()); display(X_train.shape) ; display(y_train.shape); 
# display(X_train.describe()); display(y_train.describe()); display(y_train.groupby(['Loan_Status']).size())

display(X_train.shape)
display(y_train.shape)
display(X_train.head())
display(y_train.head())
display(y_train.groupby(['Loan_Status']).size())

(614, 12)

(614, 1)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


Unnamed: 0,Loan_Status
0,Y
1,N
2,Y
3,Y
4,Y


Loan_Status
N    192
Y    422
dtype: int64

In [8]:
#------------------------------------------------------------#
# データ整形
#------------------------------------------------------------#

X_train = X_train.drop("Loan_ID", axis=1) # Loan_IDはID情報のため削除

class_map = {"N":1, "Y":0}    # 目的変数を数値化（ローン審査でNOとなったデータが1（正例））
y_temp = y_train.copy()
y_temp.loc[:,"Loan_Status"] = y_temp["Loan_Status"].map(class_map)
y_train = y_temp.copy()

display(X_train.shape)
display(y_train.shape)
display(X_train.head())
display(y_train.head())
display(y_train.groupby(['Loan_Status']).size())

(614, 11)

(614, 1)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


Unnamed: 0,Loan_Status
0,0
1,1
2,0
3,0
4,0


Loan_Status
0    422
1    192
dtype: int64

In [9]:
#------------------------------------------------------------#
# One-hot Encoding
#------------------------------------------------------------#

ohe_cols = ['Dependents', 'Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
X_train = pd.get_dummies(X_train, dummy_na=True, columns=ohe_cols)     # dummy_na=Trueで欠損値を補完
# X_train_ohe = X_train.copy()   # OHE後のXを一時保存 => Concatination処理へ

display(X_train.head())
display(X_train.columns)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,Married_nan,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849,0.0,,360.0,1.0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0
1,4583,1508.0,128.0,360.0,1.0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0
3,2583,2358.0,120.0,360.0,1.0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Dependents_0', 'Dependents_1',
       'Dependents_2', 'Dependents_3+', 'Dependents_nan', 'Gender_Female',
       'Gender_Male', 'Gender_nan', 'Married_No', 'Married_Yes', 'Married_nan',
       'Education_Graduate', 'Education_Not Graduate', 'Education_nan',
       'Self_Employed_No', 'Self_Employed_Yes', 'Self_Employed_nan',
       'Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban',
       'Property_Area_nan'],
      dtype='object')

In [13]:
#------------------------------------------------------------#
# 欠損値を補完
#------------------------------------------------------------#

imp = SimpleImputer(missing_values=np.nan, strategy='mean', fill_value=None) # strategy='mean'は平均値での置き換え、axis=0は行平均を計算
imp.fit(X_train)

X_train = pd.DataFrame(imp.transform(X_train), columns=X_train.columns) # transformで置き換え。arrayで返ってしまうため、もう一度データフレーム化

display(X_train.head())
display(X_train.columns)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,Married_nan,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849.0,0.0,146.412162,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Dependents_0', 'Dependents_1',
       'Dependents_2', 'Dependents_3+', 'Dependents_nan', 'Gender_Female',
       'Gender_Male', 'Gender_nan', 'Married_No', 'Married_Yes', 'Married_nan',
       'Education_Graduate', 'Education_Not Graduate', 'Education_nan',
       'Self_Employed_No', 'Self_Employed_Yes', 'Self_Employed_nan',
       'Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban',
       'Property_Area_nan'],
      dtype='object')

In [18]:
#------------------------------------------------------------#
# ランダムアンダーサンプリング
# ランダムオーバーサンプリング
# SMOTE (Synthtic Minority Over-sampling Technique)
#------------------------------------------------------------#

print(y_train.groupby(['Loan_Status']).size())   # 正例、負例の件数を確認

rus = RandomUnderSampler(random_state=0)   # ランダムアンダーサンプリング
ros = RandomOverSampler(random_state=0)    # ランダムオーバーサンプリング
smt = SMOTE(random_state=0)                # SMOTE (Synthtic Minority Over-sampling Technique) - ランダムではなく既存データから合成して増やす          

# X_under, y_under = rus.fit_sample(X_train, y_train.as_matrix().ravel())
# X_over, y_over = ros.fit_sample(X_train, y_train.as_matrix().ravel())
# X_smt, y_smt = ros.fit_sample(X_train, y_train.as_matrix().ravel())

X_under, y_under = rus.fit_sample(X_train, y_train)
X_over, y_over = ros.fit_sample(X_train, y_train)
X_smt, y_smt = ros.fit_sample(X_train, y_train)

# print('Random Under Sampler',Counter(y_under))
# print('Random Over Sampler', Counter(y_over))
# print('SMOTE', Counter(y_smt))

Loan_Status
0    422
1    192
dtype: int64


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [21]:
# 念のため
X_data = X_train.copy()
y_data = y_train.copy()

# holdout
X_train, X_test, y_train, y_test= train_test_split(X_data, y_data, test_size=0.20, random_state=0)

# y_train = y_train.as_matrix().ravel()
# y_test = y_test.as_matrix().ravel()

# resampling
X_train_under, y_train_under = rus.fit_sample(X_train, y_train)
X_train_over, y_train_over = ros.fit_sample(X_train, y_train)
X_train_smt, y_train_smt = smt.fit_sample(X_train, y_train)

# modeling
pipe_gb = Pipeline([('scl',StandardScaler()), ('est',GradientBoostingClassifier(random_state=1))])

# evaluation（fitし直しているのがなんともな。あと、f1 = f masurement score 適合率と再現率の調和平均）
#------------------------------------------------------------#
pipe_gb.fit(X_train,  y_train)
print('Original Train:', f1_score(y_train, pipe_gb.predict(X_train)))
print('Original Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print()
#------------------------------------------------------------#
pipe_gb.fit(X_train_under, y_train_under)
print('Undersampling Train:', f1_score(y_train_under, pipe_gb.predict(X_train_under)))
print('Undersampling Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print()
#------------------------------------------------------------#
pipe_gb.fit(X_train_over, y_train_over)
print('Oversampling Train:', f1_score(y_train_over, pipe_gb.predict(X_train_over)))
print('Oversampling Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print()
#------------------------------------------------------------#
pipe_gb.fit(X_train_smt, y_train_smt)
print('SMOTE Train:', f1_score(y_train_smt, pipe_gb.predict(X_train_smt)))
print('SMOTE Test:', f1_score(y_test, pipe_gb.predict(X_test)))
print()
#------------------------------------------------------------#
#
# Original Train: 0.7940074906367042
# Original Test: 0.6181818181818182#
# 
# Undersampling Train: 0.9411764705882353
# Undersampling Test: 0.6024096385542169
# 
# Oversampling Train: 0.9079365079365079
# Oversampling Test: 0.6086956521739131
# 
# SMOTE Train: 0.8990228013029316
# SMOTE Test: 0.5555555555555556
#
# >>> オリジナルが一番いい感じ
#
#------------------------------------------------------------#

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Original Train: 0.8427672955974843
Original Test: 0.6666666666666667

Undersampling Train: 0.989010989010989
Undersampling Test: 0.6779661016949152

Oversampling Train: 0.9559164733178654
Oversampling Test: 0.5599999999999999

SMOTE Train: 0.9307875894988067
SMOTE Test: 0.6511627906976744

