In [9]:
!${HOME}
!${PWD}
!ls -l

/bin/sh: 1: /root: Permission denied
/bin/sh: 1: /kaggle/working: Permission denied
total 36
---------- 1 root root 34391 May  6 08:14 __notebook_source__.ipynb


In [10]:
import os
import argparse
import numpy as np
import pandas as pd
import random
import warnings
import json
import yaml
from matplotlib import pyplot as plt
import seaborn as sns
import gc
#from kaggle.api.kaggle_api_extended import KaggleApi

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures


# 機械学習モデル
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost


In [11]:
parser = argparse.ArgumentParser()
parser.add_argument("--exper_name", default="single_model", help="実験名")
parser.add_argument("--dataset_dir", type=str, default="../input/home-credit-default-risk")
parser.add_argument("--results_dir", type=str, default="../output/kaggle/working")
parser.add_argument("--submit_file", type=str, default="submission.csv")
parser.add_argument("--competition_id", type=str, default="home-credit-default-risk")
parser.add_argument("--classifier", choices=["logistic", "knn", "svm", "random_forest", "bagging", "adaboost", "xgboost", "lightgbm", "catboost", "mlp"], default="catboost", help="分類器モデルの種類")
parser.add_argument('--save_checkpoints_dir', type=str, default="checkpoints", help="モデルの保存ディレクトリ")
parser.add_argument("--params_file", type=str, default="")
parser.add_argument('--load_checkpoints_paths', action='append', help="モデルの読み込みファイルのパス")
parser.add_argument("--train_mode", choices=["train", "test", "eval"], default="train", help="")
parser.add_argument('--gdbt_train_type', choices=['train', 'fit'], default="fit", help="GDBTの学習タイプ")
parser.add_argument("--n_splits", type=int, default=4, help="CV での学習用データセットの分割数")
parser.add_argument('--onehot_encode', action='store_false')
parser.add_argument("--seed", type=int, default=71)
parser.add_argument('--submit', action='store_true')
parser.add_argument('--eda', action='store_true')
parser.add_argument('--debug', action='store_true')
#args = parser.parse_args()
args = parser.parse_args(args=[])

# 実験名を自動的に変更
if( args.exper_name == "single_model" ):
    args.exper_name += "_" + args.classifier
    if( args.params_file != "" ):
        args.exper_name += "_" + args.params_file.split(".")[0]

for key, value in vars(args).items():
    print('%s: %s' % (str(key), str(value)))


exper_name: single_model_catboost
dataset_dir: ../input/home-credit-default-risk
results_dir: ../output/kaggle/working
submit_file: submission.csv
competition_id: home-credit-default-risk
classifier: catboost
save_checkpoints_dir: checkpoints
params_file: 
load_checkpoints_paths: None
train_mode: train
gdbt_train_type: fit
n_splits: 4
onehot_encode: True
seed: 71
submit: False
eda: False
debug: False


# データセットの読み込み

# 前処理

In [12]:
def agg_dataframe_numric( df_data, agg_column, base_column_name, method = ['count', 'mean', 'max', 'min', 'sum'] ):
    """
    数値型のデータに対して、同じ値を持つ columns を集約したデータフレームを返す
    """
    # Remove id variables other than grouping variable
    for col in df_data:
        if col != agg_column and 'SK_ID' in col:
            df_data = df_data.drop(columns = col)

    #df_data_numric = df_data.select_dtypes('number').copy()
    #df_data_numric[agg_column] = df_data[agg_column].copy()
    df_data_numric = df_data.select_dtypes('number')
    df_data_numric[agg_column] = df_data[agg_column]

    # pd.groupby() で集約
    df_data_numric = df_data_numric.groupby( agg_column, as_index = False ).agg( method ).reset_index()

    # 列名を rename
    new_columns = [agg_column]
    for var in df_data_numric.columns.levels[0]:
        if var != agg_column:            
            for stat in df_data_numric.columns.levels[1][:-1]:
                if( var in base_column_name ):
                    new_columns.append( '%s_%s' % (var, stat))
                else:
                    new_columns.append( base_column_name + '_%s_%s' % (var, stat))

    df_data_numric.columns = new_columns

    # １つの値しか持たない列を除外
    """
    _, idx = np.unique( df_data_numric, axis = 1, return_index=True )
    df_data_numric = df_data_numric.iloc[:, idx]
    """    
    return df_data_numric

In [13]:
def agg_dataframe_categorical( df_data, agg_column, base_column_name, method = ['sum', 'count', 'mean'], one_hot_encode = True ):
    """
    カテゴリ型のデータに対して、同じ値を持つ columns を集約したデータフレームを返す
    """
    #df_data_categorical = df_data.select_dtypes('object').copy()
    #df_data_categorical[agg_column] = df_data[agg_column].copy()
    df_data_categorical = df_data.select_dtypes('object')
    df_data_categorical[agg_column] = df_data[agg_column]
    
    if( one_hot_encode ):
        df_data_categorical = pd.get_dummies( df_data_categorical )
    else:
        for col in df_data_categorical.columns:
            # ラベル情報のエンコード
            if( df_data_categorical[col].dtypes == "object" ):
                label_encoder = LabelEncoder()
                label_encoder.fit(list(df_data_categorical[col]))
                df_data_categorical[col] = label_encoder.transform(list(df_data_categorical[col]))

    # pd.groupby() で集約
    df_data_categorical = df_data_categorical.groupby( agg_column, as_index = False ).agg( method ).reset_index()

    # 列名を rename
    new_columns = [agg_column]
    for var in df_data_categorical.columns.levels[0]:
        if var != agg_column:            
            for stat in df_data_categorical.columns.levels[1][:-1]:
                """
                # カテゴリーデータに対しては、sum は count の意味になる
                if( stat == "sum" ):
                    stat = "count"
                # カテゴリーデータに対しては、mean は count_norm の意味になる
                elif( stat == "mean" ):
                    stat = "count_norm"
                """
                if( var in base_column_name ):
                    new_columns.append( '%s_%s' % (var, stat))
                else:
                    new_columns.append( base_column_name + '_%s_%s' % (var, stat))

    df_data_categorical.columns = new_columns

    # １つの値しか持たない列を除外
    """
    _, idx = np.unique( df_data_categorical, axis = 1, return_index=True )
    df_data_categorical = df_data_categorical.iloc[:, idx]
    """

    return df_data_categorical

In [14]:
# 目的変数
target_name = 'TARGET'
one_hot_encode = args.onehot_encode
    
#===========================
# 無用なデータを除外（結合前）
#===========================
# application_{train|test}
df_application_train = pd.read_csv( os.path.join(args.dataset_dir, "application_train.csv" ) )
df_application_test = pd.read_csv( os.path.join(args.dataset_dir, "application_test.csv" ) )
#df_application_train.drop(['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'], axis=1, inplace=True)
#df_application_test.drop(['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'], axis=1, inplace=True)

In [16]:
# 元データ
df_train = df_application_train
df_test = df_application_test

## PolynomialFeatures

In [18]:
df_train_poly_features = df_train[ ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'TARGET'] ]
df_test_poly_features = df_test[ ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'] ]
df_train_poly_features_target = df_train_poly_features[target_name]
df_train_poly_features = df_train_poly_features.drop(columns = [target_name])

# Need to impute missing values
imputer = SimpleImputer(strategy = 'median')
df_train_poly_features = imputer.fit_transform(df_train_poly_features)
df_test_poly_features = imputer.transform(df_test_poly_features)

# Train the polynomial features and Transform the features
poly_transformer = PolynomialFeatures(degree = 3)
poly_transformer.fit(df_train_poly_features)
df_train_poly_features = poly_transformer.transform(df_train_poly_features)
df_test_poly_features = poly_transformer.transform(df_test_poly_features)

# Create a dataframe of the features 
df_train_poly_features = pd.DataFrame(
    df_train_poly_features, 
    columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])
)
df_train_poly_features[target_name] = df_train_poly_features_target

# Put test features into dataframe
df_test_poly_features = pd.DataFrame(
    df_test_poly_features, 
    columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])
)


In [19]:
df_train_poly_features.head()

Unnamed: 0,1,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_BIRTH,EXT_SOURCE_1^2,EXT_SOURCE_1 EXT_SOURCE_2,EXT_SOURCE_1 EXT_SOURCE_3,EXT_SOURCE_1 DAYS_BIRTH,EXT_SOURCE_2^2,...,EXT_SOURCE_2^2 EXT_SOURCE_3,EXT_SOURCE_2^2 DAYS_BIRTH,EXT_SOURCE_2 EXT_SOURCE_3^2,EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH,EXT_SOURCE_2 DAYS_BIRTH^2,EXT_SOURCE_3^3,EXT_SOURCE_3^2 DAYS_BIRTH,EXT_SOURCE_3 DAYS_BIRTH^2,DAYS_BIRTH^3,TARGET
0,1.0,0.083037,0.262949,0.139376,-9461.0,0.006895,0.021834,0.011573,-785.612748,0.069142,...,0.009637,-654.152107,0.005108,-346.733022,23536670.0,0.002707,-183.785678,12475600.0,-846859000000.0,1
1,1.0,0.311267,0.622246,0.535276,-16765.0,0.096887,0.193685,0.166614,-5218.396475,0.38719,...,0.207254,-6491.237078,0.178286,-5583.975307,174891600.0,0.153368,-4803.518937,150447500.0,-4712058000000.0,0
2,1.0,0.505998,0.555912,0.729567,-19046.0,0.256034,0.28129,0.369159,-9637.236584,0.309038,...,0.225464,-5885.942404,0.295894,-7724.580288,201657200.0,0.388325,-10137.567875,264650400.0,-6908939000000.0,0
3,1.0,0.505998,0.650442,0.535276,-19005.0,0.256034,0.329122,0.270849,-9616.490669,0.423074,...,0.226462,-8040.528832,0.186365,-6616.894625,234933100.0,0.153368,-5445.325225,193336400.0,-6864416000000.0,0
4,1.0,0.505998,0.322738,0.535276,-19932.0,0.256034,0.163305,0.270849,-10085.550751,0.10416,...,0.055754,-2076.117157,0.092471,-3443.335521,128219000.0,0.153368,-5710.929881,212657000.0,-7918677000000.0,0


In [20]:
# Merge polynomial features into training dataframe
df_train_poly_features['SK_ID_CURR'] = df_train['SK_ID_CURR']
df_train = pd.merge( df_train, df_train_poly_features, on = 'SK_ID_CURR', how = 'left')

# Merge polnomial features into testing dataframe
df_test_poly_features['SK_ID_CURR'] = df_test['SK_ID_CURR']
df_test = pd.merge( df_test, df_test_poly_features, on = 'SK_ID_CURR', how = 'left')


In [21]:
df_train.head()

Unnamed: 0,SK_ID_CURR,TARGET_x,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,EXT_SOURCE_2^2 EXT_SOURCE_3,EXT_SOURCE_2^2 DAYS_BIRTH,EXT_SOURCE_2 EXT_SOURCE_3^2,EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH,EXT_SOURCE_2 DAYS_BIRTH^2,EXT_SOURCE_3^3,EXT_SOURCE_3^2 DAYS_BIRTH,EXT_SOURCE_3 DAYS_BIRTH^2,DAYS_BIRTH^3,TARGET_y
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.009637,-654.152107,0.005108,-346.733022,23536670.0,0.002707,-183.785678,12475600.0,-846859000000.0,1
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.207254,-6491.237078,0.178286,-5583.975307,174891600.0,0.153368,-4803.518937,150447500.0,-4712058000000.0,0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.225464,-5885.942404,0.295894,-7724.580288,201657200.0,0.388325,-10137.567875,264650400.0,-6908939000000.0,0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.226462,-8040.528832,0.186365,-6616.894625,234933100.0,0.153368,-5445.325225,193336400.0,-6864416000000.0,0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.055754,-2076.117157,0.092471,-3443.335521,128219000.0,0.153368,-5710.929881,212657000.0,-7918677000000.0,0


In [None]:
# Align the dataframes
df_train, df_test = df_train.align(df_test, join = 'inner', axis = 1)

In [None]:
df_train.head()

## サブ構造の結合[](http://)

### [](http://)bureau

In [None]:
# bureau
df_bureau = pd.read_csv( os.path.join(args.dataset_dir, "bureau.csv" ) )

In [None]:
df_bureau.shape
df_bureau.head()

In [None]:
df_bureau_agg_numric = agg_dataframe_numric( df_bureau, agg_column = 'SK_ID_CURR', base_column_name = "bureau" )
df_bureau_agg_categorical = agg_dataframe_categorical( df_bureau, agg_column = 'SK_ID_CURR', base_column_name = "bureau", one_hot_encode = one_hot_encode )

In [None]:
print( df_bureau_agg_numric.shape )
df_bureau_agg_numric.head()

In [None]:
print( df_bureau_agg_categorical.shape )
df_bureau_agg_categorical.head()

In [None]:
# 元のデータに統合
df_train = pd.merge(df_train, df_bureau_agg_numric, on='SK_ID_CURR', how='left' )
df_train = pd.merge(df_train, df_bureau_agg_categorical, on='SK_ID_CURR', how='left' )
df_test = pd.merge(df_test, df_bureau_agg_numric, on='SK_ID_CURR', how='left' )
df_test = pd.merge(df_test, df_bureau_agg_categorical, on='SK_ID_CURR', how='left' )

In [None]:
# 不要になったメモリを解放
del df_bureau_agg_numric, df_bureau_agg_categorical
gc.collect()

### bureau_balance

In [None]:
df_bureau_balance = pd.read_csv( os.path.join(args.dataset_dir, "bureau_balance.csv" ) )

In [None]:
df_bureau_balance.shape

In [None]:
df_bureau_balance.head()

In [None]:
# 同じ SK_ID_BUREAU を集約
df_bureau_balance_agg_numric = agg_dataframe_numric( df_bureau_balance, agg_column = 'SK_ID_BUREAU', base_column_name = "bureau_balance" )
df_bureau_balance_agg_categorical = agg_dataframe_categorical( df_bureau_balance, agg_column = 'SK_ID_BUREAU', base_column_name = "bureau_balance", one_hot_encode = one_hot_encode )

In [None]:
df_bureau_balance_agg_numric.shape
df_bureau_balance_agg_numric.head()

In [None]:
# 親データ （df_bureau） の 'SK_ID_CURR' に、対応する 'SK_ID_BUREAU' を紐付け
df_bureau_balance_agg_numric = df_bureau[['SK_ID_BUREAU', 'SK_ID_CURR']].merge(df_bureau_balance_agg_numric, on = 'SK_ID_BUREAU', how = 'left')
df_bureau_balance_agg_categorical = df_bureau[['SK_ID_BUREAU', 'SK_ID_CURR']].merge(df_bureau_balance_agg_categorical, on = 'SK_ID_BUREAU', how = 'left')

In [None]:
df_bureau_balance_agg_numric.shape
df_bureau_balance_agg_numric.head()

In [None]:
# １つの `SK_ID_CURR` に対して、複数の `SK_ID_BUREAU` が存在することになるので、`SK_ID_CURR` を集約
df_bureau_balance_agg_numric = agg_dataframe_numric( df_bureau_balance_agg_numric.drop(columns = ['SK_ID_BUREAU']), agg_column = 'SK_ID_CURR', base_column_name = "bureau_balance" )
df_bureau_balance_agg_categorical = agg_dataframe_numric( df_bureau_balance_agg_categorical.drop(columns = ['SK_ID_BUREAU']), agg_column = 'SK_ID_CURR', base_column_name = "bureau_balance" )

In [None]:
df_bureau_balance_agg_numric.shape
df_bureau_balance_agg_numric.head()

In [None]:
df_bureau_balance_agg_categorical.shape
df_bureau_balance_agg_categorical.head()

In [None]:
# 元のデータに統合
df_train = pd.merge(df_train, df_bureau_balance_agg_numric, on='SK_ID_CURR', how='left' )
df_train = pd.merge(df_train, df_bureau_balance_agg_categorical, on='SK_ID_CURR', how='left' )
df_test = pd.merge(df_test, df_bureau_balance_agg_numric, on='SK_ID_CURR', how='left' )
df_test = pd.merge(df_test, df_bureau_balance_agg_categorical, on='SK_ID_CURR', how='left' )

print( df_train.shape )

In [None]:
# 不要になったメモリを解放
del df_bureau, df_bureau_balance, df_bureau_balance_agg_numric, df_bureau_balance_agg_categorical
gc.collect()

### previous_application

In [None]:
df_previous_application = pd.read_csv( os.path.join(args.dataset_dir, "previous_application.csv" ) )    
df_previous_application_agg_numric = agg_dataframe_numric( df_previous_application, agg_column = 'SK_ID_CURR', base_column_name = "previous_application" )
df_previous_application_agg_categorical = agg_dataframe_categorical( df_previous_application, agg_column = 'SK_ID_CURR', base_column_name = "previous_application", one_hot_encode = one_hot_encode )

In [None]:
df_previous_application_agg_numric.head()

In [None]:
df_previous_application_agg_categorical.head()

In [None]:
# 元のデータに統合
df_train = pd.merge(df_train, df_previous_application_agg_numric, on='SK_ID_CURR', how='left' )
df_train = pd.merge(df_train, df_previous_application_agg_categorical, on='SK_ID_CURR', how='left' )
df_test = pd.merge(df_test, df_previous_application_agg_numric, on='SK_ID_CURR', how='left' )
df_test = pd.merge(df_test, df_previous_application_agg_categorical, on='SK_ID_CURR', how='left' )

print( df_train.shape )

In [None]:
# 不要になったメモリを解放
del df_previous_application_agg_numric, df_previous_application_agg_categorical
gc.collect()

### pos_cash_balance

In [None]:
df_pos_cash_balance = pd.read_csv( os.path.join(args.dataset_dir, "POS_CASH_balance.csv" ) )

# 同じ SK_ID_PREV を集約
df_pos_cash_balance_agg_numric = agg_dataframe_numric( df_pos_cash_balance, agg_column = 'SK_ID_PREV', base_column_name = "pos_cash_balance" )
df_pos_cash_balance_agg_categorical = agg_dataframe_categorical( df_pos_cash_balance, agg_column = 'SK_ID_PREV', base_column_name = "pos_cash_balance", one_hot_encode = one_hot_encode )

In [None]:
df_pos_cash_balance_agg_numric.head()

In [None]:
df_pos_cash_balance_agg_categorical.head()

In [None]:
# 親データ の 'SK_ID_CURR' に、対応する 'SK_ID_PREV' を紐付け
df_pos_cash_balance_agg_numric = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_pos_cash_balance_agg_numric, on = 'SK_ID_PREV', how = 'left')
df_pos_cash_balance_agg_categorical = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_pos_cash_balance_agg_categorical, on = 'SK_ID_PREV', how = 'left')

In [None]:
df_pos_cash_balance_agg_numric.head()

In [None]:
df_pos_cash_balance_agg_categorical.head()

In [None]:
# １つの `SK_ID_CURR` に対して、複数の `SK_ID_BUREAU` が存在することになるので、`SK_ID_CURR` を集約
df_pos_cash_balance_agg_numric = agg_dataframe_numric( df_pos_cash_balance_agg_numric.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "pos_cash_balance" )
df_pos_cash_balance_agg_categorical = agg_dataframe_numric( df_pos_cash_balance_agg_categorical.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "pos_cash_balance" )

In [None]:
df_pos_cash_balance_agg_numric.head()

In [None]:
df_pos_cash_balance_agg_categorical.head()

In [None]:
# 元のデータに統合
df_train = pd.merge(df_train, df_pos_cash_balance_agg_numric, on='SK_ID_CURR', how='left' )
df_train = pd.merge(df_train, df_pos_cash_balance_agg_categorical, on='SK_ID_CURR', how='left' )
df_test = pd.merge(df_test, df_pos_cash_balance_agg_numric, on='SK_ID_CURR', how='left' )
df_test = pd.merge(df_test, df_pos_cash_balance_agg_categorical, on='SK_ID_CURR', how='left' )

print( df_train.shape )

In [None]:
# 不要になったメモリを解放
del df_pos_cash_balance, df_pos_cash_balance_agg_numric, df_pos_cash_balance_agg_categorical
gc.collect()

### installments_payments

In [None]:
df_installments_payments = pd.read_csv( os.path.join(args.dataset_dir, "installments_payments.csv" ) )
df_installments_payments.head()

In [None]:
# 同じ SK_ID_PREV を集約
df_installments_payments_agg_numric = agg_dataframe_numric( df_installments_payments, agg_column = 'SK_ID_PREV', base_column_name = "installments_payments" )
df_installments_payments_agg_numric.head()

In [None]:
# 親データ の 'SK_ID_CURR' に、対応する 'SK_ID_PREV' を紐付け
df_installments_payments_agg_numric = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_installments_payments_agg_numric, on = 'SK_ID_PREV', how = 'left')

In [None]:
df_installments_payments_agg_numric.head()

In [None]:
# １つの `SK_ID_CURR` に対して、複数の `SK_ID_BUREAU` が存在することになるので、`SK_ID_CURR` を集約
df_installments_payments_agg_numric = agg_dataframe_numric( df_installments_payments_agg_numric.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "installments_payments" )

In [None]:
df_installments_payments_agg_numric.head()

In [None]:
# 元のデータに統合
df_train = pd.merge(df_train, df_installments_payments_agg_numric, on='SK_ID_CURR', how='left' )
df_test = pd.merge(df_test, df_installments_payments_agg_numric, on='SK_ID_CURR', how='left' )

print( df_train.shape )

In [None]:
# 不要になったメモリを解放
del df_installments_payments, df_installments_payments_agg_numric
gc.collect()

### credit_card_balance

In [None]:
df_credit_card_balance = pd.read_csv( os.path.join(args.dataset_dir, "credit_card_balance.csv" ) )
df_credit_card_balance.head()

In [None]:
# 同じ SK_ID_PREV を集約
df_credit_card_balance_agg_numric = agg_dataframe_numric( df_credit_card_balance, agg_column = 'SK_ID_PREV', base_column_name = "credit_card_balance" )
df_credit_card_balance_agg_categorical = agg_dataframe_categorical( df_credit_card_balance, agg_column = 'SK_ID_PREV', base_column_name = "credit_card_balance", one_hot_encode = one_hot_encode )

In [None]:
# 親データ の 'SK_ID_CURR' に、対応する 'SK_ID_PREV' を紐付け
df_credit_card_balance_agg_numric = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_credit_card_balance_agg_numric, on = 'SK_ID_PREV', how = 'left')
df_credit_card_balance_agg_categorical = df_previous_application[['SK_ID_PREV', 'SK_ID_CURR']].merge(df_credit_card_balance_agg_categorical, on = 'SK_ID_PREV', how = 'left')

In [None]:
# １つの `SK_ID_CURR` に対して、複数の `SK_ID_BUREAU` が存在することになるので、`SK_ID_CURR` を集約
df_credit_card_balance_agg_numric = agg_dataframe_numric( df_credit_card_balance_agg_numric.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "credit_card_balance" )
df_credit_card_balance_agg_categorical = agg_dataframe_numric( df_credit_card_balance_agg_categorical.drop(columns = ['SK_ID_PREV']), agg_column = 'SK_ID_CURR', base_column_name = "credit_card_balance" )

In [None]:
# 元のデータに統合
df_train = pd.merge(df_train, df_credit_card_balance_agg_numric, on='SK_ID_CURR', how='left' )
df_train = pd.merge(df_train, df_credit_card_balance_agg_categorical, on='SK_ID_CURR', how='left' )
df_test = pd.merge(df_test, df_credit_card_balance_agg_numric, on='SK_ID_CURR', how='left' )
df_test = pd.merge(df_test, df_credit_card_balance_agg_categorical, on='SK_ID_CURR', how='left' )
df_train.shape

In [None]:
# 不要になったメモリを解放
del df_credit_card_balance, df_credit_card_balance_agg_numric, df_credit_card_balance_agg_categorical
gc.collect()