In [None]:
!${HOME}
!${PWD}
!ls -l

In [None]:
import os
import argparse
import numpy as np
import pandas as pd
import random
import warnings
import json
import yaml
from matplotlib import pyplot as plt
import seaborn as sns
import gc
#from kaggle.api.kaggle_api_extended import KaggleApi

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

# 機械学習モデル
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost


In [None]:
def build_exper_name(exper_name, classifier, params_file):
    """Expand the default experiment name with the classifier and parameter file.

    Args:
        exper_name: Experiment name; only the placeholder "single_model" is expanded.
        classifier: Classifier identifier appended to the placeholder name.
        params_file: Path of the parameter file, or "" when none is used.

    Returns:
        `exper_name` unchanged unless it is "single_model"; otherwise
        "single_model_<classifier>" plus "_<params file stem>" when a
        parameter file is given.
    """
    if exper_name != "single_model":
        return exper_name

    exper_name += "_" + classifier
    if params_file != "":
        # Use only the file stem: a plain split(".") returns "" for "../x.yml"
        # and keeps directory separators for "params/x.yml".
        exper_name += "_" + os.path.splitext(os.path.basename(params_file))[0]
    return exper_name


parser = argparse.ArgumentParser()
parser.add_argument("--exper_name", default="single_model", help="実験名")
parser.add_argument("--dataset_dir", type=str, default="../input/home-credit-default-risk")
parser.add_argument("--results_dir", type=str, default="../output/kaggle/working")
parser.add_argument("--submit_file", type=str, default="submission.csv")
parser.add_argument("--competition_id", type=str, default="home-credit-default-risk")
parser.add_argument("--classifier", choices=["logistic", "knn", "svm", "random_forest", "bagging", "adaboost", "xgboost", "lightgbm", "catboost", "mlp"], default="catboost", help="分類器モデルの種類")
parser.add_argument('--save_checkpoints_dir', type=str, default="checkpoints", help="モデルの保存ディレクトリ")
parser.add_argument("--params_file", type=str, default="")
parser.add_argument('--load_checkpoints_paths', action='append', help="モデルの読み込みファイルのパス")
parser.add_argument("--train_mode", choices=["train", "test", "eval"], default="train", help="")
parser.add_argument('--gdbt_train_type', choices=['train', 'fit'], default="fit", help="GDBTの学習タイプ")
parser.add_argument("--n_splits", type=int, default=4, help="CV での学習用データセットの分割数")
parser.add_argument("--seed", type=int, default=71)
parser.add_argument('--submit', action='store_true')
parser.add_argument('--eda', action='store_true')
parser.add_argument('--debug', action='store_true')
# Parse an explicit empty argument list so that every option takes its default
# (sys.argv is not consulted inside the notebook).
args = parser.parse_args(args=[])

# Automatically refine the default experiment name
args.exper_name = build_exper_name(args.exper_name, args.classifier, args.params_file)

# Echo the effective configuration
for key, value in vars(args).items():
    print('%s: %s' % (str(key), str(value)))


# データセットの読み込み

In [None]:
# Load the application train/test tables from the dataset directory.
application_train_path = os.path.join(args.dataset_dir, "application_train.csv")
application_test_path = os.path.join(args.dataset_dir, "application_test.csv")

df_application_train = pd.read_csv(application_train_path)
df_application_test = pd.read_csv(application_test_path)

In [None]:
df_application_train.shape

In [None]:
df_application_train.head()

# 前処理

In [None]:
def rename_columns_levels( df_data, base_name, base_columns_name ):
    """Build flat column names for a frame with two-level (variable, stat) columns.

    Intended for the result of ``groupby(...).agg([...]).reset_index()``: key
    columns restored by ``reset_index`` carry an empty second level and keep
    their own name, every other column becomes ``<base_name>_<var>_<stat>``.

    Names are derived from the actual column tuples, in column order, so the
    returned list always has one entry per column and lines up with the data
    regardless of how the MultiIndex levels happen to be ordered internally.

    Args:
        df_data: DataFrame whose columns are a two-level MultiIndex.
        base_name: Prefix for the aggregated feature columns.
        base_columns_name: Name (or list of names) of the key column(s).
            Kept for interface compatibility; key columns are recognised by
            their empty stat level.

    Returns:
        list of str: one flat name per column of ``df_data``.

    Raises:
        ValueError: if the columns of ``df_data`` do not have exactly two levels.
    """
    if df_data.columns.nlevels != 2:
        raise ValueError("rename_columns_levels expects two-level (variable, stat) columns")

    columns = []
    for var, stat in df_data.columns:
        if stat == "":
            # Key column brought back by reset_index(): keep its own name
            columns.append(var)
        else:
            # Aggregated feature: prefix with the table name
            columns.append( base_name + '_%s_%s' % (var, stat))

    return columns

In [None]:

# Target variable
target_name = 'TARGET'

#===========================
# Drop unused columns (before merging)
#===========================
# application_{train|test}: the same FLAG_DOCUMENT_* columns are removed from both tables
unused_flag_document_columns = [
    'FLAG_DOCUMENT_%d' % doc_no
    for doc_no in [2, 4, 5, 7] + list(range(9, 22))
]
df_application_train.drop(columns=unused_flag_document_columns, inplace=True)
df_application_test.drop(columns=unused_flag_document_columns, inplace=True)


In [None]:
df_application_train.shape

## サブ構造の結合

### bureau

In [None]:
# bureau: load the bureau table and its monthly balance table
bureau_path = os.path.join(args.dataset_dir, "bureau.csv")
bureau_balance_path = os.path.join(args.dataset_dir, "bureau_balance.csv")

df_bureau = pd.read_csv(bureau_path)
df_bureau_balance = pd.read_csv(bureau_balance_path)

In [None]:
df_bureau_balance.shape

In [None]:
df_bureau_balance.head()

In [None]:
# bureau_balance
# Encode label (object-dtype) columns as integer codes
for col in df_bureau_balance.columns:
    if( df_bureau_balance[col].dtypes == "object" ):
        df_bureau_balance[col] = LabelEncoder().fit_transform(list(df_bureau_balance[col]))

# Aggregate the rows that share an SK_ID_BUREAU.
# The key is kept in the index during aggregation and restored as a column by
# reset_index(). `as_index=False` is deliberately not used: on pandas versions
# that honor it for list aggregations, reset_index() would add an extra index
# column and break the column renaming that follows.
df_bureau_balance_agg = df_bureau_balance.groupby('SK_ID_BUREAU').agg(['count', 'mean', 'max', 'min']).reset_index()

# Free the raw table, which is no longer needed
del df_bureau_balance
gc.collect()

In [None]:
df_bureau_balance_agg.head()

In [None]:
# Flatten the two-level (variable, stat) columns into single names prefixed with "bureau_balance"
df_bureau_balance_agg.columns = rename_columns_levels( df_bureau_balance_agg, "bureau_balance", 'SK_ID_BUREAU' )

In [None]:
df_bureau_balance_agg.shape

In [None]:
df_bureau_balance_agg.head()

In [None]:
# Link each SK_ID_BUREAU aggregate row to its SK_ID_CURR
bureau_id_pairs = df_bureau[['SK_ID_BUREAU', 'SK_ID_CURR']]
df_bureau_balance_agg = bureau_id_pairs.merge(df_bureau_balance_agg, on='SK_ID_BUREAU', how='left')

In [None]:
df_bureau_balance_agg.head()

In [None]:
# Encode label (object-dtype) columns of bureau as integer codes
for col in df_bureau.columns:
    if( df_bureau[col].dtypes == "object" ):
        df_bureau[col] = LabelEncoder().fit_transform(list(df_bureau[col]))

# Aggregate the rows that share an SK_ID_CURR into count / mean / max / min of
# every remaining feature (SK_ID_BUREAU is an identifier, so it is dropped first).
# The key is kept in the index during aggregation and restored by reset_index();
# `as_index=False` is deliberately not used because, on pandas versions that honor
# it for list aggregations, reset_index() would add an extra index column and the
# renamed column list would no longer match.
df_bureau_agg = df_bureau.drop(columns = ['SK_ID_BUREAU']).groupby('SK_ID_CURR').agg(['count', 'mean', 'max', 'min']).reset_index()
df_bureau_agg.columns = rename_columns_levels( df_bureau_agg, "bureau", 'SK_ID_CURR' )

In [None]:
df_bureau_agg.shape

In [None]:
df_bureau_agg.head()

In [None]:
# Join the bureau_balance aggregates onto bureau.
# df_bureau_balance_agg holds one row per bureau record (SK_ID_BUREAU), so the join
# must include SK_ID_BUREAU. Joining on SK_ID_CURR alone would pair every record of
# an applicant with every other record of the same applicant (many-to-many row
# explosion) and leave duplicated SK_ID_BUREAU_x / SK_ID_BUREAU_y columns.
df_bureau = pd.merge(df_bureau, df_bureau_balance_agg, on=['SK_ID_CURR', 'SK_ID_BUREAU'], how='left' )

# Free memory that is no longer needed
del df_bureau_balance_agg
gc.collect()

In [None]:
df_bureau.shape

In [None]:
df_bureau.head()

In [None]:
# Attach the per-SK_ID_CURR bureau aggregates to every bureau row
df_bureau = df_bureau.merge(df_bureau_agg, how='left', on='SK_ID_CURR')

# Free memory that is no longer needed
del df_bureau_agg
gc.collect()

In [None]:
df_bureau.shape

In [None]:
df_bureau.head()

### previous_application

In [None]:
#---------------------------
# previous_application sub-tables: POS_CASH_balance
#---------------------------
df_pos_cash_balance = pd.read_csv( os.path.join(args.dataset_dir, "POS_CASH_balance.csv" ) )

# Encode label (object-dtype) columns as integer codes
for col in df_pos_cash_balance.columns:
    if( df_pos_cash_balance[col].dtypes == "object" ):
        df_pos_cash_balance[col] = LabelEncoder().fit_transform(list(df_pos_cash_balance[col]))

# Aggregate per SK_ID_PREV.
# - SK_ID_CURR is an identifier, not a feature, so it is removed before aggregating
#   (errors="ignore" keeps this a no-op if the column is absent).
# - The key column name is passed as a plain string: rename_columns_levels expects a
#   single key name, and a list here produced a column list of the wrong length.
# - The key stays in the index during aggregation and is restored by reset_index().
df_pos_cash_balance_agg = df_pos_cash_balance.drop(columns = ['SK_ID_CURR'], errors = "ignore").groupby('SK_ID_PREV').agg(['count', 'mean', 'max', 'min']).reset_index()
df_pos_cash_balance_agg.columns = rename_columns_levels( df_pos_cash_balance_agg, "pos_cash_balance", 'SK_ID_PREV' )

# Free memory that is no longer needed
del df_pos_cash_balance
gc.collect()

In [None]:
df_pos_cash_balance_agg.shape

In [None]:
df_pos_cash_balance_agg.head()

In [None]:
# installments_payments
df_installments_payments = pd.read_csv( os.path.join(args.dataset_dir, "installments_payments.csv" ) )

# Encode label (object-dtype) columns as integer codes
for col in df_installments_payments.columns:
    if( df_installments_payments[col].dtypes == "object" ):
        df_installments_payments[col] = LabelEncoder().fit_transform(list(df_installments_payments[col]))

# Aggregate per SK_ID_PREV.
# - SK_ID_CURR is an identifier, not a feature, so it is removed before aggregating
#   (errors="ignore" keeps this a no-op if the column is absent).
# - The key column name is passed as a plain string: rename_columns_levels expects a
#   single key name, and a list here produced a column list of the wrong length.
# - The key stays in the index during aggregation and is restored by reset_index().
df_installments_payments_agg = df_installments_payments.drop(columns = ['SK_ID_CURR'], errors = "ignore").groupby('SK_ID_PREV').agg(['count', 'mean', 'max', 'min']).reset_index()
df_installments_payments_agg.columns = rename_columns_levels( df_installments_payments_agg, "installments_payments", 'SK_ID_PREV' )

# Free memory that is no longer needed
del df_installments_payments
gc.collect()

In [None]:
df_installments_payments_agg.shape

In [None]:
df_installments_payments_agg.head()

In [None]:
# credit_card_balance
df_credit_card_balance = pd.read_csv( os.path.join(args.dataset_dir, "credit_card_balance.csv" ) )

# Encode label (object-dtype) columns as integer codes
for col in df_credit_card_balance.columns:
    if( df_credit_card_balance[col].dtypes == "object" ):
        df_credit_card_balance[col] = LabelEncoder().fit_transform(list(df_credit_card_balance[col]))

# Aggregate per SK_ID_PREV.
# - SK_ID_CURR is an identifier, not a feature, so it is removed before aggregating
#   (errors="ignore" keeps this a no-op if the column is absent).
# - The key column name is passed as a plain string: rename_columns_levels expects a
#   single key name, and a list here produced a column list of the wrong length.
# - The key stays in the index during aggregation and is restored by reset_index().
df_credit_card_balance_agg = df_credit_card_balance.drop(columns = ['SK_ID_CURR'], errors = "ignore").groupby('SK_ID_PREV').agg(['count', 'mean', 'max', 'min']).reset_index()
df_credit_card_balance_agg.columns = rename_columns_levels( df_credit_card_balance_agg, "credit_card_balance", 'SK_ID_PREV' )

# Free memory that is no longer needed
del df_credit_card_balance
gc.collect()

In [None]:
df_credit_card_balance_agg.shape

In [None]:
df_credit_card_balance_agg.head()

In [None]:
# previous_application
df_previous_application = pd.read_csv( os.path.join(args.dataset_dir, "previous_application.csv" ) )

# Encode label (object-dtype) columns as integer codes
for col in df_previous_application.columns:
    if( df_previous_application[col].dtypes == "object" ):
        df_previous_application[col] = LabelEncoder().fit_transform(list(df_previous_application[col]))

# Aggregate per SK_ID_CURR.
# The key stays in the index during aggregation and is restored by reset_index();
# `as_index=False` is deliberately not used because, on pandas versions that honor
# it for list aggregations, reset_index() would add an extra index column.
df_previous_application_agg = df_previous_application.groupby('SK_ID_CURR').agg(['count', 'mean', 'max', 'min']).reset_index()
# Column prefix spelled out in full (it previously read "revious_application")
df_previous_application_agg.columns = rename_columns_levels( df_previous_application_agg, "previous_application", 'SK_ID_CURR' )

In [None]:
df_previous_application_agg.shape

In [None]:
df_previous_application_agg.head()

In [None]:
# Join the sub-table aggregates onto previous_application:
# the three per-SK_ID_PREV tables first, then the per-SK_ID_CURR aggregates.
df_previous_application = (
    df_previous_application
    .merge(df_pos_cash_balance_agg, how='left', on='SK_ID_PREV')
    .merge(df_installments_payments_agg, how='left', on='SK_ID_PREV')
    .merge(df_credit_card_balance_agg, how='left', on='SK_ID_PREV')
    .merge(df_previous_application_agg, how='left', on='SK_ID_CURR')
)

# Free memory that is no longer needed
del df_pos_cash_balance_agg
del df_installments_payments_agg
del df_credit_card_balance_agg
del df_previous_application_agg
gc.collect()

In [None]:
df_previous_application.shape

In [None]:
df_previous_application.head()

In [None]:
# Switch off Python's automatic garbage collection from this point on.
# NOTE(review): earlier cells call gc.collect() after freeing frames; confirm that
# disabling the collector is intended here rather than another gc.collect().
gc.disable()