# [Home Credit Default Risk](https://www.kaggle.com/c/home-credit-default-risk)
## Training
### Platform: Python 3, colab.research.google.com

In [0]:
import os
import numpy as np
import pandas as pd
import sys
from joblib import dump, load
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
try:
    from google.colab import drive, files
    if pd.__version__ != "0.24.1":
        !pip install pandas==0.24.1
        import pandas as pd
    os.environ["runtime"] = "hosted"
    drive.mount("/content/gdrive", force_remount=True)
    project_folder = "/content/gdrive/My Drive/Colab Notebooks/kaggle-home-credit-default-risk"
except:
    os.environ["runtime"] = "local"
    project_folder = "."
if project_folder not in sys.path:
    sys.path.append(project_folder)
sys.path

Mounted at /content/gdrive


['',
 '/env/python',
 '/usr/lib/python36.zip',
 '/usr/lib/python3.6',
 '/usr/lib/python3.6/lib-dynload',
 '/usr/local/lib/python3.6/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.6/dist-packages/IPython/extensions',
 '/root/.ipython',
 '/content/gdrive/My Drive/Colab Notebooks/kaggle-home-credit-default-risk']

Load custom modules

In [0]:
import transforms

## Load data

In [4]:
app_train_1 = pd.read_csv("{}/data/featuretools_matrix_train_part_1.csv".format(project_folder), header=0, index_col=0)
app_train_2 = pd.read_csv("{}/data/featuretools_matrix_train_part_2.csv".format(project_folder), header=0, index_col=0)
app_train = pd.concat([app_train_1, app_train_2])
app_train = app_train.head(200000) # limit data; uncomment to train on all data
app_train.shape

(200000, 308)

In [5]:
app_test = pd.read_csv("{}/data/featuretools_matrix_test.csv".format(project_folder), header=0, index_col=0)
app_test.shape

(48744, 307)

In [0]:
assert 0 == sum(app_train.isnull().sum())

In [7]:
app_train.head(5)

Unnamed: 0_level_0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,PERCENTILE(MIN(bureau.DAYS_CREDIT_UPDATE)),PERCENTILE(COUNT(bureau)),PERCENTILE(NUM_UNIQUE(bureau.CREDIT_ACTIVE)),PERCENTILE(NUM_UNIQUE(bureau.CREDIT_CURRENCY)),PERCENTILE(NUM_UNIQUE(bureau.CREDIT_TYPE)),PERCENTILE(MAX(bureau_balance.MONTHS_BALANCE)),PERCENTILE(MEDIAN(bureau_balance.MONTHS_BALANCE)),PERCENTILE(MIN(bureau_balance.MONTHS_BALANCE)),PERCENTILE(COUNT(bureau_balance)),PERCENTILE(NUM_UNIQUE(bureau_balance.STATUS))
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.444073,0.807051,0.641556,0.498305,0.632636,0.524182,0.33216,0.611359,0.849906,0.837561
100003,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.192727,0.530583,0.641556,0.498305,0.632636,0.524182,0.501371,0.501835,0.350101,0.516381
100004,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.668458,0.319087,0.14976,0.498305,0.180045,0.524182,0.501371,0.501835,0.350101,0.516381
100006,29686.5,312682.5,297000.0,135000.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.500178,0.071347,0.641556,0.498305,0.632636,0.524182,0.501371,0.501835,0.350101,0.516381
100007,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.618759,0.201966,0.14976,0.498305,0.180045,0.524182,0.501371,0.501835,0.350101,0.516381


## Helper methods

In [0]:
def view_metrics(y_test, y_pred):
    print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
    print("Precision: {}".format(precision_score(y_test, y_pred)))
    print("Recall: {}".format(recall_score(y_test, y_pred)))
    print("F1: {}".format(f1_score(y_test, y_pred)))
    print("AUC: {}".format(roc_auc_score(y_test, y_pred)))

In [0]:
categorical_cols = [
    "CODE_GENDER", "EMERGENCYSTATE_MODE", "FLAG_CONT_MOBILE",
    "FLAG_DOCUMENT_10", "FLAG_DOCUMENT_11", "FLAG_DOCUMENT_12",
    "FLAG_DOCUMENT_13", "FLAG_DOCUMENT_14", "FLAG_DOCUMENT_15",
    "FLAG_DOCUMENT_16", "FLAG_DOCUMENT_17", "FLAG_DOCUMENT_18",
    "FLAG_DOCUMENT_19", "FLAG_DOCUMENT_2", "FLAG_DOCUMENT_20",
    "FLAG_DOCUMENT_21", "FLAG_DOCUMENT_3", "FLAG_DOCUMENT_4",
    "FLAG_DOCUMENT_5", "FLAG_DOCUMENT_6", "FLAG_DOCUMENT_7",
    "FLAG_DOCUMENT_8", "FLAG_DOCUMENT_9", "FLAG_EMAIL",
    "FLAG_EMP_PHONE", "FLAG_MOBIL", "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY", "FLAG_PHONE", "FLAG_WORK_PHONE", 
    "HOUSETYPE_MODE", "LIVE_CITY_NOT_WORK_CITY", "LIVE_REGION_NOT_WORK_REGION",
    "NAME_CONTRACT_TYPE", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE", "NAME_INCOME_TYPE", "NAME_TYPE_SUITE",
    "OCCUPATION_TYPE", "ORGANIZATION_TYPE", "REG_CITY_NOT_LIVE_CITY",
    "REG_CITY_NOT_WORK_CITY", "REG_REGION_NOT_LIVE_REGION", "REG_REGION_NOT_WORK_REGION",
    "WALLSMATERIAL_MODE", "WEEKDAY_APPR_PROCESS_START"]

In [10]:
categorical_transforms = {}
for i in categorical_cols:
    cat_column = app_train.loc[:, i].append(app_test.loc[:, i])
    dummies = pd.get_dummies(cat_column, prefix=i)
    dummies_cols = list(dummies.columns)[1:] # drop one category to avoid feature correlation
    categorical_transforms[i] = dummies_cols
dump(categorical_transforms, "{}/categorical_transforms.joblib".format(project_folder))

['/content/gdrive/My Drive/Colab Notebooks/kaggle-home-credit-default-risk/categorical_transforms.joblib']

## Train

In [11]:
# split train/test
y = app_train.loc[:, "TARGET"]
X = app_train.drop(["TARGET"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
X_train.head(5)

Unnamed: 0_level_0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,PERCENTILE(MIN(bureau.DAYS_CREDIT_UPDATE)),PERCENTILE(COUNT(bureau)),PERCENTILE(NUM_UNIQUE(bureau.CREDIT_ACTIVE)),PERCENTILE(NUM_UNIQUE(bureau.CREDIT_CURRENCY)),PERCENTILE(NUM_UNIQUE(bureau.CREDIT_TYPE)),PERCENTILE(MAX(bureau_balance.MONTHS_BALANCE)),PERCENTILE(MEDIAN(bureau_balance.MONTHS_BALANCE)),PERCENTILE(MIN(bureau_balance.MONTHS_BALANCE)),PERCENTILE(COUNT(bureau_balance)),PERCENTILE(NUM_UNIQUE(bureau_balance.STATUS))
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
124782,31518.0,652500.0,652500.0,135000.0,0.0,0.0,1.0,0.0,0.0,4.0,...,0.237168,0.530583,0.641556,0.498305,0.632636,0.524182,0.132484,0.392477,0.83682,0.516381
317542,22140.0,562491.0,454500.0,90000.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.16354,0.422348,0.146992,0.498195,0.627192,0.513062,0.496022,0.500248,0.271422,0.460572
100888,26446.5,900000.0,900000.0,135000.0,0.0,0.0,0.0,0.0,0.0,6.0,...,0.790937,0.201966,0.14976,0.498305,0.180045,0.524182,0.501371,0.501835,0.350101,0.516381
313942,14751.0,269550.0,225000.0,103500.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.60272,0.79827,0.63913,0.498195,0.627192,0.513062,0.793682,0.78443,0.743882,0.809089
143005,30708.0,601470.0,450000.0,144000.0,0.0,0.0,0.0,2.0,0.0,2.0,...,0.92551,0.201966,0.14976,0.498305,0.180045,0.524182,0.919828,0.939099,0.718657,0.176235


In [0]:
transformed_features = None
categorical_transforms = load("{}/categorical_transforms.joblib".format(project_folder))

In [0]:
verbose = False

pipe_list = []
for i in categorical_cols:
    if i in X_train.columns:
        pipe_list.append(
            ("enc_cat_"+i, transforms.CategoricalColInt(
                i, categorical_transforms, verbose=verbose)
            )
        )
pipe_list.append(("k_best_selector", transforms.SelectKBestFeatures(25, verbose)))

pipe_list.append(("model", 
    RandomForestClassifier(n_estimators=100, max_features=None)
))
# pipe_list.append(("model", 
#     SVC(C=1.0, degree=3)
# ))

pipeline = Pipeline(pipe_list)
# model = pipeline.fit(X_train, y_train)

In [14]:
# RandomForestClassifier
parameters = {
    "k_best_selector__k": [100],
    "model__n_estimators": [50, 100, 150],
    "model__max_features": [10, 20, 50]
}

# # SVC
# parameters = {
#     "k_best_selector__k": [7, 10],
#     "model__C": [0.7, 0.5],
#     "model__degree": [2, 3]
# }

grid = GridSearchCV(pipeline, parameters, cv=3, scoring="roc_auc")
grid.fit(X_train, y_train)

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('enc_cat_CODE_GENDER', <transforms.CategoricalColInt object at 0x7f98d3a72fd0>), ('enc_cat_EMERGENCYSTATE_MODE', <transforms.CategoricalColInt object at 0x7f98d3a72cc0>), ('enc_cat_FLAG_CONT_MOBILE', <transforms.CategoricalColInt object at 0x7f98d3a72908>), ('enc_cat_FLAG_DOCUMENT_10', <tran...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'k_best_selector__k': [100], 'model__n_estimators': [50, 100, 150], 'model__max_features': [10, 20, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [15]:
grid.best_params_

{'k_best_selector__k': 100,
 'model__max_features': 10,
 'model__n_estimators': 50}

In [16]:
model = grid.best_estimator_
y_pred = model.predict(X_train)
view_metrics(y_train, y_pred)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0
AUC: 1.0


In [17]:
y_pred = model.predict(X_test)
view_metrics(y_test, y_pred)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0
AUC: 1.0


In [18]:
dump(model, "{}/model.joblib".format(project_folder))

['/content/gdrive/My Drive/Colab Notebooks/kaggle-home-credit-default-risk/model.joblib']

In [19]:
print(len(y_test))
print(sum(y_test))
print(sum(y_pred))

60000
4838.0
4838.0
