In [4]:
import os
import math
import gc

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

In [3]:
df = pd.read_csv('../data/processed_data_3.7.csv', compression = 'zip')

In [5]:
feats = [f for f in df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
        
# Split to train and test:
y = df['TARGET']
X = df[feats]
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.mean()).clip(-1e11,1e11)

print("X shape: ", X.shape, "    y shape:", y.shape)
print("\nPreparing data...")

training = y.notnull()
testing = y.isnull()

X_train = X.loc[training,:]
X_test = X.loc[testing,:]
y_train = y.loc[training]

# Scale:

X shape:  (356255, 1524)     y shape: (356255,)

Preparing data...


In [6]:
X.head()

Unnamed: 0.1,Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_1ST_VERSION_DRAWING_DEF_MIN,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_1ST_VERSION_DRAWING_DEF_MEAN,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_1ST_VERSION_DRAWING_DEF_MAX,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_1ST_VERSION_DRAWING_DEF_SUM,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_1ST_VERSION_DRAWING_DEF_VAR,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_TERMINATION_DRAWING_DEF_MIN,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_TERMINATION_DRAWING_DEF_MEAN,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_TERMINATION_DRAWING_DEF_MAX,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_TERMINATION_DRAWING_DEF_SUM,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_TERMINATION_DRAWING_DEF_VAR
0,0.0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,...,100.066966,100.072858,100.079022,0.0,0.005439,100.048384,100.051927,100.055862,0.0,0.00232
1,1.0,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,...,100.066966,100.072858,100.079022,0.0,0.005439,100.048384,100.051927,100.055862,0.0,0.00232
2,2.0,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,...,100.066966,100.072858,100.079022,0.0,0.005439,100.048384,100.051927,100.055862,0.0,0.00232
3,3.0,29686.5,312682.5,297000.0,135000.0,0.006281,0.005808,0.231697,0.304399,0.029995,...,100.066966,100.072858,100.079022,0.0,0.005439,100.048384,100.051927,100.055862,0.0,0.00232
4,4.0,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,...,100.066966,100.072858,100.079022,0.0,0.005439,100.048384,100.051927,100.055862,0.0,0.00232


In [10]:
X.isnull().values.any()


True

In [18]:
X.isnull().any()

Unnamed: 0                                                          False
AMT_ANNUITY                                                         False
AMT_CREDIT                                                          False
AMT_GOODS_PRICE                                                     False
AMT_INCOME_TOTAL                                                    False
AMT_REQ_CREDIT_BUREAU_DAY                                           False
AMT_REQ_CREDIT_BUREAU_HOUR                                          False
AMT_REQ_CREDIT_BUREAU_MON                                           False
AMT_REQ_CREDIT_BUREAU_QRT                                           False
AMT_REQ_CREDIT_BUREAU_WEEK                                          False
AMT_REQ_CREDIT_BUREAU_YEAR                                          False
APARTMENTS_AVG                                                      False
APARTMENTS_MEDI                                                     False
APARTMENTS_MODE                       

In [26]:
X.isnull().any()

TypeError: 'numpy.ndarray' object is not callable

In [29]:
list(X.loc[:, X.isnull().any()].columns)

['NEW_RATIO_BURO_CREDIT_ACTIVE_Active_MEAN',
 'NEW_RATIO_BURO_CREDIT_ACTIVE_Bad debt_MEAN',
 'NEW_RATIO_BURO_CREDIT_ACTIVE_Sold_MEAN',
 'NEW_RATIO_BURO_CREDIT_ACTIVE_nan_MEAN',
 'NEW_RATIO_BURO_CREDIT_CURRENCY_nan_MEAN',
 'NEW_RATIO_BURO_CREDIT_TYPE_Mobile operator loan_MEAN',
 'NEW_RATIO_BURO_CREDIT_TYPE_nan_MEAN',
 'REFUSED_DAYS_LENGTH_MIN',
 'REFUSED_DAYS_LENGTH_MAX',
 'REFUSED_DAYS_LENGTH_MEAN',
 'REFUSED_DAYS_LENGTH_1ST_VERSION_MIN',
 'REFUSED_DAYS_LENGTH_1ST_VERSION_MAX',
 'REFUSED_DAYS_LENGTH_1ST_VERSION_MEAN',
 'REFUSED_DAYS_LENGTH_TERMINATION_MIN',
 'REFUSED_DAYS_LENGTH_TERMINATION_MAX',
 'REFUSED_DAYS_LENGTH_TERMINATION_MEAN',
 'REFUSED_NUM_INSTALMENTS_MIN',
 'REFUSED_NUM_INSTALMENTS_MAX',
 'REFUSED_NUM_INSTALMENTS_MEAN',
 'REFUSED_NUM_INSTALMENTS_1ST_VERSION_MIN',
 'REFUSED_NUM_INSTALMENTS_1ST_VERSION_MAX',
 'REFUSED_NUM_INSTALMENTS_1ST_VERSION_MEAN',
 'REFUSED_NUM_INSTALMENTS_TERMINATION_MIN',
 'REFUSED_NUM_INSTALMENTS_TERMINATION_MAX',
 'REFUSED_NUM_INSTALMENTS_TERMINATION

In [30]:
X = X.drop(list(X.loc[:, X.isnull().any()].columns), axis = 1)

In [31]:
scaler = MinMaxScaler()
scaler.fit(X)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [32]:
X.sample(20)

Unnamed: 0.1,Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_1ST_VERSION_DRAWING_DEF_MIN,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_1ST_VERSION_DRAWING_DEF_MEAN,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_1ST_VERSION_DRAWING_DEF_MAX,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_1ST_VERSION_DRAWING_DEF_SUM,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_1ST_VERSION_DRAWING_DEF_VAR,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_TERMINATION_DRAWING_DEF_MIN,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_TERMINATION_DRAWING_DEF_MEAN,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_TERMINATION_DRAWING_DEF_MAX,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_TERMINATION_DRAWING_DEF_SUM,POS_FIRST_LATE_IN_LOAN_TERM_DRAWING_TERMINATION_DRAWING_DEF_VAR
203493,203493.0,28417.5,967428.0,693000.0,144000.0,0.0,0.0,0.0,1.0,0.0,...,100.058533,100.058533,100.058533,100.058533,0.005439,100.059078,100.059078,100.059078,100.059078,0.00232
66271,66271.0,25258.5,778968.0,558000.0,76500.0,0.0,0.0,0.0,1.0,0.0,...,100.066966,100.072858,100.079022,0.0,0.005439,100.048384,100.051927,100.055862,0.0,0.00232
13701,13701.0,44748.0,1125000.0,1125000.0,171000.0,0.0,0.0,0.0,0.0,1.0,...,100.066966,100.072858,100.079022,0.0,0.005439,100.048384,100.051927,100.055862,0.0,0.00232
226876,226876.0,17230.5,337500.0,337500.0,112500.0,0.0,0.0,0.0,0.0,0.0,...,100.066966,100.072858,100.079022,0.0,0.005439,100.048384,100.051927,100.055862,0.0,0.00232
139761,139761.0,37665.0,1288350.0,1125000.0,180000.0,0.0,0.0,0.0,0.0,0.0,...,100.066966,100.072858,100.079022,0.0,0.005439,100.048384,100.051927,100.055862,0.0,0.00232
211789,211789.0,34294.5,1168452.0,837000.0,121500.0,0.0,0.0,0.0,2.0,0.0,...,100.031572,100.031572,100.031572,100.031572,0.005439,100.035928,100.035928,100.035928,100.035928,0.00232
203839,203839.0,23787.0,540000.0,540000.0,216000.0,0.0,0.0,0.0,0.0,0.0,...,100.066966,100.072858,100.079022,0.0,0.005439,100.048384,100.051927,100.055862,0.0,0.00232
330485,330485.0,49329.0,497448.0,472500.0,130500.0,0.0,0.0,0.0,0.0,0.0,...,100.066966,100.072858,100.079022,0.0,0.005439,100.048384,100.051927,100.055862,0.0,0.00232
296732,296732.0,38331.0,1078200.0,900000.0,90000.0,0.0,0.0,0.0,0.0,0.0,...,100.066966,100.072858,100.079022,0.0,0.005439,100.048384,100.051927,100.055862,0.0,0.00232
82038,82038.0,18990.0,720000.0,720000.0,90000.0,0.0,0.0,0.0,0.0,0.0,...,100.066966,100.072858,100.079022,15.407835,0.005439,100.048384,100.051927,100.055862,13.746639,0.00232


In [3]:
train = pd.read_csv('../data/application_train.csv')
test = pd.read_csv('../data/application_test.csv')

In [4]:
train['OCCUPATION_TYPE'].value_counts()

Laborers                 55186
Sales staff              32102
Core staff               27570
Managers                 21371
Drivers                  18603
High skill tech staff    11380
Accountants               9813
Medicine staff            8537
Security staff            6721
Cooking staff             5946
Cleaning staff            4653
Private service staff     2652
Low-skill Laborers        2093
Waiters/barmen staff      1348
Secretaries               1305
Realty agents              751
HR staff                   563
IT staff                   526
Name: OCCUPATION_TYPE, dtype: int64

In [5]:
test['OCCUPATION_TYPE'].value_counts()

Laborers                 8655
Sales staff              5072
Core staff               4361
Managers                 3574
Drivers                  2773
High skill tech staff    1854
Accountants              1628
Medicine staff           1316
Security staff            915
Cooking staff             894
Cleaning staff            656
Private service staff     455
Low-skill Laborers        272
Secretaries               213
Waiters/barmen staff      178
Realty agents             138
HR staff                  104
IT staff                   81
Name: OCCUPATION_TYPE, dtype: int64

In [6]:
train['ORGANIZATION_TYPE'].value_counts()

Business Entity Type 3    67992
XNA                       55374
Self-employed             38412
Other                     16683
Medicine                  11193
Business Entity Type 2    10553
Government                10404
School                     8893
Trade: type 7              7831
Kindergarten               6880
Construction               6721
Business Entity Type 1     5984
Transport: type 4          5398
Trade: type 3              3492
Industry: type 9           3368
Industry: type 3           3278
Security                   3247
Housing                    2958
Industry: type 11          2704
Military                   2634
Bank                       2507
Agriculture                2454
Police                     2341
Transport: type 2          2204
Postal                     2157
Security Ministries        1974
Trade: type 2              1900
Restaurant                 1811
Services                   1575
University                 1327
Industry: type 7           1307
Transpor

In [7]:
test['ORGANIZATION_TYPE'].value_counts()

Business Entity Type 3    10840
XNA                        9274
Self-employed              5920
Other                      2707
Medicine                   1716
Government                 1508
Business Entity Type 2     1479
Trade: type 7              1303
School                     1287
Construction               1039
Kindergarten               1038
Business Entity Type 1      887
Transport: type 4           884
Trade: type 3               578
Military                    530
Industry: type 9            499
Industry: type 3            489
Security                    472
Transport: type 2           448
Police                      441
Housing                     435
Industry: type 11           416
Bank                        374
Security Ministries         341
Services                    302
Postal                      294
Agriculture                 292
Restaurant                  284
Trade: type 2               242
University                  221
Industry: type 7            217
Industry

In [9]:
def missing_values(DataFrame):
    print('The following features are missing values: \n')
    for column in DataFrame.columns:
#         if DataFrame[column].isna().value_counts()[0] < len(DataFrame):
        no_missing = (np.shape(DataFrame)[0] - DataFrame[column].isna().value_counts()[0])/np.shape(DataFrame)[0]
        print('{} missing value counts: {:.2f}% of column missing'.format(column, no_missing*100))

In [10]:
missing_values(train)

The following features are missing values: 

SK_ID_CURR missing value counts: 0.00% of column missing
TARGET missing value counts: 0.00% of column missing
NAME_CONTRACT_TYPE missing value counts: 0.00% of column missing
CODE_GENDER missing value counts: 0.00% of column missing
FLAG_OWN_CAR missing value counts: 0.00% of column missing
FLAG_OWN_REALTY missing value counts: 0.00% of column missing
CNT_CHILDREN missing value counts: 0.00% of column missing
AMT_INCOME_TOTAL missing value counts: 0.00% of column missing
AMT_CREDIT missing value counts: 0.00% of column missing
AMT_ANNUITY missing value counts: 0.00% of column missing
AMT_GOODS_PRICE missing value counts: 0.09% of column missing
NAME_TYPE_SUITE missing value counts: 0.42% of column missing
NAME_INCOME_TYPE missing value counts: 0.00% of column missing
NAME_EDUCATION_TYPE missing value counts: 0.00% of column missing
NAME_FAMILY_STATUS missing value counts: 0.00% of column missing
NAME_HOUSING_TYPE missing value counts: 0.00% 

In [21]:
train[['ORGANIZATION_TYPE', 'OCCUPATION_TYPE', 'SK_ID_CURR', 'DAYS_EMPLOYED']][(train['ORGANIZATION_TYPE']=='XNA')]

Unnamed: 0,ORGANIZATION_TYPE,OCCUPATION_TYPE,SK_ID_CURR,DAYS_EMPLOYED
8,XNA,,100011,365243
11,XNA,,100015,365243
23,XNA,,100027,365243
38,XNA,,100045,365243
43,XNA,,100050,365243
46,XNA,,100053,365243
54,XNA,,100062,365243
56,XNA,,100064,365243
62,XNA,,100073,365243
79,XNA,,100094,365243


In [22]:
train[(train['ORGANIZATION_TYPE']=='XNA')]

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
8,100011,0,Cash loans,F,N,Y,0,112500.000,1019610.0,33826.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
11,100015,0,Cash loans,F,N,Y,0,38419.155,148365.0,10678.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
23,100027,0,Cash loans,F,N,Y,0,83250.000,239850.0,23850.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
38,100045,0,Cash loans,F,N,Y,0,99000.000,247275.0,17338.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
43,100050,0,Cash loans,F,N,Y,0,108000.000,746280.0,42970.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
46,100053,0,Cash loans,F,N,Y,0,202500.000,305221.5,17649.0,...,0,0,0,0,0.0,0.0,0.0,0.0,2.0,4.0
54,100062,0,Cash loans,M,Y,N,0,81000.000,675000.0,32472.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
56,100064,0,Cash loans,F,N,N,0,67500.000,298728.0,15381.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
62,100073,0,Cash loans,M,Y,Y,0,324000.000,1130760.0,40189.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
79,100094,0,Cash loans,F,N,Y,0,108000.000,113760.0,5301.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
binary_features = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
                        'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
                        'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 
                        'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
                        'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6',
                        'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_11', 
                        'FLAG_DOCUMENT_18', 'CODE_GENDER']

for feature in binary_features:
    train[feature], uniques = pd.factorize(train[feature]) 

In [24]:
train

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,0,0,0,0,202500.000,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,1,0,1,0,270000.000,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,0,1,0,0,67500.000,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,1,0,0,0,135000.000,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,0,0,0,0,121500.000,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
5,100008,0,Cash loans,0,0,0,0,99000.000,490495.5,27517.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
6,100009,0,Cash loans,1,1,0,1,171000.000,1560726.0,41301.0,...,0,0,0,0,0.0,0.0,0.0,1.0,1.0,2.0
7,100010,0,Cash loans,0,1,0,0,360000.000,1530000.0,42075.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
8,100011,0,Cash loans,1,0,0,0,112500.000,1019610.0,33826.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
9,100012,0,Revolving loans,0,0,0,0,135000.000,405000.0,20250.0,...,0,0,0,0,,,,,,


In [26]:
prev = pd.read_csv('../data/previous_application.csv')

In [29]:
prev.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [27]:
train = train.merge(prev, on = 'SK_ID_CURR', how = 'left')

In [28]:
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE_x,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,100002,1,Cash loans,0,0,0,0,202500.0,406597.5,24700.5,...,Auto technology,24.0,low_normal,POS other with interest,365243.0,-565.0,125.0,-25.0,-17.0,0.0
1,100003,0,Cash loans,1,0,1,0,270000.0,1293502.5,35698.5,...,XNA,12.0,low_normal,Cash X-Sell: low,365243.0,-716.0,-386.0,-536.0,-527.0,1.0
2,100003,0,Cash loans,1,0,1,0,270000.0,1293502.5,35698.5,...,Furniture,6.0,middle,POS industry with interest,365243.0,-797.0,-647.0,-647.0,-639.0,0.0
3,100003,0,Cash loans,1,0,1,0,270000.0,1293502.5,35698.5,...,Consumer electronics,12.0,middle,POS household with interest,365243.0,-2310.0,-1980.0,-1980.0,-1976.0,1.0
4,100004,0,Revolving loans,0,1,0,0,67500.0,135000.0,6750.0,...,Connectivity,4.0,middle,POS mobile without interest,365243.0,-784.0,-694.0,-724.0,-714.0,0.0


In [31]:
list(train.columns)

['SK_ID_CURR',
 'TARGET',
 'NAME_CONTRACT_TYPE_x',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT_x',
 'AMT_ANNUITY_x',
 'AMT_GOODS_PRICE_x',
 'NAME_TYPE_SUITE_x',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'OCCUPATION_TYPE',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'WEEKDAY_APPR_PROCESS_START_x',
 'HOUR_APPR_PROCESS_START_x',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'ORGANIZATION_TYPE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGIN