In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.decomposition import PCA 
import featuretools as ft
# Regression Metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
# Classification metrics
from sklearn.metrics import roc_curve, roc_auc_score
# Keras
import tensorflow as tf
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Activation
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Activation, BatchNormalization
from tensorflow.keras.metrics import AUC
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
# Ignore ConvergenceWarning messages
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter(action='ignore', category=ConvergenceWarning)

In [3]:
# smaller datasets for automated features
ft        = pd.read_csv('train_ft.csv')
train     = pd.read_csv('application_train.csv')
bureau    = pd.read_csv('bureau.csv')
bureaubal = pd.read_csv('bureau_balance.csv')
prev      = pd.read_csv('previous_application.csv')
ccb       = pd.read_csv('credit_card_balance.csv')
insta     = pd.read_csv('installments_payments.csv')
pc        = pd.read_csv('POS_CASH_balance.csv')

In [4]:
ids = ft['SK_ID_CURR'].values
train = train.loc[train['SK_ID_CURR'].isin(ids)]
train

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
29,100034,0,Revolving loans,M,N,Y,0,90000.0,180000.0,9000.0,...,0,0,0,0,,,,,,
35,100041,0,Cash loans,F,N,N,0,112500.0,450000.0,44509.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
38,100045,0,Cash loans,F,N,Y,0,99000.0,247275.0,17338.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
50,100058,0,Revolving loans,F,N,Y,0,54000.0,135000.0,6750.0,...,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307463,456203,0,Cash loans,F,N,Y,0,180000.0,247275.0,17338.5,...,0,0,0,0,0.0,0.0,0.0,0.0,2.0,0.0
307469,456209,0,Cash loans,F,N,Y,0,202500.0,703728.0,29943.0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
307489,456233,1,Cash loans,F,N,Y,0,225000.0,521280.0,23089.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0
307495,456239,0,Cash loans,M,Y,N,0,180000.0,808650.0,23773.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
ids = train['SK_ID_CURR'].values
bureau = bureau.loc[bureau['SK_ID_CURR'].isin(ids)]
idsb = bureau['SK_ID_BUREAU'].values
bureaubal = bureaubal.loc[bureaubal['SK_ID_BUREAU'].isin(idsb)]
prev = prev.loc[prev['SK_ID_CURR'].isin(ids)]
ccb  = ccb.loc[ccb['SK_ID_CURR'].isin(ids)]
insta = insta.loc[insta['SK_ID_CURR'].isin(ids)]
pc   = pc.loc[pc['SK_ID_CURR'].isin(ids)]
print('bureau shape: ', bureau.shape)
print('bureaubal shape: ', bureaubal.shape)
print('prev shape: ', prev.shape)
print('ccb shape: ', ccb.shape)
print('insta shape: ', insta.shape)
print('pc shape: ', pc.shape)

bureau shape:  (91298, 17)
bureaubal shape:  (985077, 3)
prev shape:  (139110, 37)
ccb shape:  (90374, 23)
insta shape:  (1136337, 8)
pc shape:  (86056, 8)


In [6]:
tables = [train, bureau, bureaubal,prev,ccb, insta, pc ]
for features in tables:
    cols = features.select_dtypes(include = object).columns
    features[cols] = features[cols].fillna('Missing')
    cols = features.select_dtypes(exclude = object).columns
    features[cols] = features[cols].fillna(0)
    print(features.isnull().sum().sum())

0
0
0
0
0
0
0


In [7]:
train = pd.get_dummies(train, drop_first=True)
bureau = pd.get_dummies(bureau, drop_first= True)
prev = pd.get_dummies(prev, drop_first= True)

In [8]:
# a = group.get_group(lst[2])
# if a.shape[0] >= 5 :
#     a = a[:5]
# else:
#     m99 = np.ones((5-a.shape[0],a.shape[1]))*-99
#     m99 = pd.DataFrame(m99, columns=a.columns)
#     a = a.append(m99)
# a

In [9]:
idl = train['SK_ID_CURR'].values
bureau = bureau.sort_values('DAYS_CREDIT', ascending= False)
lst = bureau['SK_ID_CURR'].values
lst = list(set(lst))
lst.sort()
group = bureau.groupby('SK_ID_CURR')
b = []
j=0
for sk in idl:
    if sk in lst:
        a = group.get_group(lst[j])
        if a.shape[0] >= 5:
            a = a[:5]
        else:
            m99 = np.ones((5-a.shape[0],a.shape[1]))*-99
            m99 = pd.DataFrame(m99, columns=a.columns)
            a   = a.append(m99)
        a = a.drop(['SK_ID_CURR', 'SK_ID_BUREAU'], axis=1)    
        a = a.values.flatten().tolist()
        b.extend(a)
        j += 1
    else:
        m99 = np.ones((5,bureau.shape[1]))*-99
        m99 = pd.DataFrame(m99, columns=bureau.columns)
        m99 = m99.drop(['SK_ID_CURR', 'SK_ID_BUREAU'], axis=1)    
        m99 = m99.values.flatten().tolist()
        b.extend(m99)
b = np.array(b)
b = np.reshape(b,(train.shape[0], 5, bureau.shape[1]-2))

In [10]:
# prev = prev.sort_values('DAYS_DECISION', ascending= False)
# lst = prev['SK_ID_CURR'].values
# lst = list(set(lst))
# lst.sort()
# group = prev.groupby('SK_ID_CURR')
# p = []
# j = 0
# for sk in idl:
#     if sk in lst:
#         a = group.get_group(lst[j])
#         if a.shape[0] >= 5:
#             a = a[:5]
#         else:
#             m99 = np.ones((5-a.shape[0],a.shape[1]))*-99
#             m99 = pd.DataFrame(m99, columns=a.columns)
#             a   = a.append(m99)
#         a = a.drop(['SK_ID_PREV', 'SK_ID_CURR'], axis=1)    
#         a = a.values.flatten().tolist()
#         p.extend(a)
#         j += 1
#     else:
#         m99 = np.ones((5,prev.shape[1]))*-99
#         m99 = pd.DataFrame(m99, columns=prev.columns)
#         m99 = m99.drop(['SK_ID_PREV', 'SK_ID_CURR'], axis=1)    
#         m99 = m99.values.flatten().tolist()
#         p.extend(m99)
# p = np.array(p)
# p = np.reshape(p,(train.shape[0], 5, prev.shape[1]-2, 1))

In [16]:
print('shape of channel(bureau):', b.shape)
# print('shape of channel(previous_app):', p.shape)

shape of channel(bureau): (30000, 5, 28)


In [19]:
y = train['TARGET']
y = to_categorical(y,2)

In [24]:
batch_size = 256
epochs = 100
np.random.seed(5)        

model = Sequential()

#1st LSTM layer
model.add(LSTM(units = 50, input_shape=(b.shape[1],b.shape[2]), return_sequences= True))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(Dropout(0.2))

#2nd LSTM layer
model.add(LSTM(50, return_sequences= True))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Flatten())

#FC1
model.add(Dense(units= 128))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(Dropout(0.5))

#FC2
model.add(Dense(units= 100, name= 'RNN_feature_extract'))
model.add(BatchNormalization())
model.add(Activation("relu"))

#output FC
model.add(Dense(units= 2, activation='sigmoid'))
model.build()
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['AUC'])
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_7 (LSTM)                (None, 5, 50)             15800     
_________________________________________________________________
batch_normalization_5 (Batch (None, 5, 50)             200       
_________________________________________________________________
activation_5 (Activation)    (None, 5, 50)             0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 5, 50)             0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 5, 50)             20200     
_________________________________________________________________
batch_normalization_6 (Batch (None, 5, 50)             200       
_________________________________________________________________
activation_6 (Activation)    (None, 5, 50)            

In [25]:
early_stops = EarlyStopping(patience=5, monitor='val_AUC')
mc = ModelCheckpoint('best_model.h5',
                      monitor='val_loss', 
                      verbose=0, 
                      save_best_only=True)
model.fit(b, y, validation_split=0.05, callbacks=[early_stops, mc], batch_size= batch_size, epochs= epochs, verbose=1)

Train on 28500 samples, validate on 1500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


<tensorflow.python.keras.callbacks.History at 0x21fd31bd948>

In [26]:
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer('RNN_feature_extract').output)
intermediate_layer_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_7_input (InputLayer)    [(None, 5, 28)]           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 5, 50)             15800     
_________________________________________________________________
batch_normalization_5 (Batch (None, 5, 50)             200       
_________________________________________________________________
activation_5 (Activation)    (None, 5, 50)             0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 5, 50)             0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 5, 50)             20200     
_________________________________________________________________
batch_normalization_6 (Batch (None, 5, 50)             200   

In [27]:
#predict to get featured data
feauture_engg_data = intermediate_layer_model.predict(b)
feauture_engg_data = pd.DataFrame(feauture_engg_data)
print('feauture_engg_data shape:', feauture_engg_data.shape)

feauture_engg_data shape: (30000, 100)


In [28]:
# Renaming columns
new_col = []
for i in range(100):
    new_col.append('rnn_bfeat_%d'%(i+1))
feauture_engg_data.columns = new_col
feauture_engg_data.to_csv('100_RNN_feature.csv', index = False)
feauture_engg_data.head(5)  #The features are unnamed now

Unnamed: 0,rnn_bfeat_1,rnn_bfeat_2,rnn_bfeat_3,rnn_bfeat_4,rnn_bfeat_5,rnn_bfeat_6,rnn_bfeat_7,rnn_bfeat_8,rnn_bfeat_9,rnn_bfeat_10,...,rnn_bfeat_91,rnn_bfeat_92,rnn_bfeat_93,rnn_bfeat_94,rnn_bfeat_95,rnn_bfeat_96,rnn_bfeat_97,rnn_bfeat_98,rnn_bfeat_99,rnn_bfeat_100
0,-3.697015,2.655141,2.388965,-0.518289,1.802971,-2.149947,2.263103,-1.621937,-0.776937,2.712237,...,-3.876725,1.954226,-3.292742,-1.287996,-4.222659,-1.312645,1.765459,-0.900458,-0.299305,0.753115
1,1.987094,-1.452901,-2.980918,0.478162,-3.565486,1.561377,-3.150491,1.609396,-1.797935,-1.942948,...,1.572855,-1.772052,1.885794,1.536217,1.776484,-0.051585,1.766789,2.414809,-0.589594,1.329472
2,1.987094,-1.452901,-2.980918,0.478162,-3.565486,1.561377,-3.150491,1.609396,-1.797935,-1.942948,...,1.572855,-1.772052,1.885794,1.536217,1.776484,-0.051585,1.766789,2.414809,-0.589594,1.329472
3,1.987094,-1.452901,-2.980918,0.478162,-3.565486,1.561377,-3.150491,1.609396,-1.797935,-1.942948,...,1.572855,-1.772052,1.885794,1.536217,1.776484,-0.051585,1.766789,2.414809,-0.589594,1.329472
4,-0.852382,0.673939,0.33804,0.00723,0.348816,-1.605453,0.784238,-0.321284,-0.232204,1.231886,...,-0.960384,0.293257,-0.48214,-0.295637,-1.14575,-0.295358,0.486143,0.56997,-0.251967,-0.007687
