## Train a Baseline Model with NN

In [28]:
from plotly.offline import init_notebook_mode, iplot
from wordcloud import WordCloud
import plotly.graph_objs as go
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.plotly as py
from plotly import tools
from datetime import date
import pandas as pd
import numpy as np 
import seaborn as sns
import random 
import math
import warnings
import time
import sys
warnings.filterwarnings("ignore")
init_notebook_mode(connected=True)

pd.set_option("display.max_rows",1001)
pd.set_option("display.max_columns",1001)

path = "./input/"

In [29]:
def timer(start,end):
    m, s = divmod(end-start, 60)
    h, m = divmod(m, 60)
    print("time elapsed: %d:%02d:%02d" % (h, m, s))

In [30]:
app_train = pd.read_csv(path + "application_train.csv")
bureau = pd.read_csv(path + "bureau.csv")
bureau_balance = pd.read_csv(path + "bureau_balance.csv")
credit_card_balance = pd.read_csv(path + "credit_card_balance.csv")
pcb = pd.read_csv(path + "POS_CASH_balance.csv")
previous_application = pd.read_csv(path + "previous_application.csv")
installments_payments = pd.read_csv(path + "installments_payments.csv")

### Dataset Preparation

In [4]:
from sklearn.model_selection import train_test_split 
import lightgbm as lgb

# read the test files 
app_test = pd.read_csv(path + 'application_test.csv')

app_test['is_train'] = 0
app_train['is_train'] = 1

# target variable
Y_train = app_train['TARGET']
trainX = app_train.drop(['TARGET'], axis = 1)

# test ID
test_id = app_test['SK_ID_CURR']
testX = app_test

# merge train and test datasets for preprocessing
data = pd.concat([trainX, testX], axis=0)

### Handelling Categorical Features

In [5]:
# function to obtain Categorical Features
def _get_categorical_features(df):
    feats = [col for col in list(df.columns) if df[col].dtype == 'object']
    return feats

# function to factorize categorical features
def _factorize_categoricals(df, cats):
    for col in cats:
        df[col], _ = pd.factorize(df[col])
    return df 

# function to create dummy variables of categorical features
def _get_dummies(df, cats):
    for col in cats:
        df = pd.concat([df, pd.get_dummies(df[col], prefix=col)], axis=1)
    return df 

# get categorical features
data_cats = _get_categorical_features(data)
prev_app_cats = _get_categorical_features(previous_application)
bureau_cats = _get_categorical_features(bureau)
pcb_cats = _get_categorical_features(pcb)
ccbal_cats = _get_categorical_features(credit_card_balance)

# create additional dummy features - 
previous_application = _get_dummies(previous_application, prev_app_cats)
bureau = _get_dummies(bureau, bureau_cats)
pcb = _get_dummies(pcb, pcb_cats)
credit_card_balance = _get_dummies(credit_card_balance, ccbal_cats)

# factorize the categorical features from train and test data
data = _factorize_categoricals(data, data_cats)

### Feature Engineering

#### Feature Engineering - Previous Applications

In [6]:
## More Feature Ideas Reference : https://www.kaggle.com/ogrellier/good-fun-with-ligthgbm 

## count the number of previous applications for a given ID
prev_apps_count = previous_application[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
previous_application['SK_ID_PREV'] = previous_application['SK_ID_CURR'].map(prev_apps_count['SK_ID_PREV'])

## Average values for all other features in previous applications
prev_apps_avg = previous_application.groupby('SK_ID_CURR').mean()
prev_apps_avg.columns = ['p_' + col for col in prev_apps_avg.columns]
data = data.merge(right=prev_apps_avg.reset_index(), how='left', on='SK_ID_CURR')

#### Feature Engineering - Bureau Data

In [7]:
# Average Values for all bureau features 
bureau_avg = bureau.groupby('SK_ID_CURR').mean()
bureau_avg['buro_count'] = bureau[['SK_ID_BUREAU','SK_ID_CURR']].groupby('SK_ID_CURR').count()['SK_ID_BUREAU']
bureau_avg.columns = ['b_' + f_ for f_ in bureau_avg.columns]
data = data.merge(right=bureau_avg.reset_index(), how='left', on='SK_ID_CURR')

#### Feature Engineering - Previous Installments

In [8]:
## count the number of previous installments
cnt_inst = installments_payments[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
installments_payments['SK_ID_PREV'] = installments_payments['SK_ID_CURR'].map(cnt_inst['SK_ID_PREV'])

## Average values for all other variables in installments payments
avg_inst = installments_payments.groupby('SK_ID_CURR').mean()
avg_inst.columns = ['i_' + f_ for f_ in avg_inst.columns]
data = data.merge(right=avg_inst.reset_index(), how='left', on='SK_ID_CURR')

#### Feature Engineering - Pos Cash Balance

In [9]:
### count the number of pos cash for a given ID
pcb_count = pcb[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
pcb['SK_ID_PREV'] = pcb['SK_ID_CURR'].map(pcb_count['SK_ID_PREV'])

## Average Values for all other variables in pos cash
pcb_avg = pcb.groupby('SK_ID_CURR').mean()
data = data.merge(right=pcb_avg.reset_index(), how='left', on='SK_ID_CURR')

#### Feature Engineering - Credit Card Balance

In [10]:
### count the number of previous applications for a given ID
nb_prevs = credit_card_balance[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
credit_card_balance['SK_ID_PREV'] = credit_card_balance['SK_ID_CURR'].map(nb_prevs['SK_ID_PREV'])

### average of all other columns 
avg_cc_bal = credit_card_balance.groupby('SK_ID_CURR').mean()
avg_cc_bal.columns = ['cc_bal_' + f_ for f_ in avg_cc_bal.columns]
data = data.merge(right=avg_cc_bal.reset_index(), how='left', on='SK_ID_CURR')

### Prepare Final Train and Test data

#### Split data into train and test set

In [11]:
#### prepare final Train X and Test X dataframes 
ignore_features = ['SK_ID_CURR', 'is_train']
relevant_features = [col for col in data.columns if col not in ignore_features]
trainX = data[data['is_train'] == 1][relevant_features]
testX = data[data['is_train'] == 0][relevant_features]

#### Preprocess Train and Test Data : Impute None and Normalization 

In [12]:
def _preprocess(dtrain, dtest):
    print('Start Preprocessing', end='\t')
    bgn_time = time.time()

    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))
    
    end_time = time.time()
    timer(bgn_time, end_time)

    return dtrain, dtest

def _preprocess_log(dtrain, dtest):
    print('Start Preprocessing with Log Transformation', end='\t')
    bgn_time = time.time()

    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # log transform of min-zero columns
    dtrain_col_min = dtrain.min(axis=0)
    zero_min_index = dtrain_col_min[dtrain_col_min >= 0].index

    dtrain[zero_min_index] = np.log10(dtrain[zero_min_index] + 1.0)
    dtest[zero_min_index] = np.log10(dtest[zero_min_index] + 1.0)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    end_time = time.time()
    timer(bgn_time, end_time)

    return dtrain, dtest

most of the features are composed of null values

In [13]:
data.shape

(356255, 372)

In [14]:
np.sum(data.isnull().sum() > data.shape[1]*0.9)

307

In [15]:
# preprocessing with log transformation
# X_train, X_test = _preprocess_log(trainX, testX)

# preprocessing 
X_train, X_test = _preprocess(trainX, testX)

Start Preprocessing	time elapsed: 0:00:08


Column `p_NAME_GOODS_CATEGORY_House Construction` all None

In [46]:
X_train.drop(['p_NAME_GOODS_CATEGORY_House Construction'],axis=1, inplace=True)

### Build Layers of NN model

#### Build model using Keras

In [48]:
from sklearn.metrics import log_loss, roc_auc_score

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras import regularizers
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
from keras import metrics

Using TensorFlow backend.


In [49]:
def nn_create_model(x, reg):
    model = Sequential()
    model.add(Dense(128, input_dim=x.shape[1], activation='relu', kernel_regularizer=regularizers.l2(reg)))
    model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(reg)))
    model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(reg)))

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=[metrics.binary_accuracy])
    return model

In [50]:
def nn_run_model(model, dtrain, dtest, batch_size=64, nb_epochs=20, patience=5):
    if dtest:
        early_stop = EarlyStopping(monitor='val_loss', patience=patience, verbose=0, mode='auto')
        model.fit(dtrain[0], dtrain[1], batch_size=batch_size, epochs=nb_epochs,
                  callbacks=[early_stop], validation_data=dtest, verbose=2)
        y_train_pred, y_test_pred = model.predict(dtrain[0]), model.predict(dtest[0])
        y_train_loss, y_test_loss = log_loss(dtrain[1], y_train_pred), log_loss(dtest[1], y_test_pred)        
        y_train_auc, y_test_auc = roc_auc_score(dtrain[1], y_train_pred), roc_auc_score(dtest[1], y_test_pred)        
        return model, y_train_loss, y_test_loss, y_train_auc, y_test_auc
    else:
        model.fit(dtrain[0], dtrain[1], batch_size=batch_size, epochs=nb_epochs, verbose=2)
        y_train_pred = model.predict(dtrain[0])
        y_train_loss = log_loss(dtrain[1], y_train_pred)
        return model, y_train_loss

#### Train model with train-validation-test split

In [51]:
def nn_train_val_split(preprocess='linear', reg=0.01, batch_size=256, nb_epochs=50, patience=5):
    
    train_x, val_x, train_y, val_y = train_test_split(X_train, Y_train, test_size=0.2, random_state=18)
    
    bgn_time = time.time()
        
    clf = nn_create_model(train_x, reg)
    clf, train_loss, val_loss, train_auc, val_auc = nn_run_model(clf, (train_x, train_y), (val_x, val_y), batch_size, nb_epochs, patience)
        
    print("train_loss: {0:.6f}, val_loss: {1:.6f}, train_auc: {2:.6f}, val_auc:{3:.6f}".format(train_loss, val_loss, train_auc, val_auc), end="\t")
        
    end_time = time.time()
    timer(bgn_time, end_time)

#### Run NN model

In [52]:
nn_train_val_split(preprocess='linear')

Train on 246008 samples, validate on 61503 samples
Epoch 1/50
 - 6s - loss: 0.4482 - binary_accuracy: 0.9188 - val_loss: 0.2742 - val_binary_accuracy: 0.9200
Epoch 2/50
 - 6s - loss: 0.2740 - binary_accuracy: 0.9191 - val_loss: 0.2721 - val_binary_accuracy: 0.9200
Epoch 3/50
 - 6s - loss: 0.2737 - binary_accuracy: 0.9191 - val_loss: 0.2732 - val_binary_accuracy: 0.9200
Epoch 4/50
 - 6s - loss: 0.2736 - binary_accuracy: 0.9191 - val_loss: 0.2728 - val_binary_accuracy: 0.9200
Epoch 5/50
 - 6s - loss: 0.2733 - binary_accuracy: 0.9191 - val_loss: 0.2716 - val_binary_accuracy: 0.9200
Epoch 6/50
 - 6s - loss: 0.2728 - binary_accuracy: 0.9191 - val_loss: 0.2706 - val_binary_accuracy: 0.9200
Epoch 7/50
 - 6s - loss: 0.2723 - binary_accuracy: 0.9191 - val_loss: 0.2700 - val_binary_accuracy: 0.9200
Epoch 8/50
 - 6s - loss: 0.2720 - binary_accuracy: 0.9191 - val_loss: 0.2708 - val_binary_accuracy: 0.9200
Epoch 9/50
 - 6s - loss: 0.2720 - binary_accuracy: 0.9191 - val_loss: 0.2692 - val_binary_acc