In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
print(os.listdir("../input"))
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc
from matplotlib_venn import venn2
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import scipy
import itertools
from sklearn.preprocessing import MinMaxScaler, Imputer

In [None]:
from fastai.imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *
from  fastai.structured import *
from fastai.column_data import *

In [None]:
# Base names (without the .csv extension) of the input files to load.
# The order matters: a later cell unpacks the loaded frames positionally.
table_names = [
    'application_train',
    'bureau',
    'bureau_balance',
    'POS_CASH_balance',
    'credit_card_balance',
    'previous_application',
    'application_test',
    'installments_payments',
]

In [None]:
# Read every CSV listed in table_names into a DataFrame, preserving list order.
tables = [
    pd.read_csv(f'../input/{name}.csv', low_memory=False)
    for name in table_names
]

In [None]:
# Show the first rows of each loaded table, in the same order as table_names.
for table in tables:
    display(table.head())

In [None]:
# Bind each loaded frame to a name; the order must mirror table_names.
(
    application_train,
    bureau,
    bureau_balance,
    POS_CASH_balance,
    credit_card_balance,
    previous_application,
    application_test,
    installments_payments,
) = tables

In [None]:
# Summarise dtypes and memory footprint of the test application table.
application_test.info()

In [None]:
# Row counts of the train and test application tables.
application_train.shape[0], application_test.shape[0]

In [None]:
def join_df(left, right, left_on, right_on=None, suffix='_y'):
    """Left-join ``right`` onto ``left`` and return the merged frame.

    Parameters
    ----------
    left, right : DataFrame
        Frames to merge; every row of ``left`` is kept.
    left_on : label or list of labels
        Key column(s) in ``left``.
    right_on : label or list of labels, optional
        Key column(s) in ``right``. Falls back to ``left_on`` when None.
    suffix : str
        Appended to columns of ``right`` whose names clash with ``left``;
        the clashing ``left`` columns keep their original names.
    """
    right_key = left_on if right_on is None else right_on
    merge_kwargs = dict(
        how='left',
        left_on=left_on,
        right_on=right_key,
        suffixes=("", suffix),
    )
    return left.merge(right, **merge_kwargs)

In [None]:
gc.collect()

# Attach the bureau_balance rows to their bureau record; the left join keeps
# every bureau row, including those without any balance history.
bureau = join_df(bureau, bureau_balance, 'SK_ID_BUREAU')

# A later cell still reads bureau_balance['SK_ID_BUREAU'] for a Venn diagram,
# so deleting the name here makes Restart & Run All fail with a NameError.
# Keep only the distinct keys under that name instead of the full table.
# NOTE(review): the `tables` list also references the original frame, so its
# memory is only reclaimed once that list is released as well.
bureau_balance = bureau_balance[['SK_ID_BUREAU']].drop_duplicates()
gc.collect()

In [None]:
# Overlap of SK_ID_BUREAU values between bureau and bureau_balance.
plt.figure(figsize=(10, 7))
venn2(
    subsets=[
        set(bureau['SK_ID_BUREAU'].unique()),
        set(bureau_balance['SK_ID_BUREAU'].unique()),
    ],
    set_labels=('bureau', 'bureau_balance'),
)
plt.title("SK_ID_BUREAU in bureau and bureau_balance", fontsize=15)
plt.show()

# Begin application_train

In [None]:
# Overlap of SK_ID_CURR values between bureau and application_train.
plt.figure(figsize=(10, 7))
venn2(
    subsets=[
        set(bureau['SK_ID_CURR'].unique()),
        set(application_train['SK_ID_CURR'].unique()),
    ],
    set_labels=('bureau', 'application_train'),
)
plt.title("SK_ID_CURR in bureau and application_train", fontsize=15)
plt.show()

In [None]:
# Overlap of SK_ID_CURR values between application_test and application_train.
plt.figure(figsize=(10, 7))
venn2(
    subsets=[
        set(application_test['SK_ID_CURR'].unique()),
        set(application_train['SK_ID_CURR'].unique()),
    ],
    set_labels=('application_test', 'application_train'),
)
plt.title("SK_ID_CURR in application_test and application_train", fontsize=15)
plt.show()

In [None]:
# Overlap of SK_ID_CURR values between previous_application and application_train.
plt.figure(figsize=(10, 7))
venn2(
    subsets=[
        set(previous_application['SK_ID_CURR'].unique()),
        set(application_train['SK_ID_CURR'].unique()),
    ],
    set_labels=('previous_application', 'application_train'),
)
plt.title("SK_ID_CURR in previous_application and application_train", fontsize=15)
plt.show()

In [None]:
# Overlap of SK_ID_CURR values between POS_CASH_balance and application_train.
plt.figure(figsize=(10, 7))
venn2(
    subsets=[
        set(POS_CASH_balance['SK_ID_CURR'].unique()),
        set(application_train['SK_ID_CURR'].unique()),
    ],
    set_labels=('POS_CASH_balance', 'application_train'),
)
plt.title("SK_ID_CURR in POS_CASH_balance and application_train", fontsize=15)
plt.show()

In [None]:
# Overlap of SK_ID_CURR values between installments_payments and application_train.
plt.figure(figsize=(10, 7))
venn2(
    subsets=[
        set(installments_payments['SK_ID_CURR'].unique()),
        set(application_train['SK_ID_CURR'].unique()),
    ],
    set_labels=('installments_payments', 'application_train'),
)
plt.title("SK_ID_CURR in installments_payments and application_train", fontsize=15)
plt.show()

In [None]:
# Overlap of SK_ID_CURR values between credit_card_balance and application_train.
plt.figure(figsize=(10, 7))
venn2(
    subsets=[
        set(credit_card_balance['SK_ID_CURR'].unique()),
        set(application_train['SK_ID_CURR'].unique()),
    ],
    set_labels=('credit_card_balance', 'application_train'),
)
plt.title("SK_ID_CURR in credit_card_balance and application_train", fontsize=15)
plt.show()

# End application_train

# Begin previous_application

In [None]:
# Overlap of SK_ID_PREV values between previous_application and POS_CASH_balance.
plt.figure(figsize=(10, 7))
venn2(
    subsets=[
        set(previous_application['SK_ID_PREV'].unique()),
        set(POS_CASH_balance['SK_ID_PREV'].unique()),
    ],
    set_labels=('previous_application', 'POS_CASH_balance'),
)
plt.title("SK_ID_PREV in previous_application and POS_CASH_balance", fontsize=15)
plt.show()

In [None]:
# Overlap of SK_ID_PREV values between previous_application and installments_payments.
plt.figure(figsize=(10, 7))
venn2(
    subsets=[
        set(previous_application['SK_ID_PREV'].unique()),
        set(installments_payments['SK_ID_PREV'].unique()),
    ],
    set_labels=('previous_application', 'installments_payments'),
)
plt.title("SK_ID_PREV in previous_application and installments_payments", fontsize=15)
plt.show()

In [None]:
# Overlap of SK_ID_PREV values between previous_application and credit_card_balance.
plt.figure(figsize=(10, 7))
venn2(
    subsets=[
        set(previous_application['SK_ID_PREV'].unique()),
        set(credit_card_balance['SK_ID_PREV'].unique()),
    ],
    set_labels=('previous_application', 'credit_card_balance'),
)
plt.title("SK_ID_PREV in previous_application and credit_card_balance", fontsize=15)
plt.show()

In [None]:
# Force a garbage-collection pass; the returned count of unreachable objects is shown as cell output.
gc.collect()

In [None]:
# Summarise dtypes and memory footprint of the training table before encoding.
application_train.info()

In [None]:
# Total number of distinct values (as reported by Series.unique) summed over
# all object-dtype columns of the training table.
cat_len = sum(
    len(application_train[name].unique())
    for name in application_train.columns
    if application_train[name].dtype == 'object'
)
print(cat_len)

In [None]:
# One-hot encode both application tables; dummy_na=True adds an indicator
# column for missing values as well.
application_train, application_test = (
    pd.get_dummies(frame, dummy_na=True)
    for frame in (application_train, application_test)
)

In [None]:
# Save the label first: the inner alignment below keeps only the columns that
# exist in both frames, and TARGET is re-attached to the training frame afterwards.
target_var = application_train['TARGET']

application_train, application_test = application_train.align(
    application_test,
    join='inner',
    axis=1,
)

application_train['TARGET'] = target_var

print('Training Features shape: ', application_train.shape)
print('Testing Features shape: ', application_test.shape)

In [None]:
# Re-inspect the training table after one-hot encoding and column alignment.
application_train.info()

In [None]:
# Keep the identifiers and the training labels aside.
train_ids, test_ids = application_train['SK_ID_CURR'], application_test['SK_ID_CURR']
labels = application_train['TARGET']

# Feature frames: everything except the id column (and, for train, the target).
train_data = application_train.drop(columns=['SK_ID_CURR', 'TARGET'])
test_data = application_test.drop(columns=['SK_ID_CURR'])

# Total count of missing cells in each feature frame.
print(
    'Missing data (train, test):\t',
    (train_data.isna().sum().sum(), test_data.isna().sum().sum()),
)

In [None]:
# Median-impute missing values, then rescale every feature into [0, 1].
# Both transformers are fitted on the training data only and then applied
# unchanged to the test data.
imputer = Imputer(strategy='median')
scaler = MinMaxScaler(feature_range=(0, 1))

# Imputation: fit on train, reuse the fitted imputer for test.
train_data = imputer.fit_transform(train_data)
test_data = imputer.transform(test_data)

# Scaling: same pattern.
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

print('Training data shape: ', train_data.shape)
print('Testing data shape: ', test_data.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression baseline with a fixed seed; fitted on the preprocessed
# training matrix against the saved TARGET column.
log_reg = LogisticRegression(
    C=0.001,
    random_state=42,
)

log_reg.fit(train_data, target_var)

In [None]:
# Keep column 1 of the probability matrix, i.e. the probability of log_reg.classes_[1]
# (presumably TARGET == 1 — confirm against log_reg.classes_).
logistic_preds = log_reg.predict_proba(test_data)[:, 1]

In [None]:
# Build the output frame (id + predicted probability).
# .copy() makes the frame independent of application_test, so adding the
# TARGET column below does not raise SettingWithCopyWarning.
log_reg_baseline = application_test[['SK_ID_CURR']].copy()
log_reg_baseline['TARGET'] = logistic_preds

log_reg_baseline.head()

In [None]:
# Write the SK_ID_CURR / TARGET frame to CSV in the working directory, without the index column.
log_reg_baseline.to_csv('log_reg_baseline.csv', index = False)

In [None]:
# Split the row positions of train_data 80/20 into training and validation
# index arrays (seeded for reproducibility).
train_index, valid_index = train_test_split(
    np.arange(len(train_data)),
    test_size=0.2,
    random_state=42,
)

In [None]:
# train_index / valid_index are row positions (built from np.arange), so select
# the labels positionally. Label-based .loc only gives the same rows while
# application_train happens to carry a default 0..n-1 index.
y_train = application_train['TARGET'].iloc[train_index]
y_valid = application_train['TARGET'].iloc[valid_index]

In [None]:
# Select the training / validation rows of the preprocessed feature matrix.
x_train, x_valid = train_data[train_index], train_data[valid_index]

In [None]:
# Wrap the split matrices and labels as LightGBM datasets.
dtrain = lgb.Dataset(data=x_train, label=y_train)
dvalid = lgb.Dataset(data=x_valid, label=y_valid)

In [None]:
# Upper bound on boosting rounds and the early-stopping patience shared by
# every lgb.train call in this notebook.
rounds = 16000
early_stop_rounds = 500

# Hyper-parameters of the first LightGBM model.
params = dict(
    objective='binary',
    metric='auc',
    num_leaves=32,
    max_depth=15,
    learning_rate=0.02,
    feature_fraction=0.6,
    verbosity=-1,
)

In [None]:
# Train the first LightGBM model with early stopping, logging every 500 rounds.
# evals_result used to be created but never handed to lgb.train, so it stayed
# empty; passing it in records the per-round metric history for both
# 'train' and 'valid'.
evals_result = {}
model = lgb.train(params, dtrain,
                  valid_sets=[dtrain, dvalid],
                  valid_names=['train', 'valid'],
                  num_boost_round=rounds,
                  early_stopping_rounds=early_stop_rounds,
                  evals_result=evals_result,
                  verbose_eval=500)

In [None]:
# Force a garbage-collection pass after training; the returned count is shown as cell output.
gc.collect()

In [None]:
# Predict with the early-stopped best iteration explicitly. Whether predict()
# defaults to the best iteration or to all trained trees depends on the
# LightGBM version. best_iteration is 0 when early stopping never fired,
# which predict() treats as "use all trees".
lgb_predictions = model.predict(test_data, num_iteration=model.best_iteration)

In [None]:
# Build the output frame (id + predicted probability).
# .copy() makes the frame independent of application_test, so adding the
# TARGET column below does not raise SettingWithCopyWarning.
lgb_baseline = application_test[['SK_ID_CURR']].copy()
lgb_baseline['TARGET'] = lgb_predictions

lgb_baseline.head()

In [None]:
# Write the SK_ID_CURR / TARGET frame to CSV in the working directory, without the index column.
lgb_baseline.to_csv('lgb_baseline.csv', index = False)

In [None]:
# Hyper-parameters of the second LightGBM model.
# NOTE(review): 'n_estimators' is listed here while lgb.train is also called
# with num_boost_round=rounds; which of the two wins may depend on the
# LightGBM version — confirm the intended number of rounds.
best_random_params = {
    'is_unbalance': True,
    'n_estimators': 2673,
    'num_leaves': 77,
    'learning_rate': 0.007641070180129345,
    'min_child_samples': 460,
    'boosting_type': 'gbdt',
    'subsample_for_bin': 240000,
    'reg_lambda': 0.2040816326530612,
    'reg_alpha': 0.8775510204081632,
    'subsample': 0.9494949494949496,
    'colsample_bytree': 0.7333333333333333,
    'objective': 'binary',
    'metric': 'auc',
}

In [None]:
# Train the second LightGBM model with early stopping, logging every 500 rounds.
# evals_result used to be created but never handed to lgb.train, so it stayed
# empty; passing it in records the per-round metric history for both
# 'train' and 'valid'.
evals_result = {}
best_random_model = lgb.train(best_random_params, dtrain,
                  valid_sets=[dtrain, dvalid],
                  valid_names=['train', 'valid'],
                  num_boost_round=rounds,
                  early_stopping_rounds=early_stop_rounds,
                  evals_result=evals_result,
                  verbose_eval=500)

In [None]:
# Predict with the early-stopped best iteration explicitly. Whether predict()
# defaults to the best iteration or to all trained trees depends on the
# LightGBM version. best_iteration is 0 when early stopping never fired,
# which predict() treats as "use all trees".
best_random_preds = best_random_model.predict(
    test_data, num_iteration=best_random_model.best_iteration
)
# .copy() makes the frame independent of application_test, so adding the
# TARGET column below does not raise SettingWithCopyWarning.
best_random_baseline = application_test[['SK_ID_CURR']].copy()
best_random_baseline['TARGET'] = best_random_preds

best_random_baseline.head()

In [None]:
# Write the SK_ID_CURR / TARGET frame to CSV in the working directory, without the index column.
best_random_baseline.to_csv('best_random_baseline.csv', index = False)

In [None]:
# Equal-weight average of the two LightGBM prediction vectors.
# .copy() makes the frame independent of application_test, so adding the
# TARGET column below does not raise SettingWithCopyWarning.
blend_rand_lgb = application_test[['SK_ID_CURR']].copy()
blend_rand_lgb['TARGET'] = (best_random_preds + lgb_predictions)/2

blend_rand_lgb.head()

In [None]:
# Write the SK_ID_CURR / TARGET frame to CSV in the working directory, without the index column.
blend_rand_lgb.to_csv('blend_rand_lgb.csv', index = False)

In [None]:
# Equal-weight average of the two LightGBM models and the logistic regression.
# .copy() makes the frame independent of application_test, so adding the
# TARGET column below does not raise SettingWithCopyWarning.
blend_3 = application_test[['SK_ID_CURR']].copy()
blend_3['TARGET'] = (best_random_preds + lgb_predictions + logistic_preds)/3

blend_3.head()

In [None]:
# Write the SK_ID_CURR / TARGET frame to CSV in the working directory, without the index column.
blend_3.to_csv('blend_3.csv', index = False)

In [None]:
# Hyper-parameters of the third LightGBM model.
# NOTE(review): 'n_estimators' is listed here while lgb.train is also called
# with num_boost_round=rounds; which of the two wins may depend on the
# LightGBM version — confirm the intended number of rounds.
best_bayes_params = {
    'is_unbalance': True,
    'n_estimators': 1327,
    'num_leaves': 106,
    'learning_rate': 0.0126346500398102,
    'min_child_samples': 390,
    'boosting_type': 'gbdt',
    'subsample_for_bin': 80000,
    'reg_lambda': 0.38268769901820565,
    'reg_alpha': 0.5129992714397862,
    'subsample': 0.7177561548329953,
    'colsample_bytree': 0.6149378064887835,
    'objective': 'binary',
    'metric': 'auc',
}

In [None]:
# Train the third LightGBM model with the same validation setup and
# early-stopping patience as the other models; progress is logged every 500 rounds.
best_bayes_model = lgb.train(
    best_bayes_params,
    dtrain,
    num_boost_round=rounds,
    valid_sets=[dtrain, dvalid],
    valid_names=['train', 'valid'],
    early_stopping_rounds=early_stop_rounds,
    verbose_eval=500,
)

In [None]:
# Predict with the early-stopped best iteration explicitly. Whether predict()
# defaults to the best iteration or to all trained trees depends on the
# LightGBM version. best_iteration is 0 when early stopping never fired,
# which predict() treats as "use all trees".
best_bayes_preds = best_bayes_model.predict(
    test_data, num_iteration=best_bayes_model.best_iteration
)
# .copy() makes the frame independent of application_test, so adding the
# TARGET column below does not raise SettingWithCopyWarning.
best_bayes_baseline = application_test[['SK_ID_CURR']].copy()
best_bayes_baseline['TARGET'] = best_bayes_preds

best_bayes_baseline.head()

In [None]:
# Write the SK_ID_CURR / TARGET frame to CSV in the working directory, without the index column.
best_bayes_baseline.to_csv('best_bayes_baseline.csv', index = False)

In [None]:
# Equal-weight average of the second and third LightGBM models' predictions.
# .copy() makes the frame independent of application_test, so adding the
# TARGET column below does not raise SettingWithCopyWarning.
blend_rand_bayes = application_test[['SK_ID_CURR']].copy()
blend_rand_bayes['TARGET'] = (best_random_preds + best_bayes_preds)/2

blend_rand_bayes.head()

In [None]:
# Write the SK_ID_CURR / TARGET frame to CSV in the working directory, without the index column.
blend_rand_bayes.to_csv('blend_rand_bayes.csv', index = False)

In [None]:
# Render a link to the CSV written by the previous cell.
# NOTE(review): FileLink is not imported by name in this notebook; presumably it comes in via the fastai star imports — confirm.
FileLink('blend_rand_bayes.csv')