# Downloading the data

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# The google.colab package only exists on Colab, so this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import os
# Point the Kaggle CLI at the Drive folder that is expected to hold the API
# credentials file (kaggle.json) -- presumably uploaded there beforehand.
os.environ['KAGGLE_CONFIG_DIR'] = '/content/gdrive/My Drive/Kaggle'

In [None]:
# Print the current working directory of the runtime (sanity check before downloading).
!pwd

In [None]:
# Download the Home Credit Default Risk competition archive with the Kaggle CLI.
! kaggle competitions download -c home-credit-default-risk

In [None]:
# Extract every downloaded archive; the escaped glob is passed to unzip
# unexpanded so unzip itself matches the archives. The archives are deleted
# only if extraction succeeded (&&).
!unzip \*.zip  && rm *.zip

# Loading the data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read each competition CSV from /content into its own DataFrame.
# The files are read one after another, in the order listed.
(
    pos_cash_balance,
    train,
    test,
    bureau,
    bureau_balance,
    cc_balance,
    ins_payment,
    prev_app,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/POS_CASH_balance.csv',
        '/content/application_train.csv',
        '/content/application_test.csv',
        '/content/bureau.csv',
        '/content/bureau_balance.csv',
        '/content/credit_card_balance.csv',
        '/content/installments_payments.csv',
        '/content/previous_application.csv',
    )
)

# Understanding the data

In [None]:
# Preview the first rows of the POS/cash balance table.
pos_cash_balance.head()

In [None]:
# Preview the first rows of the training applications table.
train.head()

In [None]:
# Preview the first rows of the bureau table.
bureau.head()

In [None]:
# Preview the first rows of the bureau balance table.
bureau_balance.head()

In [None]:
# Preview the first rows of the credit card balance table.
cc_balance.head()

In [None]:
# Preview the first rows of the installments payments table.
ins_payment.head()

In [None]:
# Preview the first rows of the previous applications table.
prev_app.head()

In [None]:
# (rows, columns) of every loaded table, shown as one tuple.
tuple(
    table.shape
    for table in (
        train,
        test,
        bureau,
        bureau_balance,
        cc_balance,
        ins_payment,
        prev_app,
        pos_cash_balance,
    )
)

In [None]:
# Stack train and test so later preprocessing is applied to both at once.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each frame's original row labels.
combine = pd.concat([train, test])
combine.shape

# Preprocessing Bureau data section

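The bureau balance records are first grouped by `SK_ID_BUREAU` and aggregated, and the result is merged into the bureau table. The bureau table is then grouped by `SK_ID_CURR`, and the per-applicant aggregates are collected into lists that form the new bureau dataframe.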

In [None]:
# Fraction of missing values per bureau_balance column. The denominator must
# be the row count of bureau_balance itself (it was previously bureau's).
bureau_balance.isnull().sum() / bureau_balance.shape[0]

In [None]:
# Collapse bureau_balance to one row per SK_ID_BUREAU:
#   MONTHS_BALANCE -> sum over the credit's monthly records
#   STATUS         -> sorted list of the distinct status codes seen
# The aggregates are computed through the groupby directly instead of calling
# get_group() twice per id, which re-slices the frame for every lookup.
bureau_balance_grouping = bureau_balance.groupby('SK_ID_BUREAU', sort=False)
# Ids in order of first appearance; this fixes the row order of new_bb.
bureau_ids = bureau_balance.drop_duplicates('SK_ID_BUREAU')['SK_ID_BUREAU'].tolist()

months_balance_by_id = bureau_balance_grouping['MONTHS_BALANCE'].sum()
status_by_id = bureau_balance_grouping['STATUS'].agg(lambda codes: np.unique(codes).tolist())

# reindex() pins both aggregates to the bureau_ids order explicitly.
months_balance_list = months_balance_by_id.reindex(bureau_ids).tolist()
status_list = status_by_id.reindex(bureau_ids).tolist()

bb_dict = {'SK_ID_BUREAU': bureau_ids,
           'MONTHS_BALANCE': months_balance_list,
           'STATUS': status_list}
new_bb = pd.DataFrame(bb_dict)
new_bb.head()

In [None]:
# Number of distinct SK_ID_BUREAU values found in bureau_balance.
len(bureau_ids)

In [None]:
# Attach the per-credit balance aggregates to bureau. The join key is inferred
# from the columns the two frames share, and the join is an inner one, so
# bureau rows without a match in new_bb are dropped.
bureau = bureau.merge(new_bb, how='inner')
bureau.head()

In [None]:
# Fraction of missing values in each bureau column.
bureau.isnull().sum() / bureau.shape[0]

In [None]:
# Summary statistics of bureau's numeric columns.
bureau.describe()

In [None]:
# Remove the two columns that are not carried forward, then replace every
# remaining missing value with 0.
bureau = (
    bureau
    .drop(columns=['AMT_CREDIT_MAX_OVERDUE', 'AMT_ANNUITY'])
    .fillna(0)
)

In [None]:
# Preview bureau after the column drop and the zero fill.
bureau.head()

In [None]:
# List the columns that remain in bureau.
bureau.columns

In [None]:
from matplotlib.cbook import flatten

# Collapse bureau to one row per applicant (SK_ID_CURR).
#   categorical columns -> sorted list of distinct values
#   SUM_COLUMNS         -> sum over the applicant's credits
#   MEAN_COLUMNS        -> mean over the applicant's credits
#   STATUS              -> distinct status codes across all of the applicant's
#                          credits, kept as a NumPy array (the later
#                          string-cleaning step splits its repr on spaces)
CATEGORICAL_COLUMNS = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']
SUM_COLUMNS = ['DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'DAYS_ENDDATE_FACT',
               'CNT_CREDIT_PROLONG', 'MONTHS_BALANCE']
MEAN_COLUMNS = ['DAYS_CREDIT_ENDDATE', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT',
                'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE',
                'DAYS_CREDIT_UPDATE']

# Column order of the resulting frame. AMT_CREDIT_SUM_LIMIT used to be
# aggregated and then silently left out of the output; it is now included.
OUTPUT_COLUMNS = ['SK_ID_CURR',
                  'CREDIT_ACTIVE',
                  'CREDIT_CURRENCY',
                  'DAYS_CREDIT',
                  'CREDIT_DAY_OVERDUE',
                  'DAYS_CREDIT_ENDDATE',
                  'DAYS_ENDDATE_FACT',
                  'CNT_CREDIT_PROLONG',
                  'AMT_CREDIT_SUM',
                  'AMT_CREDIT_SUM_DEBT',
                  'AMT_CREDIT_SUM_LIMIT',
                  'AMT_CREDIT_SUM_OVERDUE',
                  'CREDIT_TYPE',
                  'DAYS_CREDIT_UPDATE',
                  'MONTHS_BALANCE',
                  'STATUS']

bureau_grouping = bureau.groupby('SK_ID_CURR')
# Applicant ids in order of first appearance; this is the output row order.
sk_ids = bureau.drop_duplicates('SK_ID_CURR')['SK_ID_CURR'].tolist()

print('Unique IDs:', len(sk_ids))

bureau_dict = {column: [] for column in OUTPUT_COLUMNS}
for sk_id in sk_ids:
    # Fetch the applicant's rows once and reuse them for every aggregate
    # (previously get_group() was called once per aggregated column).
    group = bureau_grouping.get_group(sk_id)

    bureau_dict['SK_ID_CURR'].append(sk_id)
    for column in CATEGORICAL_COLUMNS:
        bureau_dict[column].append(np.unique(group[column]).tolist())
    for column in SUM_COLUMNS:
        bureau_dict[column].append(group[column].sum())
    for column in MEAN_COLUMNS:
        bureau_dict[column].append(group[column].mean())

    # Each STATUS cell is itself a list of codes; flatten before de-duplicating.
    statuslists = group['STATUS'].tolist()
    bureau_dict['STATUS'].append(np.unique(list(flatten(statuslists))))

new_bureau = pd.DataFrame(data=bureau_dict)
new_bureau.head()

In [None]:
# Turn each list-valued categorical column into one indicator column per
# distinct value. The list is rendered as text, the quote/bracket characters
# are stripped, and the remaining "a, b, c" string is split and counted.
# regex=False makes the replacements literal: "[" and "]" are regex
# metacharacters, and the default of `regex` differs between pandas versions.
for list_column in ('CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE'):
    as_text = new_bureau[list_column].astype('str')
    for unwanted_char in ("'", "]", "["):
        as_text = as_text.str.replace(unwanted_char, "", regex=False)

    # One row per applicant, one column per distinct value; absent values -> 0.
    indicators = as_text.apply(lambda x: pd.Series(x.split(', ')).value_counts()).fillna(0)

    new_bureau = pd.concat([new_bureau, indicators], axis=1)
    new_bureau = new_bureau.drop(list_column, axis=1)

In [None]:
# Preview new_bureau after the categorical columns were expanded.
new_bureau.head()

In [None]:
# Render each STATUS value as text and strip quotes and brackets, leaving the
# status codes separated by spaces. regex=False keeps "[" and "]" literal
# regardless of the pandas version's default for `regex`.
status_text = new_bureau['STATUS'].astype('str')
for unwanted_char in ("'", "]", "["):
    status_text = status_text.str.replace(unwanted_char, "", regex=False)
new_bureau['STATUS'] = status_text

In [None]:
# Split the space-separated STATUS text into one indicator column per code
# (missing codes become 0), append those columns, and drop the text column.
status_indicators = new_bureau.STATUS.apply(
    lambda status_text: pd.Series(status_text.split(' ')).value_counts()
).fillna(0)
new_bureau = pd.concat([new_bureau, status_indicators], axis=1).drop('STATUS', axis=1)

In [None]:
# Preview new_bureau after STATUS was expanded into indicator columns.
new_bureau.head()

In [None]:
# Persist the aggregated bureau features to the working directory.
# NOTE(review): the file name 'new_burean.csv' looks like a typo for
# 'new_bureau.csv' -- confirm nothing reads the current name before renaming.
new_bureau.to_csv('new_burean.csv', index=False)

# Credit card balance data preprocessing

In [None]:
# Fraction of missing values in each cc_balance column.
cc_balance.isnull().sum() / cc_balance.shape[0]

In [None]:
# Preview the first rows of the credit card balance table.
cc_balance.head()

In [None]:
# Number of distinct SK_ID_PREV values present in cc_balance.
len(cc_balance.drop_duplicates('SK_ID_PREV'))

In [None]:
# Aggregate cc_balance to one entry per applicant (SK_ID_CURR). The result
# lists are index-aligned with sk_prev_ids, which despite its name holds
# SK_ID_CURR values in order of first appearance.
cc_balance_grouping = cc_balance.groupby('SK_ID_CURR')
sk_prev_ids = cc_balance.drop_duplicates('SK_ID_CURR')['SK_ID_CURR'].tolist()

MONTHS_BALANCE_list = []
AMT_TOTAL_RECEIVABLE_list = []
CNT_CURRENT_list = []
CNT_INSTALMENT_MATURE_CUM_list = []
NAME_CONTRACT_STATUS_list = []
SK_DPD_list = []
SK_DPD_DEF_list = []

for sk_id in sk_prev_ids:
    # Fetch the applicant's rows once and reuse them for every aggregate
    # (previously get_group() was called once per aggregated column).
    group = cc_balance_grouping.get_group(sk_id)

    MONTHS_BALANCE_list.append(group['MONTHS_BALANCE'].sum())
    AMT_TOTAL_RECEIVABLE_list.append(group['AMT_TOTAL_RECEIVABLE'].mean())
    CNT_CURRENT_list.append(group['CNT_DRAWINGS_CURRENT'].sum())
    CNT_INSTALMENT_MATURE_CUM_list.append(group['CNT_INSTALMENT_MATURE_CUM'].mean())
    # Sorted list of the distinct contract statuses seen for this applicant.
    NAME_CONTRACT_STATUS_list.append(np.unique(group['NAME_CONTRACT_STATUS']).tolist())
    SK_DPD_list.append(group['SK_DPD'].mean())
    SK_DPD_DEF_list.append(group['SK_DPD_DEF'].mean())

In [None]:
# Assemble the per-applicant aggregate lists into a DataFrame, one column per list.
cc_balance_dict = dict(
    SK_ID_CURR=sk_prev_ids,
    MONTHS_BALANCE=MONTHS_BALANCE_list,
    AMT_TOTAL_RECEIVABLE=AMT_TOTAL_RECEIVABLE_list,
    CNT_DRAWINGS_CURRENT=CNT_CURRENT_list,
    CNT_INSTALMENT_MATURE_CUM=CNT_INSTALMENT_MATURE_CUM_list,
    NAME_CONTRACT_STATUS=NAME_CONTRACT_STATUS_list,
    SK_DPD=SK_DPD_list,
    SK_DPD_DEF=SK_DPD_DEF_list,
)
new_cc_balance = pd.DataFrame(cc_balance_dict)
new_cc_balance.head()

In [None]:
# Persist the aggregated credit card features to the working directory.
# NOTE(review): this runs before NAME_CONTRACT_STATUS is expanded in the next
# cell, so the CSV stores that column as the text of a Python list -- confirm
# that this is the intended snapshot.
new_cc_balance.to_csv('new_cc_balance.csv', index=False)

In [None]:
# Expand the list-valued NAME_CONTRACT_STATUS column into one indicator column
# per distinct status. regex=False makes the character removals literal: "["
# and "]" are regex metacharacters, and the default of `regex` differs between
# pandas versions.
contract_status_text = new_cc_balance['NAME_CONTRACT_STATUS'].astype('str')
for unwanted_char in ("'", "]", "["):
    contract_status_text = contract_status_text.str.replace(unwanted_char, "", regex=False)
new_cc_balance['NAME_CONTRACT_STATUS'] = contract_status_text

# One column per status; statuses an applicant never had become 0.
contract_status_indicators = new_cc_balance.NAME_CONTRACT_STATUS.apply(
    lambda x: pd.Series(x.split(', ')).value_counts()
).fillna(0)

new_cc_balance = pd.concat([new_cc_balance, contract_status_indicators], axis=1)
new_cc_balance = new_cc_balance.drop('NAME_CONTRACT_STATUS', axis=1)

In [None]:
# Preview the aggregated bureau features before merging.
new_bureau.head()

In [None]:
# Preview the aggregated credit card features before merging.
new_cc_balance.head()

In [None]:
# Preview the stacked train + test applications.
combine.head()

# Merging the data

In [None]:
# Rebuild the stacked train + test frame so this cell does not depend on any
# earlier state of `combine`. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported equivalent.
combine = pd.concat([train, test])
print(combine.shape)
# Left join keeps every application, including those with no credit card history.
combine = pd.merge(combine, new_cc_balance, on='SK_ID_CURR', how='left')
combine.shape

In [None]:
# Left-join the per-applicant bureau features; applications without bureau
# history keep their row and get missing values in the new columns.
combine = combine.merge(new_bureau, how='left', on='SK_ID_CURR')
combine.shape

# Filling the NA values

In [None]:
# Fill missing values in text (object) columns with the placeholder 'Unknown'.
objects = combine.select_dtypes('object').columns
print(len(objects))
combine[objects] = combine[objects].fillna('Unknown')
combine.select_dtypes('object').columns

In [None]:
# Impute missing float values with the column mean. TARGET must be left
# untouched: its missing values are what identify the test rows later on.
# It is excluded by name rather than by assuming it is the first float column.
print(len(combine.select_dtypes('float64').columns))
float64 = combine.select_dtypes('float64').columns.drop('TARGET', errors='ignore')
combine[float64] = combine[float64].fillna(combine[float64].mean())
combine.select_dtypes('float64').columns

In [None]:
# Apply the same mean fill to the int64 columns and show which columns they are.
int64 = combine.select_dtypes('int64').columns
print(len(int64))
int_column_means = combine[int64].mean()
combine[int64] = combine[int64].fillna(int_column_means)
combine.select_dtypes('int64').columns

In [None]:
# Total number of missing cells left in combine, together with its shape.
combine.isnull().sum().sum(), combine.shape

In [None]:
# One-hot encode the categorical columns with pandas' get_dummies defaults.
combine = pd.get_dummies(combine)
combine.shape

# Data Preparation for modelling

In [None]:
# Rows with a TARGET value are the labelled training rows; rows without one
# are the test rows to predict.
has_target = combine['TARGET'].notnull()
id_and_label = ['TARGET', 'SK_ID_CURR']

X = combine[has_target].drop(id_and_label, axis=1)
y = combine.loc[has_target, 'TARGET'].reset_index(drop=True)

X_test = combine[~has_target].drop(id_and_label, axis=1)

X.shape, y.shape, X_test.shape

# Random Forest Classifier

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score, roc_auc_score, log_loss
from sklearn.ensemble import RandomForestClassifier

# 5-fold stratified cross-validation of a default random forest on all features.
# Per fold: fit, score the validation split, record the feature importances,
# and predict the test set (fold predictions are collected in y_pred_tot_lgm).
RANDOM_STATE = 42  # fixed seed so scores and importances are reproducible

err_as = []
err_rs = []
err_ps = []
err_roc = []
err_ll = []

y_pred_tot_lgm = []
features = X.columns
fold_importances = []

fold = StratifiedKFold(n_splits=5)
for i, (train_index, test_index) in enumerate(fold.split(X, y), start=1):
    x_train, x_val = X.iloc[train_index], X.iloc[test_index]
    # The split yields positions, so index y positionally as well.
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    m = RandomForestClassifier(random_state=RANDOM_STATE)
    m.fit(x_train, y_train)
    pred_y = m.predict(x_val)
    prob_pred = m.predict_proba(x_val)[:,1]

    # Label importances with the same 1-based fold number used in the printed
    # scores (this was previously stored as i + 1).
    fold_importances.append(pd.DataFrame({"Feature": features,
                                          "importance": m.feature_importances_,
                                          "fold": i}))

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision with recall and transposes the confusion matrix.
    fold_accuracy = accuracy_score(y_val, pred_y)
    fold_recall = recall_score(y_val, pred_y)
    fold_precision = precision_score(y_val, pred_y)
    fold_roc_auc = roc_auc_score(y_val, prob_pred)
    fold_logloss = log_loss(y_val, prob_pred)

    print("Fold ",i, " Accuracy: ",(fold_accuracy))
    print("Fold ",i, " Recall: ",(fold_recall))
    print("Fold ",i, " Precision: ",(fold_precision))
    print("Fold ",i, " ROC AUC: ",(fold_roc_auc))
    print("Fold ",i, " Logloss: ",(fold_logloss))
    print(confusion_matrix(y_val, pred_y))

    err_as.append(fold_accuracy)
    err_rs.append(fold_recall)
    err_ps.append(fold_precision)
    err_roc.append(fold_roc_auc)
    err_ll.append(fold_logloss)

    pred_test = m.predict_proba(X_test)[:,1]
    y_pred_tot_lgm.append(pred_test)

# Long-format table with one row per (feature, fold).
feature_importance_df = pd.concat(fold_importances, axis=0)

In [None]:
# Average each metric over the five folds and print one line per metric.
for caption, fold_scores in (
    ('Mean Accuracy Score on CV-5: ', err_as),
    ('Mean Precision Score on CV-5: ', err_ps),
    ('Mean Recall Score on CV-5: ', err_rs),
    ('Mean ROC AUC Score on CV-5: ', err_roc),
    ('Mean Logloss Score on CV-5: ', err_ll),
):
    print(caption, np.mean(fold_scores, 0))

# Feature Engineering

In [None]:
# Average each feature's importance over the folds and rank the features from
# most to least important.
mean_importance = feature_importance_df[["Feature", "importance"]].groupby("Feature").mean()
all_feat = mean_importance.sort_values(by="importance", ascending=False)
all_feat = all_feat.reset_index()
important_feat = all_feat['Feature'].tolist()
all_feat.head(20)

Features that are highly correlated with another feature (absolute correlation above 0.98) are dropped.

In [None]:
# Flag features whose absolute correlation with a higher-ranked feature
# exceeds 0.98 (columns follow the importance ranking in important_feat).
df = X[important_feat]
corr_matrix = df.corr().abs()

# Keep only the strict upper triangle so each pair is looked at once and the
# diagonal (self-correlation of 1.0) is ignored. The builtin `bool` is used
# because the `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

high_cor = [column for column in upper.columns if any(upper[column] > 0.98)]
print(len(high_cor))
print(high_cor)

In [None]:
# Keep the importance-ranked features that were not flagged as highly correlated.
flagged = set(high_cor)
features = [feature_name for feature_name in important_feat if feature_name not in flagged]
print(len(features))
print(features)

In [None]:
# Restrict the training and test matrices to the selected features.
X1, X_test1 = X[features], X_test[features]

# Random Forest after feature engineering

In [None]:
# 5-fold stratified cross-validation of a default random forest on the
# selected features (X1). Test-set probabilities from each fold are collected
# in y_pred_tot_lgm.
RANDOM_STATE = 42  # fixed seed so the scores are reproducible

err_as = []
err_rs = []
err_ps = []
err_roc = []
err_ll = []

y_pred_tot_lgm = []

fold = StratifiedKFold(n_splits=5)
for i, (train_index, test_index) in enumerate(fold.split(X1, y), start=1):
    x_train, x_val = X1.iloc[train_index], X1.iloc[test_index]
    # The split yields positions, so index y positionally as well.
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    m = RandomForestClassifier(random_state=RANDOM_STATE)
    m.fit(x_train, y_train)
    pred_y = m.predict(x_val)
    prob_pred = m.predict_proba(x_val)[:,1]

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision with recall and transposes the confusion matrix.
    fold_accuracy = accuracy_score(y_val, pred_y)
    fold_recall = recall_score(y_val, pred_y)
    fold_precision = precision_score(y_val, pred_y)
    fold_roc_auc = roc_auc_score(y_val, prob_pred)
    fold_logloss = log_loss(y_val, prob_pred)

    print("Fold ",i, " Accuracy: ",(fold_accuracy))
    print("Fold ",i, " Recall: ",(fold_recall))
    print("Fold ",i, " Precision: ",(fold_precision))
    print("Fold ",i, " ROC AUC: ",(fold_roc_auc))
    print("Fold ",i, " Logloss: ",(fold_logloss))
    print(confusion_matrix(y_val, pred_y))

    err_as.append(fold_accuracy)
    err_rs.append(fold_recall)
    err_ps.append(fold_precision)
    err_roc.append(fold_roc_auc)
    err_ll.append(fold_logloss)

    pred_test = m.predict_proba(X_test1)[:,1]
    y_pred_tot_lgm.append(pred_test)

In [None]:
# Average each metric over the five folds and print one line per metric.
for caption, fold_scores in (
    ('Mean Accuracy Score on CV-5: ', err_as),
    ('Mean Precision Score on CV-5: ', err_ps),
    ('Mean Recall Score on CV-5: ', err_rs),
    ('Mean ROC AUC Score on CV-5: ', err_roc),
    ('Mean Logloss Score on CV-5: ', err_ll),
):
    print(caption, np.mean(fold_scores, 0))

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# 5-fold stratified cross-validation of a default logistic regression on the
# selected features (X1). Test-set probabilities from each fold are collected
# in y_pred_tot_logreg.
err_as_lreg = []
err_rs_lreg = []
err_ps_lreg = []
err_roc_lreg = []
err_ll_lreg = []

y_pred_tot_logreg = []

fold = StratifiedKFold(n_splits=5)
for i, (train_index, test_index) in enumerate(fold.split(X1, y), start=1):
    x_train, x_val = X1.iloc[train_index], X1.iloc[test_index]
    # The split yields positions, so index y positionally as well.
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    m = LogisticRegression()
    m.fit(x_train, y_train)
    pred_y = m.predict(x_val)
    prob_pred = m.predict_proba(x_val)[:,1]

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision with recall and transposes the confusion matrix.
    fold_accuracy = accuracy_score(y_val, pred_y)
    fold_recall = recall_score(y_val, pred_y)
    fold_precision = precision_score(y_val, pred_y)
    fold_roc_auc = roc_auc_score(y_val, prob_pred)
    fold_logloss = log_loss(y_val, prob_pred)

    print("Fold ",i, " Accuracy: ",(fold_accuracy))
    print("Fold ",i, " Recall: ",(fold_recall))
    print("Fold ",i, " Precision: ",(fold_precision))
    print("Fold ",i, " ROC AUC: ",(fold_roc_auc))
    print("Fold ",i, " Logloss: ",(fold_logloss))
    print(confusion_matrix(y_val, pred_y))

    err_as_lreg.append(fold_accuracy)
    err_rs_lreg.append(fold_recall)
    err_ps_lreg.append(fold_precision)
    err_roc_lreg.append(fold_roc_auc)
    err_ll_lreg.append(fold_logloss)

    pred_test = m.predict_proba(X_test1)[:,1]
    y_pred_tot_logreg.append(pred_test)

In [None]:
# Average each logistic-regression metric over the five folds and print it.
for caption, fold_scores in (
    ('Mean Accuracy Score on CV-5: ', err_as_lreg),
    ('Mean Precision Score on CV-5: ', err_ps_lreg),
    ('Mean Recall Score on CV-5: ', err_rs_lreg),
    ('Mean ROC AUC Score on CV-5: ', err_roc_lreg),
    ('Mean Logloss Score on CV-5: ', err_ll_lreg),
):
    print(caption, np.mean(fold_scores, 0))

# Submission file of test set for competition

In [None]:
# Build the submission table: one row per test application, with TARGET set to
# the element-wise mean of the per-fold test probabilities in y_pred_tot_lgm.
submission = pd.DataFrame({
    'SK_ID_CURR': test['SK_ID_CURR'],
    'TARGET': np.mean(y_pred_tot_lgm, 0),
})
submission.head()

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('submission.csv', index=False)