In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the main application tables (train and test) from the shared data folder.
full_train_orig, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../Data/application_train.csv", "../Data/application_test.csv")
)

In [3]:
# Sanity check: (rows, columns) of the training table as loaded from CSV.
full_train_orig.shape

(307511, 122)

In [4]:
### Load bureau data and collapse it to one row per applicant (SK_ID_CURR)
bureau = pd.read_csv("../Data/bureau.csv")

# Every entry is a list of statistics, so the aggregated frame gets two-level
# (column, statistic) headers that are flattened to "<column>_<statistic>".
bureau_num_aggs = {
    'DAYS_CREDIT': ['max', 'min'],
    'CREDIT_DAY_OVERDUE': ['max', 'min'],
    'CNT_CREDIT_PROLONG': ['sum'],
    'AMT_CREDIT_SUM': ['mean'],
    'DAYS_CREDIT_UPDATE': ['max', 'min'],
    'AMT_CREDIT_SUM_OVERDUE': ['mean'],
}
bureau_gb = bureau.groupby("SK_ID_CURR").agg(bureau_num_aggs)
bureau_gb.columns = ['_'.join(name_parts) for name_parts in bureau_gb.columns]
bureau_gb = bureau_gb.reset_index()

### One-hot encode the categorical columns, sum the indicators per applicant
### (i.e. count occurrences of each category), and attach them to the numeric
### aggregates.
bureau_cats = pd.get_dummies(bureau.select_dtypes('object'))
bureau_cats['SK_ID_CURR'] = bureau['SK_ID_CURR']
bureau_cats_grouped = bureau_cats.groupby('SK_ID_CURR').sum().reset_index()
bureau_gb = bureau_gb.merge(bureau_cats_grouped, on='SK_ID_CURR', how='left')

In [5]:
### Load previous application data and aggregate it per applicant (SK_ID_CURR)
prv = pd.read_csv("../Data/previous_application.csv")

# AMT_* columns and SELLERPLACE_AREA are averaged, DAYS_* columns get their
# min/max, and SK_ID_PREV is counted.
prv_aggs = {
    "AMT_ANNUITY": ['mean'],
    "AMT_APPLICATION": ['mean'],
    "AMT_CREDIT": ['mean'],
    "AMT_DOWN_PAYMENT": ['mean'],
    "AMT_GOODS_PRICE": ['mean'],
    "SELLERPLACE_AREA": ['mean'],
    "DAYS_DECISION": ['min', 'max'],
    "DAYS_TERMINATION": ['min', 'max'],
    "DAYS_LAST_DUE": ['min', 'max'],
    "DAYS_FIRST_DUE": ['min', 'max'],
    "DAYS_LAST_DUE_1ST_VERSION": ['min', 'max'],
    "SK_ID_PREV": ['count'],
}
prv_gb = prv.groupby("SK_ID_CURR").agg(prv_aggs)
# Flatten (column, statistic) headers to "<column>_prev_<statistic>".
prv_gb.columns = ['_prev_'.join(name_parts) for name_parts in prv_gb.columns]
prv_gb = prv_gb.reset_index()

# Disabled experiment: one-hot counts of the categorical columns.
# prv_cats = pd.get_dummies(prv.select_dtypes('object').drop(["NAME_TYPE_SUITE", "WEEKDAY_APPR_PROCESS_START", "NAME_CONTRACT_TYPE"], axis=1))
# prv_cats['SK_ID_CURR'] = prv['SK_ID_CURR']
# prv_cats_grouped = prv_cats.groupby('SK_ID_CURR').agg('sum').reset_index()
# prv_gb = pd.merge(prv_gb, prv_cats_grouped, on = 'SK_ID_CURR', how = 'left')

In [6]:
# Preview the first rows of the per-applicant previous-application aggregates.
prv_gb.head()

Unnamed: 0,SK_ID_CURR,AMT_ANNUITY_prev_mean,AMT_APPLICATION_prev_mean,AMT_CREDIT_prev_mean,AMT_DOWN_PAYMENT_prev_mean,AMT_GOODS_PRICE_prev_mean,SELLERPLACE_AREA_prev_mean,DAYS_DECISION_prev_min,DAYS_DECISION_prev_max,DAYS_TERMINATION_prev_min,DAYS_TERMINATION_prev_max,DAYS_LAST_DUE_prev_min,DAYS_LAST_DUE_prev_max,DAYS_FIRST_DUE_prev_min,DAYS_FIRST_DUE_prev_max,DAYS_LAST_DUE_1ST_VERSION_prev_min,DAYS_LAST_DUE_1ST_VERSION_prev_max,SK_ID_PREV_prev_count
0,100001,3951.0,24835.5,23787.0,2520.0,24835.5,23.0,-1740,-1740,-1612.0,-1612.0,-1619.0,-1619.0,-1709.0,-1709.0,-1499.0,-1499.0,1
1,100002,9251.775,179055.0,179055.0,0.0,179055.0,500.0,-606,-606,-17.0,-17.0,-25.0,-25.0,-565.0,-565.0,125.0,125.0,1
2,100003,56553.99,435436.5,484191.0,3442.5,435436.5,533.0,-2341,-746,-1976.0,-527.0,-1980.0,-536.0,-2310.0,-716.0,-1980.0,-386.0,3
3,100004,5357.25,24282.0,20106.0,4860.0,24282.0,30.0,-815,-815,-714.0,-714.0,-724.0,-724.0,-784.0,-784.0,-694.0,-694.0,1
4,100005,4813.2,22308.75,20076.75,4464.0,44617.5,18.0,-757,-315,-460.0,-460.0,-466.0,-466.0,-706.0,-706.0,-376.0,-376.0,2


In [7]:
### Installment data: per applicant, total amount due minus total amount paid
inst = pd.read_csv("../Data/installments_payments.csv")
inst_totals = inst.groupby("SK_ID_CURR")[["AMT_INSTALMENT", "AMT_PAYMENT"]].sum()
# Only the difference is kept as a feature; the two totals are not carried forward.
inst_gb = (
    (inst_totals["AMT_INSTALMENT"] - inst_totals["AMT_PAYMENT"])
    .rename("diff")
    .reset_index()
)

In [8]:
### Credit card balance data: per-applicant sums over rows with MONTHS_BALANCE == -1
### (presumably the most recent monthly snapshot - TODO confirm against the data dictionary)
ccb = pd.read_csv("../Data/credit_card_balance.csv")
ccb_amount_cols = ["AMT_BALANCE", "AMT_CREDIT_LIMIT_ACTUAL", "AMT_DRAWINGS_CURRENT",
                   "AMT_RECIVABLE", "AMT_TOTAL_RECEIVABLE"]
ccb_selected_month = ccb[ccb.MONTHS_BALANCE == -1]
ccb_gb = (
    ccb_selected_month
    .groupby("SK_ID_CURR")[ccb_amount_cols]
    .sum()
    .add_suffix("_ccb")
    .reset_index()
)

## Combine bureau data

In [9]:
# Attach the bureau aggregates.
# NOTE(review): the inner join keeps only training applicants that have bureau
# records, while test keeps every applicant (left join) - confirm this
# asymmetry is intended. Re-running this cell merges the columns a second time.
full_train_orig = full_train_orig.merge(bureau_gb, on="SK_ID_CURR", how="inner")
test = test.merge(bureau_gb, on="SK_ID_CURR", how="left")

In [10]:
# (rows, columns) of the training table after the bureau merge.
full_train_orig.shape

(263491, 154)

## Combine previous application data

In [11]:
# Attach the previous-application aggregates.
# NOTE(review): inner join on train drops applicants without previous
# applications; test uses a left join and keeps everyone - confirm intended.
full_train_orig = full_train_orig.merge(prv_gb, on="SK_ID_CURR", how="inner")
test = test.merge(prv_gb, on="SK_ID_CURR", how="left")

## Combine installment data

In [12]:
# Attach the installment feature ("diff").
# NOTE(review): inner join on train drops applicants without installment
# history; test uses a left join and keeps everyone - confirm intended.
full_train_orig = full_train_orig.merge(inst_gb, on="SK_ID_CURR", how="inner")
test = test.merge(inst_gb, on="SK_ID_CURR", how="left")

In [13]:
# (rows, columns) of the training table after the bureau, previous-application
# and installment merges.
full_train_orig.shape

(248721, 172)

## Combine credit card balance data

In [14]:
# full_train_orig = pd.merge(full_train_orig, ccb_gb, how="inner", on="SK_ID_CURR")
# test = pd.merge(test, ccb_gb, how="left", on="SK_ID_CURR")

In [15]:
#full_train_orig.shape

In [16]:
# Build the three lists of columns to remove before modelling:
#   * null_columns       - columns with more than NULL_COUNT_THRESHOLD missing values
#   * correlated_columns - hand-picked columns (presumably highly correlated
#                          with columns that are kept, per the name - based on EDA)
#   * useless_columns    - FLAG_DOCUMENT_* indicators judged not useful (based on EDA)
# The original note quoted "62 of 122 columns" filtered by the null rule and a
# final feature count of 43; those figures have not been re-checked against
# the merged frame used here.
NULL_COUNT_THRESHOLD = 100000

null_counts = full_train_orig.isnull().sum()
null_columns = null_counts[null_counts > NULL_COUNT_THRESHOLD].index.tolist()
correlated_columns = ['AMT_ANNUITY', 'AMT_GOODS_PRICE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT_W_CITY',
                      'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE']
useless_columns = ["FLAG_DOCUMENT_2", "FLAG_DOCUMENT_4", "FLAG_DOCUMENT_5", "FLAG_DOCUMENT_7",
                   'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12',
                   'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15',
                   'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18',
                   'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']

In [17]:
### Drop the mostly-null, uninformative and correlated columns (lists built from EDA)
columns_to_drop = null_columns + useless_columns + correlated_columns
full_train = full_train_orig.drop(columns_to_drop, axis=1)

In [18]:
#cat_columns_all = set(full_train.columns) - set(full_train._get_numeric_data().columns.tolist())

In [19]:
# cat_feats = ["CODE_GENDER", "NAME_CONTRACT_TYPE", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", 
#              "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "OCCUPATION_TYPE",
#              "NAME_HOUSING_TYPE", "FLAG_DOCUMENT_3"]
# num_feats = ["AMT_INCOME_TOTAL", "CNT_CHILDREN", "AMT_CREDIT", "AMT_ANNUITY", "AMT_GOODS_PRICE", "DAYS_BIRTH",
#              "DAYS_EMPLOYED", "DAYS_REGISTRATION", "DAYS_LAST_PHONE_CHANGE", "AMT_REQ_CREDIT_BUREAU_YEAR", 
#             "REGION_POPULATION_RELATIVE", ]

In [20]:
# Separate the label from the features and index the features by applicant id.
# NOTE: this cell consumes the TARGET column, so it cannot be re-run without
# first rebuilding `full_train`.
full_train_y = full_train.TARGET.values
full_train = full_train.drop(["TARGET"], axis=1)
full_train = full_train.set_index("SK_ID_CURR")

# Numeric vs. categorical feature names. Uses the public `select_dtypes` API
# instead of the private `_get_numeric_data()` (bool is included so the same
# columns are selected).
num_feats = full_train.select_dtypes(include=["number", "bool"]).columns.tolist()
# Keep the frame's own column order: building this list from a set difference
# made the one-hot column order change from one kernel session to the next.
numeric_names = set(num_feats)
cat_feats = [name for name in full_train.columns if name not in numeric_names]

In [21]:
## Categorical features: one-hot encode train and test separately.
## Categories seen on only one side produce columns the other side lacks;
## the modelling cells drop those columns before fitting/predicting.
train_cat = pd.get_dummies(full_train[cat_feats])
test_cat = pd.get_dummies(test[cat_feats])

## Numerical features - train (missing values are filled later, after the
## numeric and categorical parts are concatenated).
train_num = full_train[num_feats]

## Numerical features - test.
test_num = test[num_feats]
# Impute with the TRAINING means so test rows go through the same
# transformation the model was fitted on (previously test-set means were used,
# which makes the imputed value depend on the composition of the test set).
test_num = test_num.fillna(train_num.mean())

# ## Standardize numerical features
# std = StandardScaler().fit(train_num)
# train_num = std.transform(train_num)
# test_num = std.transform(test_num)


In [22]:
# Place the numeric and one-hot blocks side by side for train and for test.
full_train_feats = pd.concat((train_num, train_cat), axis="columns")
test_feats = pd.concat((test_num, test_cat), axis="columns")

In [23]:
def _fill_with_column_mean(column):
    """Return `column` with its missing entries replaced by the column's own mean."""
    return column.fillna(column.mean())


# Column-wise mean imputation; each frame uses its own column means.
full_train_feats = full_train_feats.apply(_fill_with_column_mean, axis=0)
test_feats = test_feats.apply(_fill_with_column_mean, axis=0)

In [24]:
# Shape of the pre-encoding feature frame (`full_train`), not of the encoded
# `full_train_feats` that is passed to the models.
full_train.shape

(248721, 98)

In [25]:
# Stratified 80/20 train/validation split with a fixed seed.
train_X, valid_X, train_y, valid_y = train_test_split(
    full_train_feats,
    full_train_y,
    train_size=0.8,
    stratify=full_train_y,
    random_state=42,
)

# Random Forest Classifier

In [None]:
# Disabled: random-forest grid search (3-fold CV, ROC AUC). The validation
# cells that follow use `rf`, so they only run if this cell is re-enabled.
# ### RF classifier
# params_rf={
#     'max_depth': [20, 40, 60], #[3,4,5,6,7,8,9], # 5 is good but takes too long in kaggle env
#     'n_estimators': [100, 300, 500], #[1000,2000,3000]
# }

# rf_clf = RandomForestClassifier()
# rf = GridSearchCV(rf_clf,
#                   params_rf,
#                   cv=3,
#                   scoring="roc_auc",
#                   n_jobs=1,
#                   verbose=2)
# rf.fit(train_X.drop(list(set(train_X.columns.tolist()) - set(test_feats.columns.tolist())), axis=1), train_y)
# best_est_rf = rf.best_estimator_
# print(best_est_rf)

In [None]:
# Score the validation set with the random-forest search object.
# NOTE: `rf` comes from the grid-search cell above, which is currently
# commented out - this cell raises NameError until that cell is re-enabled.
train_only_cols = list(set(train_X.columns.tolist()) - set(test_feats.columns.tolist()))
valid_X_shared = valid_X.drop(train_only_cols, axis=1)  # features present in test too
valid_probs_rf = rf.predict_proba(valid_X_shared)[:, 1]
valid_preds_rf = rf.predict(valid_X_shared)

In [None]:
# Validation accuracy, then validation ROC AUC, for the random forest.
valid_acc_rf = accuracy_score(valid_y, valid_preds_rf)
print(valid_acc_rf)
valid_auc_rf = roc_auc_score(valid_y, valid_probs_rf)
print(valid_auc_rf)

In [None]:
# One-hot columns that exist in train but have no counterpart in test.
list(set(train_X.columns.tolist()).difference(test_feats.columns.tolist()))

# XGBoost with Grid Search

In [None]:
# params={
#     'max_depth': [3, 4, 5], #[3,4,5,6,7,8,9], # 5 is good but takes too long in kaggle env
#     'subsample': [0.6, 0.8], #[0.4,0.5,0.6,0.7,0.8,0.9,1.0],
#     'colsample_bytree': [0.5, 0.7], #[0.5,0.6,0.7,0.8],
#     'n_estimators': [300, 500, 700], #[1000,2000,3000]
#     'reg_alpha': [0.01, 0.05],  #[0.01, 0.02, 0.03, 0.04]
#     'scale_pos_weight':[1,3, 5]
# }

# xgb_clf = xgb.XGBClassifier(missing=9999999999)
# rs = GridSearchCV(xgb_clf,
#                   params,
#                   cv=3,
#                   scoring="roc_auc",
#                   n_jobs=1,
#                   verbose=2)
# rs.fit(train_X.drop(list(set(train_X.columns.tolist()) - set(test_feats.columns.tolist())), axis=1), train_y)
# best_est = rs.best_estimator_
# print(best_est)

In [None]:
# Score the validation set with the XGBoost grid-search object `rs`.
# NOTE: the cell that builds this `rs` is commented out above, so `rs` must be
# created before this cell can run.
train_only_cols = list(set(train_X.columns.tolist()) - set(test_feats.columns.tolist()))
valid_X_shared = valid_X.drop(train_only_cols, axis=1)  # features present in test too

valid_probs_rs = rs.predict_proba(valid_X_shared)[:, 1]
# `predict` already returns a 1-D array of class labels; the former `[:, 1]`
# raised "IndexError: too many indices for array".
valid_preds_rs = rs.predict(valid_X_shared)
print(accuracy_score(valid_y, valid_preds_rs))
print(roc_auc_score(valid_y, valid_probs_rs))

# Single XGBoost model with best parameters

In [26]:
# Single XGBoost classifier with a fixed set of hyper-parameters.
xgb_single_params = dict(
    base_score=0.5,
    colsample_bylevel=1,
    colsample_bytree=0.5,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=9999999999,
    n_estimators=500,
    nthread=-1,
    objective='binary:logistic',
    reg_alpha=0.05,
    reg_lambda=1,
    scale_pos_weight=3,
    seed=0,
    silent=True,
    subsample=0.8,
)
xgb_single = xgb.XGBClassifier(**xgb_single_params)

# Train and validate only on the columns that also exist in the test features.
train_only_cols = list(set(train_X.columns.tolist()) - set(test_feats.columns.tolist()))
xgb_single.fit(train_X.drop(train_only_cols, axis=1), train_y)

valid_X_shared = valid_X.drop(train_only_cols, axis=1)
valid_probs_xgb_single = xgb_single.predict_proba(valid_X_shared)[:, 1]
valid_preds_xgb_single = xgb_single.predict(valid_X_shared)
print(accuracy_score(valid_y, valid_preds_xgb_single))
print(roc_auc_score(valid_y, valid_probs_xgb_single))

0.907829932657
0.770649529077


In [27]:
### Train AUC (compare with the validation AUC to gauge over-fitting)
train_only_cols = list(set(train_X.columns.tolist()) - set(test_feats.columns.tolist()))
train_X_shared = train_X.drop(train_only_cols, axis=1)
train_probs_xgb_single = xgb_single.predict_proba(train_X_shared)[:, 1]
print(roc_auc_score(train_y, train_probs_xgb_single))

0.804013411539


# LightGBM model 

In [None]:
# LightGBM hyper-parameter grid, searched with 3-fold CV on ROC AUC.
# NOTE(review): with max_depth=3 a tree has at most 2**3 = 8 leaves, so the two
# `num_leaves` values should give the same trees for that depth - consider
# smaller values if the grid is revisited.
params={
    'max_depth': [3, 5], #[3,4,5,6,7,8,9], # 5 is good but takes too long in kaggle env
    'subsample': [0.4, 0.6], #[0.4,0.5,0.6,0.7,0.8,0.9,1.0],
    'colsample_bytree': [0.5, 0.7], #[0.5,0.6,0.7,0.8],
    'n_estimators': [500, 700], #[1000,2000,3000]
    'reg_alpha': [0.01, 0.05], #[0.01, 0.02, 0.03, 0.04]
    'scale_pos_weight':[1,3],
    'num_leaves':[30, 50]
}

# `subsample` (bagging fraction) is ignored by LightGBM unless the bagging
# frequency is non-zero, so without `subsample_freq` the two `subsample`
# values produced identical fits. `random_state` makes the row sampling and
# therefore the search reproducible.
lgb_clf = lgb.LGBMClassifier(subsample_freq=1, random_state=42)
rs = GridSearchCV(lgb_clf,
                  params,
                  cv=3,
                  scoring="roc_auc",
                  n_jobs=-1,
                  verbose=2)

# Fit only on the columns that also exist in the test features.
train_only_cols = list(set(train_X.columns.tolist()) - set(test_feats.columns.tolist()))
rs.fit(train_X.drop(train_only_cols, axis=1), train_y)
best_est = rs.best_estimator_
print(best_est)

In [None]:
# Validation accuracy and ROC AUC for the best LightGBM model found by `rs`.
train_only_cols = list(set(train_X.columns.tolist()) - set(test_feats.columns.tolist()))
valid_X_shared = valid_X.drop(train_only_cols, axis=1)  # features present in test too
valid_probs_rs = rs.predict_proba(valid_X_shared)[:, 1]
valid_preds_rs = rs.predict(valid_X_shared)
print(accuracy_score(valid_y, valid_preds_rs))
print(roc_auc_score(valid_y, valid_probs_rs))

In [None]:
# Display the estimator (with its hyper-parameters) selected by the grid search.
rs.best_estimator_

In [None]:
# LightGBM classifier with hard-coded hyper-parameters.
# NOTE(review): subsample_freq=0 means `subsample=0.4` is not applied by
# LightGBM (bagging is disabled when the frequency is zero) - confirm intent.
best_model_params = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=0.5,
    learning_rate=0.1,
    max_depth=3,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=700,
    n_jobs=-1,
    num_leaves=30,
    objective=None,
    random_state=None,
    reg_alpha=0.05,
    reg_lambda=0.0,
    scale_pos_weight=1,
    silent=True,
    subsample=0.4,
    subsample_for_bin=200000,
    subsample_freq=0,
)
best_model = lgb.LGBMClassifier(**best_model_params)

In [None]:
# Refit on the full training set (train + validation rows), using only the
# columns that also exist in the test features.
train_only_cols = list(set(train_X.columns.tolist()) - set(test_feats.columns.tolist()))
full_train_shared = full_train_feats.drop(train_only_cols, axis=1)
best_model.fit(full_train_shared, full_train_y)

# Prepare Submission file 

In [None]:
### Prepare submission file and save to disk
# Select the test features explicitly in the column order `xgb_single` was
# fitted on (train columns that also exist in test), instead of relying on the
# order left over after dropping test-only columns.
test_column_names = set(test_feats.columns)
model_feature_cols = [name for name in train_X.columns if name in test_column_names]
test_probs_xgb_single = xgb_single.predict_proba(test_feats[model_feature_cols])[:, 1]

# NOTE: predictions come from `xgb_single`, not from the LightGBM `best_model`.
result_df = pd.DataFrame({'SK_ID_CURR': test.SK_ID_CURR.values, "TARGET": test_probs_xgb_single})
result_df.to_csv("submimssion_xgbsingle_fulldata_bureau_prev_inst_update2.csv", index=False)

In [None]:
# Sanity check: the submission should have one row per test applicant and two columns.
result_df.shape

In [None]:
# Feature names ordered by ascending `xgb_single` importance (most important last).
train_only_cols = np.array(list(set(train_X.columns.tolist()) - set(test_feats.columns.tolist())))
model_feature_names = train_X.drop(train_only_cols, axis=1).columns.values
importance_order = np.argsort(xgb_single.feature_importances_)
model_feature_names[importance_order]

In [None]:
# Columns shared by the training and test feature frames (unordered).
xgb_cols = list(set(train_X.columns.tolist()) & set(test_feats.columns.tolist()))

In [None]:
# Bar chart of split counts (f-score) per feature, highest first.
# `XGBClassifier.booster()` was replaced by `get_booster()` in newer xgboost
# releases (where `booster` is a plain parameter, not a method); fall back to
# the old accessor only when the new one is missing.
if hasattr(xgb_single, "get_booster"):
    xgb_booster = xgb_single.get_booster()
else:
    xgb_booster = xgb_single.booster()
fscore_frame = pd.DataFrame.from_dict(xgb_booster.get_fscore(), orient='index')
fscore_frame.sort_values(by=0, ascending=False).plot(kind="bar", figsize=(20,10))

In [None]:
# Bar chart of split counts (f-score) per feature in the booster's own order;
# after `reset_index()` the x axis shows row positions rather than feature names.
# `XGBClassifier.booster()` was replaced by `get_booster()` in newer xgboost
# releases; fall back to the old accessor only when the new one is missing.
if hasattr(xgb_single, "get_booster"):
    xgb_booster = xgb_single.get_booster()
else:
    xgb_booster = xgb_single.booster()
fscore_frame = pd.DataFrame.from_dict(xgb_booster.get_fscore(), orient='index')
fscore_frame.reset_index().plot(kind="bar", figsize=(12,10))

# EDA

### Check correlation among numerical features and with the target

In [None]:
# Annotated correlation heatmap over the non-object (numeric) columns of `full_train`.
fig = plt.figure(figsize=(20,20))
non_object_cols = full_train.columns[full_train.dtypes != "object"]
numeric_corr = full_train[non_object_cols].corr()
sns.heatmap(numeric_corr, annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
# List the columns that remain in `full_train`.
full_train.columns

In [None]:
# Count missing values in each of the three EXT_SOURCE_* columns.
full_train_orig[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].isnull().sum()

In [None]:
# Display the EXT_SOURCE_1 column (pandas truncates the printed Series).
full_train_orig.EXT_SOURCE_1

In [None]:
# Correlation heatmap of all model features in the training split.
# `figsize` must be passed as a keyword; the former `plt.figure(figsize(20,20))`
# called an undefined name and raised NameError.
fig = plt.figure(figsize=(20,20))
sns.heatmap(train_X.corr())