In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
%matplotlib inline

warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

# Introduction

This project is from Kaggle's [Home Credit Defualt Risk Competition](https://www.kaggle.com/c/home-credit-default-risk). There are a few datasets that represent the loan borrowers attribute but this project mainly focused on datasets such as **application** and **previous_application**. The **application** dataset contains information about the current ongoing loan that we seek to predict whether the borrowers will repay. On the other hand, **previous_application** contains records of loan borrowers in the past and they reveal information regarding the borrower's previous loans.

The purpose of this competition is to use loan borrower's features to predict their likeliness to repay the loan. Some features that characterize the loan borrowers in the **application** dataset are:

* FLAG_OWN_CAR: flag if the client owns a car
* FLAG_OWN_REALITY: flag if the client owns a house
* AMT_INCOME_TOTAL: client's income
* AMT_CREDIT: Credit amount of the loan
* AMT_ANNUITY: Loan annuity
* EXT_SOURCE_1: Normalized credit score from external data source 1
* EXT_SOURCE_2: Normalized credit score from external data source 2
* EXT_SOURCE_3: Normalized credit score from external data source 3

Some features that characterize the loan borrowers in the **previous_application** dataset are:

* NAME_CONTRACT_STATUS: Contract status (approved, refused) of previous loan application
* AMT_CREDIT: See above
* AMT_ANNUITY: See above

In addition to the features mentioned above, there are a plenty more included in the dataset. For further explanations and variable exploration, please refer to the Kaggle's documentation.

The structure of this analysis is organized as below
1. Loading dataset and simple data preprocessing
2. Baseline model creation using LightGBM - using only **applications** dataset
3. Feature engineering in **applications** dataset and creating a second model
4. Combining the **applications** and **prev_applications** dataset to create a third model
5. Summary and ideas going forward

# 1. Load Dataset

* Load dataset from the data directory

In [2]:
#load dataset
df_train = pd.read_csv('data/application_train.csv')
df_test = pd.read_csv('data/application_test.csv')
#print shape
print(df_train.shape)
print(df_test.shape)

(307511, 122)
(48744, 121)


# 2. Dataset Preprocessing

* There are 121 columns that describe an observation of a row in the dataset
* Out of a total of 307,511 observations in the training dataset, about 92% are **false**
* Set null values to -999
* Factorize categorical values after combining the training and the test dataset
* Re-split the training and the test dataset

In [113]:
df_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.804,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.079,0.0554,0.0,0.0,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
#distribution of target values
df_train['TARGET'].value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64

In [115]:
#set missing values to -999
df_train.fillna(-999, inplace=True)
#combine dataframe
df_combined = pd.concat([df_train, df_test])
#print shape
print(df_combined.shape)

(356255, 122)


In [116]:
#factorize object columns
obj_columns = [col for col in df_combined.columns if df_combined[col].dtypes == 'object']

for col in obj_columns:
    df_combined[col] = pd.factorize(df_combined[col])[0]

In [117]:
#separate training and test dataframes
df_train = df_combined[~df_combined['TARGET'].isnull()]
df_test = df_combined[df_combined['TARGET'].isnull()]
#print shape
print(df_train.shape)
print(df_test.shape)

(307511, 122)
(48744, 122)


# 3. Baseline Testing Using LightGBM

* Using default hyperparameters
* Only concerned with **applications** dataset
* We get validation AUC score of 0.757, which can be improved feature engineering

In [118]:
#divide training dataset
df_fit_x = df_train[[col for col in df_train.columns if col != 'TARGET']]
df_fit_y = df_train['TARGET']

In [119]:
#create training/validation dataset
x_train, x_valid, y_train, y_valid = train_test_split(df_fit_x, df_fit_y, test_size = 0.3, random_state = 1000)

In [120]:
#baseline classifier
clf = LGBMClassifier(n_estimators=1000)
#fit model
clf.fit(x_train, y_train, eval_set = [(x_train, y_train), (x_valid, y_valid)], 
        eval_metric = 'auc', verbose = 100, early_stopping_rounds = 50)

[100]	training's auc: 0.803845	training's binary_logloss: 0.232693	valid_1's auc: 0.757026	valid_1's binary_logloss: 0.247861


LGBMClassifier(n_estimators=1000)

In [121]:
#observe feature importance
df_importance = pd.DataFrame(sorted(zip(x_train.columns, clf.feature_importances_)), columns = ['feature', 'importance']).sort_values(by='importance', ascending=False)
df_importance.head(10)

Unnamed: 0,feature,importance
38,EXT_SOURCE_3,202
37,EXT_SOURCE_2,194
22,DAYS_BIRTH,191
36,EXT_SOURCE_1,168
1,AMT_CREDIT,157
0,AMT_ANNUITY,134
2,AMT_GOODS_PRICE,123
24,DAYS_ID_PUBLISH,113
23,DAYS_EMPLOYED,113
26,DAYS_REGISTRATION,78


# 4. Feature Engineering

* Create new features using business logic. We found above that **EXT_SOURCE** variables have high feature importance. Therefore, we create additional features that characterize the mean and standard deviation of each customers scores from 3 different data sources.
* We are curious to see how different ratios between credit to income, annuity to income, and annuity to credit can be created to be applied to the model.
* **DAYS_EMPLOYED_OF_BIRTH** can be a good metric to measure an applicant's competitiveness. If an applicant is young, but has been in the workforce for a long time, we can assume this person to be someone who is proactive and economically aware.
* **INCOME_BY_FAMILY_SIZE** can be a good metric to measure the spending capability of an individual. If an applicant has high income, but also has many family members to support, we can reasonably assume that this person will not have enough balance to pay back.

In [122]:
#create features related to external credit scores
df_combined['APPS_EXT_SOURCE_STD'] = df_combined[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
df_combined['APPS_EXT_SOURCE_MEAN'] = df_combined[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)

In [123]:
#create income, annuity, and credit ratios
df_combined['CREDIT_TO_INCOME'] = df_combined['AMT_CREDIT'] / df_combined['AMT_INCOME_TOTAL']
df_combined['ANNUITY_TO_INCOME'] = df_combined['AMT_ANNUITY'] / df_combined['AMT_INCOME_TOTAL']
df_combined['ANNUITY_TO_CREDIT'] = df_combined['AMT_ANNUITY'] / df_combined['AMT_CREDIT']

In [124]:
#create features that define an individual's economic capabilites
df_combined['DAYS_EMPLOYED_OF_BIRTH'] = df_combined['DAYS_EMPLOYED'] / df_combined['DAYS_BIRTH']
df_combined['INCOME_BY_FAMILY_SIZE'] = df_combined['AMT_INCOME_TOTAL'] / df_combined['CNT_FAM_MEMBERS']

# 5. 2nd Model Testing

* Train the model again using the new features extracted
* We observe that the new feature engineered column **ANNUITY_TO_CREDIT** has the highest feature importance in the new model. 
* We observe that the new feature engineered column **DAYS_EMPLOYED_OF_BIRTH** has 10th highest feature importance in the new model. 
* The validation AUC score has increased to 0.764, compared to 0.757 in the previous model.

In [125]:
#separate training and test dataframes
df_train = df_combined[~df_combined['TARGET'].isnull()]
df_test = df_combined[df_combined['TARGET'].isnull()]
#print shape
print(df_train.shape)
print(df_test.shape)

(307511, 129)
(48744, 129)


In [126]:
#divide training dataset
df_fit_x = df_train[[col for col in df_train.columns if col != 'TARGET']]
df_fit_y = df_train['TARGET']

In [127]:
#create training/validation dataset
x_train, x_valid, y_train, y_valid = train_test_split(df_fit_x, df_fit_y, test_size = 0.3, random_state = 1000)

In [128]:
#baseline classifier
clf = LGBMClassifier(n_estimators=1000)
#fit model
clf.fit(x_train, y_train, eval_set = [(x_train, y_train), (x_valid, y_valid)], 
        eval_metric = 'auc', verbose = 100, early_stopping_rounds = 50)

[100]	training's auc: 0.811571	training's binary_logloss: 0.229979	valid_1's auc: 0.764275	valid_1's binary_logloss: 0.245715


LGBMClassifier(n_estimators=1000)

In [129]:
#observe feature importance
df_importance = pd.DataFrame(sorted(zip(x_train.columns, clf.feature_importances_)), columns = ['feature', 'importance']).sort_values(by='importance', ascending=False)
df_importance.head(10)

Unnamed: 0,feature,importance
10,ANNUITY_TO_CREDIT,325
27,DAYS_BIRTH,164
43,EXT_SOURCE_2,142
15,APPS_EXT_SOURCE_MEAN,142
16,APPS_EXT_SOURCE_STD,131
30,DAYS_ID_PUBLISH,125
44,EXT_SOURCE_3,118
0,AMT_ANNUITY,114
42,EXT_SOURCE_1,103
29,DAYS_EMPLOYED_OF_BIRTH,93


# 6. Feature Engineering - Previous Loans

* Extract features from previous loans for further model development
* Extracted the interest rates of previous loans by calculating extra fee that was incurred
* Calculated the count and ratio of the times each customer was rejected in previous loan application processes.

In [130]:
#load previous dataset
df_prev = pd.read_csv('data/previous_application.csv')
df_prev.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,Y,1,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,Y,1,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1,XNA,24.0,high,Cash Street: high,,,,,,


In [131]:
#calculate previous loan interest rate
df_prev['TOTAL_PAID'] = df_prev['AMT_ANNUITY'] * df_prev['CNT_PAYMENT']
df_prev['PREV_INTERESTS'] = (df_prev['TOTAL_PAID'] - df_prev['AMT_CREDIT']) / df_prev['AMT_CREDIT']

In [132]:
#get previous loan summary
agg_dict = {
    'SK_ID_CURR': ['count'],
    'AMT_CREDIT': ['mean'],
    'AMT_ANNUITY': ['mean'],
    'AMT_APPLICATION': ['mean'],
    'AMT_DOWN_PAYMENT': ['mean'],
    'AMT_GOODS_PRICE': ['mean'],
    'PREV_INTERESTS': ['mean', 'min', 'max']
}
#group previous loans with respect to the current loan id
df_prev_grouped = df_prev.groupby(['SK_ID_CURR']).agg(agg_dict)
df_prev_grouped.columns = ["PREV_"+"_".join(x).upper() for x in df_prev_grouped.columns.ravel()]

In [133]:
#calculate previously refused cnt and ratio
df_prev_refused = df_prev[df_prev['NAME_CONTRACT_STATUS'] == 'Refused'].groupby(['SK_ID_CURR'])['SK_ID_PREV'].count().reset_index(name='PREV_REFUSED_CNT')
df_prev_grouped = df_prev_grouped.merge(df_prev_refused, on='SK_ID_CURR', how='left')
df_prev_grouped['PREV_REFUSED_RATIO'] = df_prev_grouped['PREV_REFUSED_CNT'] / df_prev_grouped['PREV_SK_ID_CURR_COUNT']

In [134]:
#combine with previous loan details
df_combined_prev = df_combined.merge(df_prev_grouped, on='SK_ID_CURR', how='left')
#separate training and test dataframes
df_train = df_combined_prev[~df_combined_prev['TARGET'].isnull()]
df_test = df_combined_prev[df_combined_prev['TARGET'].isnull()]
#print shape
print(df_train.shape)
print(df_test.shape)

(307511, 140)
(48744, 140)


# 7. 3rd Model Testing

* Train a 3rd model that takes previous loan histories and the current loan application information altogether
* **ANNUITY_TO_CREDIT** still has the highest feature importance in the new model
* We also observe that newly feature engineered column **PREV_PREV_INTERESTS_MAX**, which measures an applican't maximum interest rate in previous loan applications, has the 3rd highest feature importance in the new model.
* The new model's validation AUC score has increased to 0.769, compared to 0.764 in the 2nd model testing

In [135]:
#divide training dataset
df_fit_x = df_train[[col for col in df_train.columns if col != 'TARGET']]
df_fit_y = df_train['TARGET']

In [136]:
#create training/validation dataset
x_train, x_valid, y_train, y_valid = train_test_split(df_fit_x, df_fit_y, test_size = 0.3, random_state = 1000)

In [137]:
#baseline classifier
clf = LGBMClassifier(n_estimators=1000)
#fit model
clf.fit(x_train, y_train, eval_set = [(x_train, y_train), (x_valid, y_valid)], 
        eval_metric = 'auc', verbose = 100, early_stopping_rounds = 50)

[100]	training's auc: 0.820005	training's binary_logloss: 0.226688	valid_1's auc: 0.7691	valid_1's binary_logloss: 0.244101


LGBMClassifier(n_estimators=1000)

In [138]:
#observe feature importance
df_importance = pd.DataFrame(sorted(zip(x_train.columns, clf.feature_importances_)), columns = ['feature', 'importance']).sort_values(by='importance', ascending=False)
df_importance.head(10)

Unnamed: 0,feature,importance
10,ANNUITY_TO_CREDIT,249
27,DAYS_BIRTH,176
116,PREV_PREV_INTERESTS_MAX,160
43,EXT_SOURCE_2,148
44,EXT_SOURCE_3,148
16,APPS_EXT_SOURCE_STD,136
15,APPS_EXT_SOURCE_MEAN,123
30,DAYS_ID_PUBLISH,115
0,AMT_ANNUITY,112
114,PREV_AMT_DOWN_PAYMENT_MEAN,103


# 8. Summary

* So far, we have built 3 models
    * 1st Baseline Model without any feature engineering
    * 2nd Model - feature engineering columns only in **applications** dataset
    * 3rd Model - feature engineering columns in **applications** and **prev_applications** datasets and combining the two datasets together
* We have witnessed the validation AUC scores increasing from 0.757 -> 0.764 -> 0.769
* Although there was no dramatic increase, we can observe that as we add new features into the model, the performance seesm to improve little by little
* What we haven't done are hyperparameter tunings and considering other datasets included in the Kaggle's project such as user's bureau history and credit card balance dataset
* For future analysis, we can investigate more into the above to better improve our model.