In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
%matplotlib inline

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn import cross_validation
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
#from xgboost import XGBClassifier
from sklearn.grid_search import GridSearchCV
#import xgboost as xgb
from sklearn.metrics import mean_squared_error
import lightgbm as lgb



# Data Loading

In [5]:
application_train = pd.read_csv('application_train.csv')
application_test = pd.read_csv('application_test.csv')
bureau_balance = pd.read_csv('bureau_balance.csv')
bureau = pd.read_csv('bureau.csv')
credit_card_balance = pd.read_csv('credit_card_balance.csv')
installments_payments = pd.read_csv('installments_payments.csv')
POS_CASH_balance = pd.read_csv('POS_CASH_balance.csv')
previous_application = pd.read_csv('previous_application.csv')


In [6]:
print(application_train.shape)
application_train.head(3)

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
X_train = application_train.drop(['TARGET'],axis =1)
Y_train = application_train['TARGET']

In [8]:
Y_train.value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64

In [7]:
# Finding Features with missing values

In [9]:
pct_null = X_train.isnull().sum() / len(X_train)
missing_features = pct_null[pct_null > 0.65].index
print(len(missing_features))
missing_features

17


Index([u'OWN_CAR_AGE', u'YEARS_BUILD_AVG', u'COMMONAREA_AVG', u'FLOORSMIN_AVG',
       u'LIVINGAPARTMENTS_AVG', u'NONLIVINGAPARTMENTS_AVG',
       u'YEARS_BUILD_MODE', u'COMMONAREA_MODE', u'FLOORSMIN_MODE',
       u'LIVINGAPARTMENTS_MODE', u'NONLIVINGAPARTMENTS_MODE',
       u'YEARS_BUILD_MEDI', u'COMMONAREA_MEDI', u'FLOORSMIN_MEDI',
       u'LIVINGAPARTMENTS_MEDI', u'NONLIVINGAPARTMENTS_MEDI',
       u'FONDKAPREMONT_MODE'],
      dtype='object')

In [10]:
X_test = application_test
print(application_test.shape)
application_test.head(1)

(48744, 121)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
pct_null_test = X_test.isnull().sum() / len(X_test)
missing_features_test = pct_null_test[pct_null_test > 0.65].index
print(len(missing_features_test))
missing_features_test

17


Index([u'OWN_CAR_AGE', u'YEARS_BUILD_AVG', u'COMMONAREA_AVG', u'FLOORSMIN_AVG',
       u'LIVINGAPARTMENTS_AVG', u'NONLIVINGAPARTMENTS_AVG',
       u'YEARS_BUILD_MODE', u'COMMONAREA_MODE', u'FLOORSMIN_MODE',
       u'LIVINGAPARTMENTS_MODE', u'NONLIVINGAPARTMENTS_MODE',
       u'YEARS_BUILD_MEDI', u'COMMONAREA_MEDI', u'FLOORSMIN_MEDI',
       u'LIVINGAPARTMENTS_MEDI', u'NONLIVINGAPARTMENTS_MEDI',
       u'FONDKAPREMONT_MODE'],
      dtype='object')

In [12]:
categorical_features = X_train.select_dtypes(include=[object]).columns
categorical_features

Index([u'NAME_CONTRACT_TYPE', u'CODE_GENDER', u'FLAG_OWN_CAR',
       u'FLAG_OWN_REALTY', u'NAME_TYPE_SUITE', u'NAME_INCOME_TYPE',
       u'NAME_EDUCATION_TYPE', u'NAME_FAMILY_STATUS', u'NAME_HOUSING_TYPE',
       u'OCCUPATION_TYPE', u'WEEKDAY_APPR_PROCESS_START', u'ORGANIZATION_TYPE',
       u'FONDKAPREMONT_MODE', u'HOUSETYPE_MODE', u'WALLSMATERIAL_MODE',
       u'EMERGENCYSTATE_MODE'],
      dtype='object')

In [13]:
#merging Test and Train to remove the missing values and label encode the catgorical features

train_test_merge = pd.concat([X_train,X_test])
train_test_merge.shape

(356255, 121)

In [14]:
train_test_merge_categorical = train_test_merge[categorical_features].replace(np.NAN, '-1')

In [15]:
train_test_merge_cat_encode = train_test_merge_categorical[categorical_features].apply(LabelEncoder().fit_transform)

In [16]:
train_test_merge_noncategorical = train_test_merge.drop(categorical_features,axis =1).replace(np.NAN, -1)

In [17]:
train_test_merge_final = pd.concat([train_test_merge_cat_encode,train_test_merge_noncategorical],axis = 1)

In [18]:
pct_null = train_test_merge_cat_encode.isnull().sum() / len(train_test_merge_cat_encode)
missing_features = pct_null[pct_null > 0].index
print(len(missing_features))
missing_features

0


Index([], dtype='object')

In [19]:
X_train_final = train_test_merge_final.iloc[:307511]
X_test_final = train_test_merge_final[307511:356255]

# Train-Test Split

In [20]:

x_train,x_val,y_train,y_val = cross_validation.train_test_split(X_train_final,Y_train,test_size = 0.3,random_state = 0,stratify=Y_train)

# Fitting Models with only Application Data-set

# Random Forest

In [23]:
classifier_RF = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)

In [26]:
classifier_RF.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [27]:
y_train_RF = classifier_RF.predict_proba(x_train)
y_val_RF = classifier_RF.predict_proba(x_val)

In [28]:
y_train_RF = pd.DataFrame(y_train_RF).iloc[:,1]
y_val_RF = pd.DataFrame(y_val_RF).iloc[:,1]


In [29]:
print("ROC for Train", roc_auc_score(y_train,y_train_RF))
print("ROC for Validation",roc_auc_score(y_val,y_val_RF))

ROC for Train 1.0
ROC for Validation 0.71925738869


# Light GBM

In [23]:
from lightgbm import LGBMClassifier
clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

In [24]:
clf.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_val, y_val)], 
            eval_metric= 'auc', verbose= 100, early_stopping_rounds= 100)

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.754988	valid_1's auc: 0.741619
[200]	training's auc: 0.771379	valid_1's auc: 0.751849
[300]	training's auc: 0.781317	valid_1's auc: 0.756424
[400]	training's auc: 0.789706	valid_1's auc: 0.7587
[500]	training's auc: 0.796689	valid_1's auc: 0.759574
[600]	training's auc: 0.802898	valid_1's auc: 0.76002
[700]	training's auc: 0.808844	valid_1's auc: 0.760363
[800]	training's auc: 0.814203	valid_1's auc: 0.760398
[900]	training's auc: 0.819507	valid_1's auc: 0.760414
Early stopping, best iteration is:
[837]	training's auc: 0.816167	valid_1's auc: 0.760481


LGBMClassifier(boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9497036, learning_rate=0.02, max_depth=8,
        min_child_samples=20, min_child_weight=39.3259775,
        min_split_gain=0.0222415, n_estimators=10000, n_jobs=-1, nthread=4,
        num_leaves=34, objective=None, random_state=None,
        reg_alpha=0.041545473, reg_lambda=0.0735294, silent=-1,
        subsample=0.8715623, subsample_for_bin=200000, subsample_freq=0,
        verbose=-1)

In [25]:
lgbm_params = {
    'boosting': 'dart',
    'application': 'binary',
    'learning_rate': 0.1,
    'min_data_in_leaf': 30,
    'num_leaves': 31,
    'max_depth': -1,
    'feature_fraction': 0.5,
    'scale_pos_weight': 2,
    'drop_rate': 0.02
}

In [28]:
x_train_n =lgb.Dataset(x_train,label=y_train)
cv_results = lgb.cv(train_set=x_train_n,
                     params=lgbm_params,
                     nfold=5,
                     num_boost_round=600,
                     early_stopping_rounds=50,
                     verbose_eval=50,
                     metrics=['auc'])


[50]	cv_agg's auc: 0.751011 + 0.00651481
[100]	cv_agg's auc: 0.753719 + 0.00640304
[150]	cv_agg's auc: 0.753969 + 0.00622002
[200]	cv_agg's auc: 0.755065 + 0.00601078
[250]	cv_agg's auc: 0.755155 + 0.00614526
[300]	cv_agg's auc: 0.755678 + 0.00605166


In [31]:
optimum_boost_rounds = np.argmax(cv_results['auc-mean'])
print('Optimum boost rounds = {}'.format(optimum_boost_rounds))
print('Best CV result = {}'.format(np.max(cv_results['auc-mean'])))

clf = lgb.train(train_set=x_train_n,
                 params=lgbm_params,
                 num_boost_round=optimum_boost_rounds)

Optimum boost rounds = 284
Best CV result = 0.755733193988


In [31]:
x_train_n =lgb.Dataset(x_train,label=y_train)
lgbm_n=lgb.train(param,x_train_n,num_round)

In [32]:
y_train_LGBM =lgbm_n.predict(x_train)
y_val_LGBM =lgbm_n.predict(x_val)

In [33]:
print("ROC for Train", roc_auc_score(y_train,y_train_LGBM))
print("ROC for Validation",roc_auc_score(y_val,y_val_LGBM))

ROC for Train 0.904038469572
ROC for Validation 0.754223866043


In [36]:
y_test_pred = lgbm_n.predict(X_test_final)
pd.DataFrame(y_test_pred).to_csv('Home_Pred.csv')

# Logistic Regression

In [37]:
classifier_LR = LogisticRegression(penalty='l2',random_state = 0)
classifier_LR.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
y_train_LR = classifier_LR.predict_proba(x_train)
y_val_LR= classifier_LR.predict_proba(x_val)

In [39]:
y_train_LR = pd.DataFrame(y_train_LR).iloc[:,1]
y_val_LR= pd.DataFrame(y_val_LR).iloc[:,1]

In [40]:
print("ROC for Train", roc_auc_score(y_train,y_train_LR))
print("ROC for Validation",roc_auc_score(y_val,y_val_LR))

ROC for Train 0.621804328972
ROC for Validation 0.61999549968


# Supplementary Data Considerations

# Bureau

In [82]:
bureau_final = bureau_final.drop('SK_ID_BUREAU', axis =1)

In [None]:
# Join Application with Bureau

# Bureau-Balance

In [52]:
bureau_balance['STATUS'].value_counts()

C    13646993
0     7499507
X     5810482
1      242347
5       62406
2       23419
3        8924
4        5847
Name: STATUS, dtype: int64

In [102]:
bureau_unstacked = bureau_balance.groupby('SK_ID_BUREAU')['STATUS'].value_counts().unstack('STATUS')


In [103]:
bureau_unstacked.columns = ['STATUS_0', 'STATUS_1','STATUS_2','STATUS_3','STATUS_4','STATUS_5','STATUS_C','STATUS_X',]
bureau_unstacked['MONTHS_COUNT'] = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].size()
bureau_unstacked['MONTHS_MIN'] = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].max()
bureau_unstacked['MONTHS_MAX'] = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].max()

In [109]:
bureau_unstacked.head(1)

Unnamed: 0_level_0,STATUS_0,STATUS_1,STATUS_2,STATUS_3,STATUS_4,STATUS_5,STATUS_C,STATUS_X,MONTHS_COUNT,MONTHS_MIN,MONTHS_MAX
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5001709,,,,,,,86.0,11.0,97,0,0


# Bureau

In [None]:
bureau = bureau.merge(bureau_unstacked, how = 'left', on = 'SK_ID_BUREAU')

In [None]:
avg_bureau = bureau.groupby('SK_ID_CURR').mean()

In [None]:
bureau_credit_type = bureau.groupby('SK_ID_CURR')['CREDIT_ACTIVE'].value_counts().unstack('CREDIT_ACTIVE').replace(np.NAN,0)

In [None]:
bureau_final = pd.concat([avg_buro,bureau_credit_type],axis =1)

In [None]:
bureau_final['Bureau_No'] = bureau.groupby('SK_ID_CURR').size()

In [None]:
bureau_final = bureau_final.drop('SK_ID_BUREAU', axis =1)