In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

try:
    # `sklearn.cross_validation` was removed in scikit-learn 0.20; use its replacement.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [154]:
# Load the raw train and test application tables from ./Data.
full_train_orig, test = map(
    pd.read_csv,
    ["./Data/application_train.csv", "./Data/application_test.csv"],
)

In [159]:
'''
Remove columns with more than 100 null values (filters out 62 of the 122 columns)
and columns that are not useful (based on EDA), which reduces the feature count to 43.
'''

# Names of the training columns that contain more than 100 null values.
null_columns = (
    full_train_orig.isnull().sum()
    .loc[lambda null_counts: null_counts > 100]
    .index.tolist()
)
# Document-flag columns judged uninformative during EDA.
useless_columns = [
    "FLAG_DOCUMENT_2",
    "FLAG_DOCUMENT_4",
    "FLAG_DOCUMENT_5",
    "FLAG_DOCUMENT_7",
    "FLAG_DOCUMENT_9",
    "FLAG_DOCUMENT_10",
    "FLAG_DOCUMENT_11",
    "FLAG_DOCUMENT_12",
    "FLAG_DOCUMENT_13",
    "FLAG_DOCUMENT_14",
    "FLAG_DOCUMENT_15",
    "FLAG_DOCUMENT_16",
    "FLAG_DOCUMENT_17",
    "FLAG_DOCUMENT_18",
    "FLAG_DOCUMENT_19",
    "FLAG_DOCUMENT_20",
    "FLAG_DOCUMENT_21",
]

In [160]:
### Drop the mostly-null columns and the manually selected uninformative ones (based on EDA)
full_train = full_train_orig.drop(
    labels=null_columns + useless_columns,
    axis="columns",
)

In [171]:
#cat_columns_all = set(full_train.columns) - set(full_train._get_numeric_data().columns.tolist())

In [172]:
# cat_feats = ["CODE_GENDER", "NAME_CONTRACT_TYPE", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", 
#              "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "OCCUPATION_TYPE",
#              "NAME_HOUSING_TYPE", "FLAG_DOCUMENT_3"]
# num_feats = ["AMT_INCOME_TOTAL", "CNT_CHILDREN", "AMT_CREDIT", "AMT_ANNUITY", "AMT_GOODS_PRICE", "DAYS_BIRTH",
#              "DAYS_EMPLOYED", "DAYS_REGISTRATION", "DAYS_LAST_PHONE_CHANGE", "AMT_REQ_CREDIT_BUREAU_YEAR", 
#             "REGION_POPULATION_RELATIVE", ]

In [173]:
# Separate the target from the features and index the rows by SK_ID_CURR.
full_train_y = full_train.TARGET.values
full_train = full_train.drop(["TARGET"], axis=1)
full_train = full_train.set_index("SK_ID_CURR")

# Numeric feature names, taken from pandas' `_get_numeric_data` helper
# (kept unchanged so the numeric/categorical split itself does not move).
num_feats = full_train._get_numeric_data().columns.values.tolist()

# Every remaining column is treated as categorical. The names are derived in
# DataFrame column order instead of via a set difference: iteration order of a
# set of strings is not stable across interpreter runs, which would reshuffle
# the feature columns (and hence column subsampling in the models) on every
# kernel restart.
cat_feats = full_train.columns.drop(num_feats).tolist()

In [174]:
## Categorical Features - Train (one-hot encoded)
train_cat = full_train[cat_feats]
train_cat = pd.get_dummies(train_cat)

## Numerical Features - Train
# Missing values are not imputed in this cell.
train_num = full_train[num_feats]

## Categorical Features - Test (one-hot encoded)
test_cat = test[cat_feats]
test_cat = pd.get_dummies(test_cat)

## Numerical Features - Test
# Impute with the TRAINING column means rather than the test means, so the
# fill values seen at prediction time come from the same statistics as the
# data the model is trained on.
test_num = test[num_feats]
test_num = test_num.fillna(train_num.mean())

# ## Standardize numerical features
# std = StandardScaler().fit(train_num)
# train_num = std.transform(train_num)
# test_num = std.transform(test_num)


In [175]:
# Assemble the final feature matrices: numeric columns first, then the dummies.
full_train_feats = pd.concat(objs=[train_num, train_cat], axis="columns")
test_feats = pd.concat(objs=[test_num, test_cat], axis="columns")

In [176]:
# Replace remaining missing values in each column with that column's own mean.
full_train_feats = full_train_feats.fillna(full_train_feats.mean())
test_feats = test_feats.fillna(test_feats.mean())

In [177]:
# Hold out 20% of the training data for validation.
# - random_state pins the split so reruns give the same partition.
# - stratify keeps the TARGET class ratio equal in both partitions.
# - test_size is spelled out so it does not depend on the library default.
train_X, valid_X, train_y, valid_y = train_test_split(
    full_train_feats,
    full_train_y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
    stratify=full_train_y,
)

In [180]:
### RF classifier
# Grid of tree depths and forest sizes explored by the search below.
params_rf = {
    'max_depth': [20, 40, 60],
    'n_estimators': [100, 300, 500],
}

# The forest is fitted without CODE_GENDER_XNA, so the validation frame must
# drop the same column before prediction; otherwise the feature sets differ.
rf_train_X = train_X.drop("CODE_GENDER_XNA", axis=1)
rf_valid_X = valid_X.drop("CODE_GENDER_XNA", axis=1)

# RandomForestClassifier has no `missing` argument (that is an XGBoost option),
# so it is not passed here. random_state makes the fitted forest repeatable.
rf_clf = RandomForestClassifier(random_state=0)
rf = GridSearchCV(rf_clf,
                  params_rf,
                  cv=3,
                  scoring="roc_auc",
                  n_jobs=1,
                  verbose=2)
rf.fit(rf_train_X, train_y)
best_est_rf = rf.best_estimator_
print(best_est_rf)

# Positive-class probabilities (for ROC AUC) and hard labels (for accuracy).
valid_probs_rf = rf.predict_proba(rf_valid_X)[:, 1]
valid_preds_rf = rf.predict(rf_valid_X)

In [181]:
# Random forest validation scores: accuracy on hard labels, ROC AUC on probabilities.
rf_valid_accuracy = accuracy_score(valid_y, valid_preds_rf)
rf_valid_auc = roc_auc_score(valid_y, valid_probs_rf)
print(rf_valid_accuracy)
print(rf_valid_auc)

0.918115102356
0.672319855992


In [200]:
# XGBoost search grid: two values per parameter, i.e. 2**5 = 32 candidates.
# Wider ranges tried earlier are kept in the trailing comments for reference.
params = {
    'max_depth': [3, 5],             # earlier: [3,4,5,6,7,8,9]; 5 is good but slow in the kaggle env
    'subsample': [0.6, 0.8],         # earlier: [0.4,0.5,0.6,0.7,0.8,0.9,1.0]
    'colsample_bytree': [0.5, 0.7],  # earlier: [0.5,0.6,0.7,0.8]
    'n_estimators': [300, 500],      # earlier: [1000,2000,3000]
    'reg_alpha': [0.01, 0.05],       # earlier: [0.01, 0.02, 0.03, 0.04]
}

# Training features without the CODE_GENDER_XNA dummy column.
xgb_train_X = train_X.drop("CODE_GENDER_XNA", axis=1)

xgb_clf = xgb.XGBClassifier(missing=9999999999)
rs = GridSearchCV(
    estimator=xgb_clf,
    param_grid=params,
    scoring="roc_auc",
    cv=3,
    n_jobs=1,
    verbose=2,
)
rs.fit(xgb_train_X, train_y)

# Keep and show the best configuration found by the search.
best_est = rs.best_estimator_
print(best_est)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] reg_alpha=0.01, subsample=0.6, max_depth=3, n_estimators=300, colsample_bytree=0.5 
[CV]  reg_alpha=0.01, subsample=0.6, max_depth=3, n_estimators=300, colsample_bytree=0.5, total=   5.5s
[CV] reg_alpha=0.01, subsample=0.6, max_depth=3, n_estimators=300, colsample_bytree=0.5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.6s remaining:    0.0s


[CV]  reg_alpha=0.01, subsample=0.6, max_depth=3, n_estimators=300, colsample_bytree=0.5, total=   6.7s
[CV] reg_alpha=0.01, subsample=0.6, max_depth=3, n_estimators=300, colsample_bytree=0.5 
[CV]  reg_alpha=0.01, subsample=0.6, max_depth=3, n_estimators=300, colsample_bytree=0.5, total=   5.6s
[CV] reg_alpha=0.01, subsample=0.8, max_depth=3, n_estimators=300, colsample_bytree=0.5 
[CV]  reg_alpha=0.01, subsample=0.8, max_depth=3, n_estimators=300, colsample_bytree=0.5, total=   4.9s
[CV] reg_alpha=0.01, subsample=0.8, max_depth=3, n_estimators=300, colsample_bytree=0.5 
[CV]  reg_alpha=0.01, subsample=0.8, max_depth=3, n_estimators=300, colsample_bytree=0.5, total=   4.8s
[CV] reg_alpha=0.01, subsample=0.8, max_depth=3, n_estimators=300, colsample_bytree=0.5 
[CV]  reg_alpha=0.01, subsample=0.8, max_depth=3, n_estimators=300, colsample_bytree=0.5, total=   5.6s
[CV] reg_alpha=0.05, subsample=0.6, max_depth=3, n_estimators=300, colsample_bytree=0.5 
[CV]  reg_alpha=0.05, subsample=0.6

[CV]  reg_alpha=0.05, subsample=0.6, max_depth=5, n_estimators=500, colsample_bytree=0.5, total=  17.8s
[CV] reg_alpha=0.05, subsample=0.8, max_depth=5, n_estimators=500, colsample_bytree=0.5 
[CV]  reg_alpha=0.05, subsample=0.8, max_depth=5, n_estimators=500, colsample_bytree=0.5, total=  16.9s
[CV] reg_alpha=0.05, subsample=0.8, max_depth=5, n_estimators=500, colsample_bytree=0.5 
[CV]  reg_alpha=0.05, subsample=0.8, max_depth=5, n_estimators=500, colsample_bytree=0.5, total=  17.0s
[CV] reg_alpha=0.05, subsample=0.8, max_depth=5, n_estimators=500, colsample_bytree=0.5 
[CV]  reg_alpha=0.05, subsample=0.8, max_depth=5, n_estimators=500, colsample_bytree=0.5, total=  16.4s
[CV] reg_alpha=0.01, subsample=0.6, max_depth=3, n_estimators=300, colsample_bytree=0.7 
[CV]  reg_alpha=0.01, subsample=0.6, max_depth=3, n_estimators=300, colsample_bytree=0.7, total=   8.5s
[CV] reg_alpha=0.01, subsample=0.6, max_depth=3, n_estimators=300, colsample_bytree=0.7 
[CV]  reg_alpha=0.01, subsample=0.6

[CV]  reg_alpha=0.01, subsample=0.8, max_depth=5, n_estimators=500, colsample_bytree=0.7, total=  21.9s
[CV] reg_alpha=0.01, subsample=0.8, max_depth=5, n_estimators=500, colsample_bytree=0.7 
[CV]  reg_alpha=0.01, subsample=0.8, max_depth=5, n_estimators=500, colsample_bytree=0.7, total=  25.5s
[CV] reg_alpha=0.01, subsample=0.8, max_depth=5, n_estimators=500, colsample_bytree=0.7 
[CV]  reg_alpha=0.01, subsample=0.8, max_depth=5, n_estimators=500, colsample_bytree=0.7, total=  21.9s
[CV] reg_alpha=0.05, subsample=0.6, max_depth=5, n_estimators=500, colsample_bytree=0.7 
[CV]  reg_alpha=0.05, subsample=0.6, max_depth=5, n_estimators=500, colsample_bytree=0.7, total=  21.3s
[CV] reg_alpha=0.05, subsample=0.6, max_depth=5, n_estimators=500, colsample_bytree=0.7 
[CV]  reg_alpha=0.05, subsample=0.6, max_depth=5, n_estimators=500, colsample_bytree=0.7, total=  21.5s
[CV] reg_alpha=0.05, subsample=0.6, max_depth=5, n_estimators=500, colsample_bytree=0.7 
[CV]  reg_alpha=0.05, subsample=0.6

[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed: 20.1min finished


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=9999999999, n_estimators=300,
       nthread=-1, objective='binary:logistic', reg_alpha=0.01,
       reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=0.8)


In [183]:
# xgb_single = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
#        gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
#        min_child_weight=1, missing=9999999999, n_estimators=300,
#        nthread=-1, objective='binary:logistic', reg_alpha=0.05,
#        reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
#        subsample=0.8)

# xgb_single.fit(train_X, train_y)
# valid_probs_xgb_single = xgb_single.predict_proba(valid_X)[:,1]
# valid_preds_xgb_single = xgb_single.predict(valid_X)
# print(accuracy_score(valid_y, valid_preds_xgb_single))
# print(roc_auc_score(valid_y, valid_probs_xgb_single))

In [203]:
# Score the tuned XGBoost model on the hold-out set, using the same column set
# it was fitted on (CODE_GENDER_XNA removed).
xgb_valid_X = valid_X.drop("CODE_GENDER_XNA", axis=1)
valid_probs_rs = rs.predict_proba(xgb_valid_X)[:, 1]
valid_preds_rs = rs.predict(xgb_valid_X)

# Accuracy first, then ROC AUC.
print(accuracy_score(valid_y, valid_preds_rs))
print(roc_auc_score(valid_y, valid_probs_rs))

0.917986352517
0.709861010787


In [204]:
### Prepare submission file and save to disk
# Train and test were one-hot encoded separately, so their dummy columns are
# not guaranteed to match in set or order. Align the test features to exactly
# the columns the model was fitted on: categories unseen in test become
# all-zero columns, and columns the model never saw are discarded.
model_columns = train_X.drop("CODE_GENDER_XNA", axis=1).columns
test_model_feats = test_feats.reindex(columns=model_columns, fill_value=0)

# One row per test applicant: id plus predicted probability of TARGET == 1.
result_df = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    "TARGET": rs.predict_proba(test_model_feats)[:, 1],
})
result_df.to_csv("submimssion_2_xgboost_gridsearch_allfeatures.csv", index=False)