In [93]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [94]:
full_train = pd.read_csv("./Data/application_train.csv")
test =  pd.read_csv("./Data/application_test.csv")

In [189]:
full_train.columns.values

array(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL',
       'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE',
       'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS',
       'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
       'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
       'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
       'ORGANIZATION_TYPE', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
      

In [190]:
full_train.iloc[0]

SK_ID_CURR                                           100002
TARGET                                                    1
NAME_CONTRACT_TYPE                               Cash loans
CODE_GENDER                                               M
FLAG_OWN_CAR                                              N
FLAG_OWN_REALTY                                           Y
CNT_CHILDREN                                              0
AMT_INCOME_TOTAL                                     202500
AMT_CREDIT                                           406598
AMT_ANNUITY                                         24700.5
AMT_GOODS_PRICE                                      351000
NAME_TYPE_SUITE                               Unaccompanied
NAME_INCOME_TYPE                                    Working
NAME_EDUCATION_TYPE           Secondary / secondary special
NAME_FAMILY_STATUS                     Single / not married
NAME_HOUSING_TYPE                         House / apartment
REGION_POPULATION_RELATIVE              

In [300]:
full_train[full_train.TARGET == 0]["CNT_FAM_MEMBERS"].value_counts()

2.0     18599
1.0      7716
3.0      6102
4.0      2805
5.0       420
6.0        45
7.0         9
8.0         4
9.0         2
10.0        1
Name: CNT_FAM_MEMBERS, dtype: int64

In [301]:
full_train[full_train.TARGET == 1]["CNT_FAM_MEMBERS"].value_counts()

2.0     1478
1.0      728
3.0      590
4.0      279
5.0       39
6.0       11
7.0        2
13.0       1
10.0       1
Name: CNT_FAM_MEMBERS, dtype: int64

In [97]:
bureau.shape

(1716428, 17)

In [285]:
cat_feats = ["NAME_CONTRACT_TYPE", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "OCCUPATION_TYPE", "FLAG_DOCUMENT_3"]
num_feats = ["AMT_INCOME_TOTAL", "CNT_CHILDREN", "AMT_CREDIT", "AMT_ANNUITY", "AMT_GOODS_PRICE", "DAYS_BIRTH",
             "DAYS_EMPLOYED", "DAYS_REGISTRATION", "DAYS_LAST_PHONE_CHANGE"]

In [286]:
## Categorical Features - Train
train_cat= full_train[cat_feats]
train_cat = pd.get_dummies(train_cat)

## Numerical Features - Train
train_num = full_train[num_feats]
# train[num_feats].isnull().any() # Checking if the column has any null value
#train_num = train_num.fillna((train_num.mean()), inplace=True)

## Categorical Features - Test
test_cat = test[cat_feats]
test_cat = pd.get_dummies(test_cat)

## Numerical Features - Test
test_num = test[num_feats]
test_num = test_num.fillna(test_num.mean())

# ## Standardize numerical features
# std = StandardScaler().fit(train_num)
# train_num = std.transform(train_num)
# test_num = std.transform(test_num)


In [287]:
full_train_feats = pd.concat([train_num, train_cat], axis=1)
test_feats = pd.concat([test_num, test_cat], axis=1)

In [288]:
full_train_feats = full_train_feats.apply(lambda x: x.fillna(x.mean()),axis=0)
test_feats = test_feats.apply(lambda x: x.fillna(x.mean()),axis=0)
full_train_y = full_train.TARGET

In [289]:
train_X, valid_X, train_y, valid_y = train_test_split(full_train_feats, full_train_y, train_size = 0.8)

In [292]:
### RF classifier
rf = RandomForestClassifier(n_estimators=300, n_jobs=-1)
rf.fit(train_X, train_y)
valid_probs_rf = rf.predict_proba(valid_X)[:,1]
valid_preds_rf = rf.predict(valid_X)

In [293]:
print(accuracy_score(valid_y, valid_preds_rf))
print(roc_auc_score(valid_y, valid_probs_rf))

0.917986352517
0.608047562536


In [230]:
params={
    'max_depth': [3, 5], #[3,4,5,6,7,8,9], # 5 is good but takes too long in kaggle env
    'subsample': [0.6, 0.8], #[0.4,0.5,0.6,0.7,0.8,0.9,1.0],
    'colsample_bytree': [0.5, 0.7], #[0.5,0.6,0.7,0.8],
    'n_estimators': [300, 500], #[1000,2000,3000]
    'reg_alpha': [0.01, 0.05] #[0.01, 0.02, 0.03, 0.04]
}

xgb_clf = xgb.XGBClassifier(missing=9999999999)
rs = GridSearchCV(xgb_clf,
                  params,
                  cv=3,
                  scoring="roc_auc",
                  n_jobs=1,
                  verbose=2)
rs.fit(train_X, train_y)
best_est = rs.best_estimator_
print(best_est)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] n_estimators=300, max_depth=3, colsample_bytree=0.5, subsample=0.6, reg_alpha=0.01 
[CV]  n_estimators=300, max_depth=3, colsample_bytree=0.5, subsample=0.6, reg_alpha=0.01, total=   2.2s
[CV] n_estimators=300, max_depth=3, colsample_bytree=0.5, subsample=0.6, reg_alpha=0.01 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.3s remaining:    0.0s


[CV]  n_estimators=300, max_depth=3, colsample_bytree=0.5, subsample=0.6, reg_alpha=0.01, total=   2.0s
[CV] n_estimators=300, max_depth=3, colsample_bytree=0.5, subsample=0.6, reg_alpha=0.01 
[CV]  n_estimators=300, max_depth=3, colsample_bytree=0.5, subsample=0.6, reg_alpha=0.01, total=   2.2s
[CV] n_estimators=300, max_depth=3, colsample_bytree=0.5, subsample=0.8, reg_alpha=0.01 
[CV]  n_estimators=300, max_depth=3, colsample_bytree=0.5, subsample=0.8, reg_alpha=0.01, total=   4.9s
[CV] n_estimators=300, max_depth=3, colsample_bytree=0.5, subsample=0.8, reg_alpha=0.01 
[CV]  n_estimators=300, max_depth=3, colsample_bytree=0.5, subsample=0.8, reg_alpha=0.01, total=   5.5s
[CV] n_estimators=300, max_depth=3, colsample_bytree=0.5, subsample=0.8, reg_alpha=0.01 
[CV]  n_estimators=300, max_depth=3, colsample_bytree=0.5, subsample=0.8, reg_alpha=0.01, total=   3.1s
[CV] n_estimators=300, max_depth=3, colsample_bytree=0.5, subsample=0.6, reg_alpha=0.05 
[CV]  n_estimators=300, max_depth=3

[CV]  n_estimators=500, max_depth=5, colsample_bytree=0.5, subsample=0.6, reg_alpha=0.05, total=   7.5s
[CV] n_estimators=500, max_depth=5, colsample_bytree=0.5, subsample=0.8, reg_alpha=0.05 
[CV]  n_estimators=500, max_depth=5, colsample_bytree=0.5, subsample=0.8, reg_alpha=0.05, total=   8.9s
[CV] n_estimators=500, max_depth=5, colsample_bytree=0.5, subsample=0.8, reg_alpha=0.05 
[CV]  n_estimators=500, max_depth=5, colsample_bytree=0.5, subsample=0.8, reg_alpha=0.05, total=   8.4s
[CV] n_estimators=500, max_depth=5, colsample_bytree=0.5, subsample=0.8, reg_alpha=0.05 
[CV]  n_estimators=500, max_depth=5, colsample_bytree=0.5, subsample=0.8, reg_alpha=0.05, total=   9.0s
[CV] n_estimators=300, max_depth=3, colsample_bytree=0.7, subsample=0.6, reg_alpha=0.01 
[CV]  n_estimators=300, max_depth=3, colsample_bytree=0.7, subsample=0.6, reg_alpha=0.01, total=   3.8s
[CV] n_estimators=300, max_depth=3, colsample_bytree=0.7, subsample=0.6, reg_alpha=0.01 
[CV]  n_estimators=300, max_depth=3

[CV]  n_estimators=500, max_depth=5, colsample_bytree=0.7, subsample=0.8, reg_alpha=0.01, total=  10.2s
[CV] n_estimators=500, max_depth=5, colsample_bytree=0.7, subsample=0.8, reg_alpha=0.01 
[CV]  n_estimators=500, max_depth=5, colsample_bytree=0.7, subsample=0.8, reg_alpha=0.01, total=  10.7s
[CV] n_estimators=500, max_depth=5, colsample_bytree=0.7, subsample=0.8, reg_alpha=0.01 
[CV]  n_estimators=500, max_depth=5, colsample_bytree=0.7, subsample=0.8, reg_alpha=0.01, total=  10.3s
[CV] n_estimators=500, max_depth=5, colsample_bytree=0.7, subsample=0.6, reg_alpha=0.05 
[CV]  n_estimators=500, max_depth=5, colsample_bytree=0.7, subsample=0.6, reg_alpha=0.05, total=  10.1s
[CV] n_estimators=500, max_depth=5, colsample_bytree=0.7, subsample=0.6, reg_alpha=0.05 
[CV]  n_estimators=500, max_depth=5, colsample_bytree=0.7, subsample=0.6, reg_alpha=0.05, total=   9.7s
[CV] n_estimators=500, max_depth=5, colsample_bytree=0.7, subsample=0.6, reg_alpha=0.05 
[CV]  n_estimators=500, max_depth=5

[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed: 10.2min finished


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=9999999999, n_estimators=300,
       nthread=-1, objective='binary:logistic', reg_alpha=0.05,
       reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=0.8)


In [244]:
xgb_single = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=9999999999, n_estimators=300,
       nthread=-1, objective='binary:logistic', reg_alpha=0.05,
       reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=0.8)

xgb_single.fit(train_X, train_y)
valid_probs_xgb_single = xgb_single.predict_proba(valid_X)[:,1]
valid_preds_xgb_single = xgb_single.predict(valid_X)
print(accuracy_score(valid_y, valid_preds_xgb_single))
print(roc_auc_score(valid_y, valid_probs_xgb_single))

0.919273850908
0.671098200046


In [231]:
valid_probs_rs = rs.predict_proba(valid_X)[:,1]
valid_preds_rs= rs.predict(valid_X)
print(accuracy_score(valid_y, valid_preds_rs))
print(roc_auc_score(valid_y, valid_probs_rs))

0.919273850908
0.671098200046


In [232]:
### Prepare submission file and save to disk
result_df = pd.DataFrame({'SK_ID_CURR':test.SK_ID_CURR.values, "TARGET":rs.predict_proba(test_feats)[:,1]})
result_df.to_csv("submimssion_2_xgboost_gridsearch_added_features.csv", index=False)