In [106]:
import pandas as pd
import category_encoders as ce
import numpy as np

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from skopt import BayesSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score

In [107]:
# Cap on the number of CSV rows read from the training file.
N_ROWS = 10_000

In [108]:
# Read only the first N_ROWS rows of the training table.
df = pd.read_csv(
    'data/application_train.csv',
    nrows=N_ROWS,
)

In [109]:
# Preview the first five rows (five is the default for `head`).
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [110]:
# Features: every column except the row identifier and the label.
# errors='ignore' keeps the original mask-based behaviour of silently
# skipping a listed column that is not present.
X = df.drop(columns=['SK_ID_CURR', 'TARGET'], errors='ignore')

# Label column, cast to integers.
y = df['TARGET'].astype(int)

In [111]:
# Hold out 25% of the rows for evaluation; shuffled with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=104,
)

In [112]:
# Columns stored with object dtype are treated as categorical.
categorical_feature_mask = X.dtypes == object
categorical_features = [
    column
    for column, is_categorical in zip(X.columns, categorical_feature_mask)
    if is_categorical
]

In [113]:
# Every column that is not object dtype is treated as numeric.
numeric_feature_mask = X.dtypes != object
numeric_features = [
    column
    for column, is_numeric in zip(X.columns, numeric_feature_mask)
    if is_numeric
]

In [114]:
# Categorical branch: fill missing values with the most frequent category,
# then replace each category with a target-based encoding.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target', ce.TargetEncoder()),
])



In [115]:
# Numeric branch: fill missing values with the column median, then
# standardise with StandardScaler.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [116]:
# Route each column group through its own sub-pipeline; numeric outputs come
# first, categorical outputs second.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [117]:
# Preprocessing-only pipeline; the classifier is fitted separately on its output.
pipe = Pipeline(steps=[('preprocessor', preprocessor)])

In [118]:
# 3-fold stratified cross-validation repeated 3 times (9 fits per candidate),
# seeded for reproducibility.
cv = RepeatedStratifiedKFold(
    n_splits=3,
    n_repeats=3,
    random_state=1,
)

In [120]:
# Number of Bayesian-optimisation iterations.
N_ITER = 100

estimator = XGBClassifier(eval_metric='auc', use_label_encoder=False)

# The classifier is trained on the *output* of `pipe`, so the early-stopping
# evaluation set has to live in that same feature space. Passing the raw
# `X_test` frame (unencoded strings, unscaled numbers) would not match.
# The pipeline is fitted on the training split only, so no hold-out
# statistics leak into the preprocessing.
# NOTE(review): the hold-out split is reused for early stopping here, so the
# hold-out score reported later is optimistically biased; a dedicated
# validation split would avoid that.
X_eval_transformed = pipe.fit(X_train, y_train).transform(X_test)

fit_params = {
    'early_stopping_rounds': 10,
    'eval_set': [(X_eval_transformed, y_test)],
    'verbose': False,
}

# Negative-to-positive ratio of the training labels, used as the upper bound
# for the class-weight search.
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)

search_space = {
    'max_depth': (1, 6),
    'n_estimators': (50, 500),
    'min_child_weight': (1, 100),
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    # Search from "no re-weighting" (1.0) up to the full imbalance ratio.
    # A lower bound of 0 would give positive samples zero weight.
    # Assumes the positive class is the minority (ratio > 1).
    'scale_pos_weight': (1.0, ratio),
}

# Bayesian hyper-parameter search scored by cross-validated ROC AUC.
opt = BayesSearchCV(
    estimator=estimator,
    search_spaces=search_space,
    fit_params=fit_params,
    cv=cv,
    scoring="roc_auc",
    random_state=42,
    n_iter=N_ITER,
    verbose=1,
)

In [121]:
# Fit the preprocessing pipeline on the training split only, then apply the
# already-fitted pipeline to the hold-out split.
X_train_transformed = pipe.fit_transform(X=X_train, y=y_train)
X_test_transformed = pipe.transform(X=X_test)



In [None]:
# Run the hyper-parameter search on the preprocessed training split.
opt.fit(X=X_train_transformed, y=y_train)

Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fits
Fitting 9 folds for each of 1 candidates, totalling 9 fi

In [None]:
# Tabulate the per-candidate cross-validation results.
pd.DataFrame(data=opt.cv_results_)

In [None]:
# Display the best hyper-parameter combination found by the search.
opt.best_params_

In [None]:
# Display the estimator stored by the search as its best candidate.
opt.best_estimator_

In [None]:
# ROC AUC measures how well the model *ranks* samples, so it needs continuous
# scores. Hard 0/1 labels from `predict` collapse the ROC curve to a single
# threshold and understate the AUC. Column 1 of `predict_proba` is the
# positive-class probability.
roc_auc_score(y_test, opt.predict_proba(X_test_transformed)[:, 1])

In [None]:
# Score the competition test set and write the submission file.
test_df = pd.read_csv('data/application_test.csv')

# Drop the identifier so the feature columns match the training features.
test_df_ = test_df.drop(columns=['SK_ID_CURR'])
test_df_transformed = pipe.transform(test_df_)

# `assign` returns a new frame, which avoids writing into a slice of
# `test_df` (SettingWithCopyWarning). TARGET holds the positive-class
# probability, consistent with the ROC AUC objective the model was tuned on;
# hard 0/1 labels would discard the ranking information.
submission_df = test_df[['SK_ID_CURR']].assign(
    TARGET=opt.predict_proba(test_df_transformed)[:, 1]
)
submission_df.to_csv('submission.csv', index=False)