In [48]:
import pandas as pd
import category_encoders as ce
import numpy as np

from xgboost import XGBClassifier

from feature_engine.selection import (DropFeatures, DropConstantFeatures, 
                                      DropDuplicateFeatures)

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from skopt import BayesSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score

In [49]:
N_ROWS=100000

In [50]:
df=pd.read_csv('data/application_train.csv', nrows=N_ROWS)

In [51]:
df.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
X=df.loc[:, ~df.columns.isin(['TARGET'])]
y=df['TARGET'].astype(int)

In [53]:
# using the train test split function
X_train, X_test,y_train, y_test = train_test_split(X,y,random_state=104,test_size=0.25,shuffle=True)

In [54]:
categorical_feature_mask = X.dtypes==object
categorical_features = X.columns[categorical_feature_mask].tolist()

In [55]:
numeric_feature_mask = X.dtypes!=object
numeric_features = X.columns[numeric_feature_mask].tolist()

In [56]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target', ce.TargetEncoder()),
#     ('onehot', OneHotEncoder(handle_unknown='ignore')),
])



In [57]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('log', FunctionTransformer(np.log1p)),
    ('imputer', SimpleImputer(strategy='median')),
])

In [58]:
feature_engine = Pipeline(steps=[
    # Step 1: Drop irrelevant columns/features
    ('drop_columns', DropFeatures(['SK_ID_CURR'])),
    ('drop_constant_values', DropConstantFeatures(tol=1, missing_values='ignore')),
    ('drop_duplicates', DropDuplicateFeatures())
])

In [59]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [60]:
pipe = Pipeline([
    ('preprocessor', preprocessor)
])

In [61]:
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)

In [62]:
N_ITER=10

estimator = XGBClassifier(eval_metric='auc', use_label_encoder=False)

fit_params = {
    'early_stopping_rounds': 10,
    'eval_set':[(X_test, y_test)],
    'verbose': False,
}

ratio = float(np.sum(y_train == 0)) / np.sum(y_train==1)

search_space = {
    'max_depth': (1, 6),
    'n_estimators': (50, 500),
    'min_child_weight': (1, 100),
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'scale_pos_weight' : (0, ratio),
}

opt = BayesSearchCV(
    estimator=estimator,
    search_spaces=search_space,
    fit_params=fit_params,
    cv=cv,
    scoring="roc_auc",
    random_state=42,
    n_iter=N_ITER,
    verbose=1,
    return_train_score=True,
)

In [63]:
X_train_transformed=pipe.fit_transform(X_train, y_train)
X_test_transformed=pipe.transform(X_test)

print(X_train.shape)
print(X_train_transformed.shape)

  return func(X, **(kw_args if kw_args else {}))
  return func(X, **(kw_args if kw_args else {}))


(75000, 121)
(75000, 121)


In [64]:
X_train

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
27438,131887,Cash loans,M,Y,Y,0,157500.0,862560.0,25348.5,720000.0,...,0,0,0,0,,,,,,
4648,105436,Cash loans,F,N,Y,0,135000.0,167895.0,16735.5,157500.0,...,0,0,0,0,0.0,0.0,0.0,1.0,3.0,0.0
23434,127268,Cash loans,M,N,Y,0,202500.0,284400.0,16456.5,225000.0,...,0,0,0,0,0.0,0.0,0.0,1.0,2.0,7.0
20664,124091,Cash loans,M,N,Y,0,360000.0,835380.0,35523.0,675000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3520,104111,Cash loans,F,N,N,2,135000.0,781920.0,28215.0,675000.0,...,0,0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54399,163017,Cash loans,F,N,N,0,112500.0,231813.0,17460.0,193500.0,...,0,0,0,0,,,,,,
39078,145258,Cash loans,M,Y,N,2,135000.0,74628.0,8568.0,67500.0,...,0,0,0,0,,,,,,
62715,172743,Cash loans,F,N,N,0,180000.0,284400.0,16456.5,225000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
54977,163701,Cash loans,M,Y,N,0,180000.0,1546020.0,42642.0,1350000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
pd.DataFrame(X_train_transformed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111,112,113,114,115,116,117,118,119,120
0,-1.510939,-0.861206,-0.027986,0.508301,-0.127107,0.404201,-1.446821,-0.617132,-0.608339,0.325140,...,0.090808,0.076047,0.079341,0.080995,0.082258,0.102371,0.082789,0.081603,0.083986,0.081606
1,0.310345,-0.861206,-0.082308,-0.139392,-1.262154,-0.252438,-0.172957,-1.422921,-0.609514,-2.273223,...,0.090808,0.076047,0.079341,0.080995,0.078841,0.094124,0.082789,0.081603,0.074188,0.081606
2,-2.492280,-0.861206,0.072534,-1.517358,-1.332997,-1.874100,-0.158669,0.320088,-0.754826,0.325140,...,0.090808,0.076047,0.079341,0.080995,0.082258,0.067133,0.082789,0.081603,0.083986,0.081606
3,0.310345,-0.861206,0.361252,0.466635,0.461578,0.319084,0.734733,0.320088,-0.654301,0.339391,...,0.090808,0.076047,0.079341,0.080995,0.082258,0.076300,0.082789,0.081603,0.083986,0.081606
4,0.310345,1.162056,-0.082308,0.379241,0.076522,0.319084,-0.154515,0.170689,-0.642111,0.879986,...,0.090808,0.076047,0.080115,0.098428,0.082258,0.094124,0.082789,0.081603,0.074188,0.081606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74995,0.140315,-0.861206,-0.139752,-2.430128,-1.098677,-2.689323,-0.133502,0.320088,1.145253,0.325140,...,0.090808,0.058040,0.079341,0.080995,0.082258,0.056691,0.082789,0.081603,0.074188,0.081606
74996,-0.477828,1.162056,-0.082308,-0.139392,-0.019322,-0.252438,-0.133502,0.577830,-0.626822,0.735867,...,0.053380,0.076047,0.079341,0.109910,0.078841,0.102371,0.082789,0.081603,0.083986,0.081606
74997,0.365460,-0.861206,0.023536,-1.517358,-1.332997,-1.874100,-0.008157,0.320088,1.145253,0.393876,...,0.090808,0.076047,0.079341,0.080995,0.079792,0.056691,0.082789,0.081603,0.083986,0.081606
74998,0.157917,-0.861206,0.023536,1.214463,0.732615,1.166093,-0.226502,0.320088,-0.651450,0.675903,...,0.053380,0.076047,0.079341,0.065979,0.078841,0.102371,0.082789,0.081603,0.083986,0.081606


In [None]:
opt.fit(X_train_transformed, y_train)

Fitting 9 folds for each of 1 candidates, totalling 9 fits


In [None]:
pd.DataFrame(opt.cv_results_)

In [None]:
opt.best_params_

In [None]:
opt.best_estimator_

In [None]:
roc_auc_score(y_test, opt.predict(X_test_transformed))

In [None]:
# scoring on test dataset
test_df=pd.read_csv('data/application_test.csv')
test_df_=test_df.loc[:, ~test_df.columns.isin(['SK_ID_CURR'])]
test_df_transformed=pipe.transform(test_df_)

submission_df=test_df[['SK_ID_CURR']]
submission_df['TARGET']=opt.predict(test_df_transformed)
submission_df.to_csv('submission.csv', index=False)