In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Modeling
import lightgbm as lgb
import xgboost as xgb

# Evaluation of the model
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.size'] = 18
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Custom

from utils.preprocessing import one_hot_encoder
from utils.preprocessing import missing_values_table

### Set up training and test set

In [2]:
# Load the feature matrix. Train and test rows live in the same file; test rows
# are the ones whose TARGET equals the sentinel -999 (see the split below).
spec_feature = pd.read_csv('~/HomeCredit/feature_matrix_spec.csv')
missing_table = missing_values_table(spec_feature)

# Drop every column whose '% of Total Values' entry in the missing-values table
# exceeds 30 (presumably the percentage of missing rows -- confirm in utils).
dump_feats = missing_table[missing_table['% of Total Values'] > 30].index.tolist()

spec_feature = spec_feature.drop(dump_feats, axis = 1)

spec_feature, _ = one_hot_encoder(spec_feature)

from sklearn.preprocessing import MinMaxScaler
# `sklearn.preprocessing.Imputer` is deprecated since scikit-learn 0.20 and was
# removed in 0.22; `SimpleImputer` is its column-wise replacement.
from sklearn.impute import SimpleImputer

# Keep the label and the row identifier out of imputation / scaling.
Target = spec_feature['TARGET']
ID = spec_feature['SK_ID_CURR']

dataset_temp = spec_feature.drop(['TARGET','SK_ID_CURR'], axis = 1)

imputer = SimpleImputer(strategy = 'median')
scaler = MinMaxScaler(feature_range = (0,1))

# NOTE(review): both transformers are fitted on train and test rows together;
# fit them on the training rows only if test-set leakage matters here.
dataset_preprocessed = imputer.fit_transform(dataset_temp)
dataset_preprocessed = scaler.fit_transform(dataset_preprocessed)
# Reuse the original index so that re-attaching TARGET / SK_ID_CURR below aligns
# row-for-row even when `spec_feature` does not carry a default RangeIndex.
dataset_preprocessed = pd.DataFrame(
    dataset_preprocessed,
    columns = dataset_temp.columns,
    index = dataset_temp.index,
)

dataset_preprocessed['TARGET'] = Target
dataset_preprocessed['SK_ID_CURR'] = ID

# Split back into labelled (train) and unlabelled (test, TARGET == -999) rows.
train_df = dataset_preprocessed[dataset_preprocessed['TARGET'] != -999]
test_df = dataset_preprocessed[dataset_preprocessed['TARGET'] == -999]

# NOTE(review): SK_ID_CURR is still a column of `train_features` (only TARGET is
# dropped), so the raw identifier is fed to the model -- confirm this is intended.
train_features = train_df.drop(['TARGET'], axis = 1)
train_labels = train_df['TARGET']

There are total 885 columns.
823 of them have missing values.


## Baseline

In [3]:
# Test-side counterpart of train_features / train_labels: every column except
# TARGET as features, and the TARGET column on its own.
test_labels = test_df.loc[:, 'TARGET']
test_features = test_df.drop(columns=['TARGET'])

In [4]:
# Work on the leading rows only: the first N training rows and first T test rows.
N = 15000
T = 3000

X_train = train_features.iloc[:N]
Y_train = train_labels.iloc[:N]

X_test = test_features.iloc[:T]
Y_test = test_labels.iloc[:T]

print(X_train.shape)

(15000, 790)


In [5]:
# Baseline XGBoost classifier: 500 boosting rounds, every other
# hyperparameter left at the library default.
clr_xgb = xgb.XGBClassifier(
    n_jobs=-1,
    n_estimators=500,
)

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve, auc

# Baseline 5-fold cross-validation of the untuned XGBoost classifier on the
# training subsample. Spread is reported as two standard deviations.

# Accuracy (stated explicitly; it is also what the classifier's default scorer reports).
acc_scores = cross_val_score(clr_xgb, X_train, Y_train, cv = 5, scoring = 'accuracy')

# The model under test is XGBoost, so label it as such, and format the spread
# the same way as the ROC AUC line below.
print('5 fold CV accuracy for XGBoost is %0.2f (+/- %0.2f)' %(acc_scores.mean(), acc_scores.std() * 2))

# ROC AUC
roc_auc_scores = cross_val_score(clr_xgb, X_train, Y_train, cv=5, scoring='roc_auc')
print('5 fold CV ROC_AUC is %0.2f (+/- %0.2f)' %(roc_auc_scores.mean(), roc_auc_scores.std() * 2))

5 fold CV accuracy for random forest is 0.77 (+/- 0.599057)
5 fold CV ROC_AUC is 0.72 (+/- 0.10)


In [23]:
# Same baseline ROC AUC summary, printed with five decimals.
baseline_auc_mean = roc_auc_scores.mean()
baseline_auc_spread = roc_auc_scores.std() * 2
print('5 fold CV ROC_AUC is %0.5f (+/- %0.5f)' % (baseline_auc_mean, baseline_auc_spread))

5 fold CV ROC_AUC is 0.71661 (+/- 0.09833)


In [10]:
# Display the full hyperparameter set of the baseline classifier.
baseline_params = clr_xgb.get_params()
baseline_params

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 500,
 'n_jobs': -1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': True,
 'subsample': 1}

In [11]:
# Candidate values for each XGBoost hyperparameter explored by the search below.
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

In [15]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Randomized hyperparameter search over `params`, scored by ROC AUC.
folds = 5        # number of stratified CV folds
param_comb = 5   # number of parameter combinations sampled from `params`

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# Pass the splitter itself rather than `skf.split(...)`: the latter is a
# one-shot generator, so fitting this search object a second time would find it
# exhausted. With a fixed random_state the splitter yields the same folds.
random_search = RandomizedSearchCV(
    clr_xgb,
    param_distributions=params,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=-1,
    cv=skf,
    verbose=3,
    random_state=1001,
)

random_search.fit(X_train, Y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  25 | elapsed:  3.3min remaining: 23.9min
[Parallel(n_jobs=-1)]: Done  12 out of  25 | elapsed:  4.9min remaining:  5.3min
[Parallel(n_jobs=-1)]: Done  21 out of  25 | elapsed:  7.3min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  7.9min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7fd98c9b2258>,
          error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
          fit_params=None, iid='warn', n_iter=5, n_jobs=-1,
          param_distributions={'min_child_weight': [1, 5, 10], 'gamma': [0.5, 1, 1.5, 2, 5], 'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0], 'max_depth': [3, 4, 5]},
          pre_dispatch='2*n_jobs', random_state=1001, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=3)

In [16]:
# Print each section of the search report as a heading followed by its value.
search_report = [
    ('\n All results:', random_search.cv_results_),
    ('\n Best estimator:', random_search.best_estimator_),
    # Normalized gini is derived from the best mean ROC AUC as 2 * AUC - 1.
    ('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb),
     random_search.best_score_ * 2 - 1),
    ('\n Best hyperparameters:', random_search.best_params_),
]
for heading, value in search_report:
    print(heading)
    print(value)

# Persist the per-candidate CV results next to the notebook.
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)


 All results:
{'mean_fit_time': array([194.46544447, 294.02675261, 283.21006293, 169.69401412,
       206.34997072]), 'std_fit_time': array([ 4.28267192,  1.64765154,  2.06131266, 13.86781528, 25.30580188]), 'mean_score_time': array([0.16550169, 0.2563447 , 0.27811236, 0.16641884, 0.13767467]), 'std_score_time': array([0.04360121, 0.10175452, 0.05104975, 0.06606588, 0.00173767]), 'param_subsample': masked_array(data=[1.0, 0.6, 0.8, 1.0, 0.8],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_min_child_weight': masked_array(data=[5, 1, 5, 5, 1],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[3, 5, 5, 5, 4],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_gamma': masked_array(data=[5, 1.5, 1, 5, 1],
             mask=[False, False, False, False, False],
       fill_val

In [17]:
# Class probabilities for the test rows from the refitted best estimator
# (the search above was run with refit=True).
best_model = random_search.best_estimator_
test_preds = best_model.predict_proba(test_features)

In [21]:
# Hyperparameters of the tuned model (presumably the winning combination of the
# randomized search above -- confirm against random_search.best_params_).
best_paras = {'subsample': 1.0, 'min_child_weight': 5, 'max_depth': 3, 'gamma': 5, 'colsample_bytree': 1.0}

# Build the classifier from `best_paras` instead of repeating the values by
# hand, so the dict and the model cannot drift apart.
clr_xgb_best = xgb.XGBClassifier(n_estimators=500, n_jobs = -1, **best_paras)

# ROC AUC under the same 5-fold protocol as the baseline, for a like-for-like comparison.
roc_auc_scores_best_para = cross_val_score(clr_xgb_best, X_train, Y_train, cv=5, scoring='roc_auc')

# Report the score; without this line the cell computes the CV but shows nothing.
print('5 fold CV ROC_AUC is %0.2f (+/- %0.2f)' %(roc_auc_scores_best_para.mean(), roc_auc_scores_best_para.std() * 2))


5 fold CV ROC_AUC is 0.74 (+/- 0.06)


In [22]:
# Tuned-model ROC AUC summary, printed with five decimals.
tuned_auc_mean = roc_auc_scores_best_para.mean()
tuned_auc_spread = roc_auc_scores_best_para.std() * 2
print('5 fold CV ROC_AUC is %0.5f (+/- %0.5f)' % (tuned_auc_mean, tuned_auc_spread))

5 fold CV ROC_AUC is 0.74125 (+/- 0.06496)


In [None]:
# Repeat the tuned-model cross-validation on every training row rather than
# the N-row subsample.
roc_auc_scores_best_para = cross_val_score(
    clr_xgb_best,
    train_features,
    train_labels,
    scoring='roc_auc',
    cv=5,
)
full_auc_summary = (roc_auc_scores_best_para.mean(), roc_auc_scores_best_para.std() * 2)
print('5 fold CV ROC_AUC is %0.5f (+/- %0.5f)' % full_auc_summary)