In [1]:
#!pip install bayesian-optimization
from bayes_opt import BayesianOptimization

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
import gc
import time
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)

from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold

from lightgbm import LGBMClassifier

from scipy.stats import ranksums

In [3]:
def reduce_mem_usage(data, verbose = True):
    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))
    
    for col in data.columns:
        col_type = data[col].dtype
        
        if col_type != object:
            c_min = data[col].min()
            c_max = data[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    data[col] = data[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    data[col] = data[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    data[col] = data[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    data[col] = data[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return data

In [4]:
def corr_feature_with_target(feature, target):
    c0 = feature[target == 0].dropna()
    c1 = feature[target == 1].dropna()
        
    if set(feature.unique()) == set([0, 1]):
        diff = abs(c0.mean(axis = 0) - c1.mean(axis = 0))
    else:
        diff = abs(c0.median(axis = 0) - c1.median(axis = 0))
        
    p = ranksums(c0, c1)[1] if ((len(c0) >= 20) & (len(c1) >= 20)) else 2
        
    return [diff, p]

def clean_data(data):
    warnings.simplefilter(action = 'ignore')
    
    # Removing empty features
    nun = data.nunique()
    empty = list(nun[nun <= 1].index)
    
    data.drop(empty, axis = 1, inplace = True)
    print('After removing empty features there are {0:d} features'.format(data.shape[1]))
    # Removing features with the same distribution on 0 and 1 classes
    corr = pd.DataFrame(index = ['diff', 'p'])
    ind = data[data['TARGET'].notnull()].index
    
    for c in data.columns.drop('TARGET'):
        corr[c] = corr_feature_with_target(data.loc[ind, c], data.loc[ind, 'TARGET'])

    corr = corr.T
    corr['diff_norm'] = abs(corr['diff'] / data.mean(axis = 0))
    
    to_del_1 = corr[((corr['diff'] == 0) & (corr['p'] > .05))].index
    to_del_2 = corr[((corr['diff_norm'] < .5) & (corr['p'] > .05))].drop(to_del_1).index
    to_del = list(to_del_1) + list(to_del_2)
    if 'SK_ID_CURR' in to_del:
        to_del.remove('SK_ID_CURR')
        
    data.drop(to_del, axis = 1, inplace = True)
    print('After removing features with the same distribution on 0 and 1 classes there are {0:d} features'.format(data.shape[1]))
    
    # Removing features with not the same distribution on train and test datasets
    corr_test = pd.DataFrame(index = ['diff', 'p'])
    target = data['TARGET'].notnull().astype(int)
    
    for c in data.columns.drop('TARGET'):
        corr_test[c] = corr_feature_with_target(data[c], target)

    corr_test = corr_test.T
    corr_test['diff_norm'] = abs(corr_test['diff'] / data.mean(axis = 0))
    
    bad_features = corr_test[((corr_test['p'] < .05) & (corr_test['diff_norm'] > 1))].index
    bad_features = corr.loc[bad_features][corr['diff_norm'] == 0].index
    
    data.drop(bad_features, axis = 1, inplace = True)
    print('After removing features with not the same distribution on train and test datasets there are {0:d} features'.format(data.shape[1]))
    
    del corr, corr_test
    gc.collect()
    
    # Removing features not interesting for classifier
    clf = LGBMClassifier(random_state = 0)
    train_index = data[data['TARGET'].notnull()].index
    train_columns = data.drop('TARGET', axis = 1).columns

    score = 1
    new_columns = []
    while score > .7:
        train_columns = train_columns.drop(new_columns)
        clf.fit(data.loc[train_index, train_columns], data.loc[train_index, 'TARGET'])
        f_imp = pd.Series(clf.feature_importances_, index = train_columns)
        score = roc_auc_score(data.loc[train_index, 'TARGET'], 
                              clf.predict_proba(data.loc[train_index, train_columns])[:, 1])
        new_columns = f_imp[f_imp > 0].index

    data.drop(train_columns, axis = 1, inplace = True)
    print('After removing features not interesting for classifier there are {0:d} features'.format(data.shape[1]))

    return data

In [5]:
def cv_scores(df, num_folds, params, stratified = False, verbose = -1, 
              save_train_prediction = False, train_prediction_file_name = 'train_prediction.csv',
              save_test_prediction = True, test_prediction_file_name = 'test_prediction.csv'):
    warnings.simplefilter('ignore')
    
    clf = LGBMClassifier(**params)

    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 1001)
    else:
        folds = KFold(n_splits = num_folds, shuffle = True, random_state = 1001)
        
    # Create arrays and dataframes to store results
    train_pred = np.zeros(train_df.shape[0])
    train_pred_proba = np.zeros(train_df.shape[0])

    test_pred = np.zeros(train_df.shape[0])
    test_pred_proba = np.zeros(train_df.shape[0])
    
    prediction = np.zeros(test_df.shape[0])
    
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    df_feature_importance = pd.DataFrame(index = feats)
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        print('Fold', n_fold, 'started at', time.ctime())
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)], eval_metric = 'auc', 
                verbose = verbose, early_stopping_rounds = 200)

        train_pred[train_idx] = clf.predict(train_x, num_iteration = clf.best_iteration_)
        train_pred_proba[train_idx] = clf.predict_proba(train_x, num_iteration = clf.best_iteration_)[:, 1]
        test_pred[valid_idx] = clf.predict(valid_x, num_iteration = clf.best_iteration_)
        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]
        
        prediction += \
                clf.predict_proba(test_df[feats], num_iteration = clf.best_iteration_)[:, 1] / folds.n_splits

        df_feature_importance[n_fold] = pd.Series(clf.feature_importances_, index = feats)
        
        print('Fold %2d AUC : %.6f' % (n_fold, roc_auc_score(valid_y, test_pred_proba[valid_idx])))
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    roc_auc_train = roc_auc_score(train_df['TARGET'], train_pred_proba)
    precision_train = precision_score(train_df['TARGET'], train_pred, average = None)
    recall_train = recall_score(train_df['TARGET'], train_pred, average = None)
    
    roc_auc_test = roc_auc_score(train_df['TARGET'], test_pred_proba)
    precision_test = precision_score(train_df['TARGET'], test_pred, average = None)
    recall_test = recall_score(train_df['TARGET'], test_pred, average = None)

    print('Full AUC score %.6f' % roc_auc_test)
    
    df_feature_importance.fillna(0, inplace = True)
    df_feature_importance['mean'] = df_feature_importance.mean(axis = 1)
    
    # Write prediction files
    if save_train_prediction:
        df_prediction = train_df[['SK_ID_CURR', 'TARGET']]
        df_prediction['Prediction'] = test_pred_proba
        df_prediction.to_csv(train_prediction_file_name, index = False)
        del df_prediction
        gc.collect()

    if save_test_prediction:
        df_prediction = test_df[['SK_ID_CURR']]
        df_prediction['TARGET'] = prediction
        df_prediction.to_csv(test_prediction_file_name, index = False)
        del df_prediction
        gc.collect()
    
    return df_feature_importance, \
           [roc_auc_train, roc_auc_test,
            precision_train[0], precision_test[0], precision_train[1], precision_test[1],
            recall_train[0], recall_test[0], recall_train[1], recall_test[1], 0]

def display_folds_importances(feature_importance_df_, n_folds = 5):
    n_columns = 3
    n_rows = (n_folds + 1) // n_columns
    _, axes = plt.subplots(n_rows, n_columns, figsize=(8 * n_columns, 8 * n_rows))
    for i in range(n_folds):
        sns.barplot(x = i, y = 'index', data = feature_importance_df_.reset_index().sort_values(i, ascending = False).head(20), 
                    ax = axes[i // n_columns, i % n_columns])
    sns.barplot(x = 'mean', y = 'index', data = feature_importance_df_.reset_index().sort_values('mean', ascending = False).head(20), 
                    ax = axes[n_rows - 1, n_columns - 1])
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.show()

In [6]:
def lgbm_evaluate(**params):
    warnings.simplefilter('ignore')
    
    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])
        
    clf = LGBMClassifier(**params, n_estimators = 10000, nthread = 4)

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]

    folds = KFold(n_splits = 2, shuffle = True, random_state = 1001)
        
    test_pred_proba = np.zeros(train_df.shape[0])
    
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)], eval_metric = 'auc', 
                verbose = False, early_stopping_rounds = 100)

        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]
        
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return roc_auc_score(train_df['TARGET'], test_pred_proba)

In [7]:
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
lgbm_params = {
            'nthread': -1,
            'n_estimators': 10000,
            'learning_rate': .02,
            'num_leaves': 34,
            'colsample_bytree': .9497036,
            'subsample': .8715623,
            'max_depth': 8,
            'reg_alpha': .041545473,
            'reg_lambda': .0735294,
            'min_split_gain': .0222415,
            'min_child_weight': 39.3259775,
            'silent': -1,
            'verbose': -1
}

In [8]:
def lgbm_evaluate(**params):
    warnings.simplefilter('ignore')
    
    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])
        
    clf = LGBMClassifier(**params, n_estimators = 10000, nthread = 4)

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]

    folds = KFold(n_splits = 2, shuffle = True, random_state = 1001)
        
    test_pred_proba = np.zeros(train_df.shape[0])
    
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)], eval_metric = 'auc', 
                verbose = False, early_stopping_rounds = 100)

        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]
        
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return roc_auc_score(train_df['TARGET'], test_pred_proba)

In [None]:
input_files = ['../Input/app_only.csv',
              '../Input/app_burea_balance.csv',
              '../Input/app_previous.csv',
              '../Input/app_cash.csv',
              '../Input/app_installments.csv',
              '../Input/app_credit_card_balance.csv']

output_files = ['../Submissions/app_only_cleaned_optimized.csv',
               '../Submissions/app_bureau_balanced_cleaned_optimized.csv',
               '../Submissions/app_previous_cleaned_optimized.csv',
               '../Submissions/app_cash_cleaned_optimized.csv',
               '../Submissions/app_installments_cleaned_optimized',
               '../Submissions/app_credit_card_balance_cleaned_optimized.csv']


for i in range(5):
    df = pd.read_csv(input_files[i])
    df = clean_data(df)
    
    params = {'colsample_bytree': (0.6, 1),
          'learning_rate': (.001, .03), 
          'num_leaves': (25, 40), 
          'subsample': (0.6, 1), 
          'max_depth': (5, 20), 
          'reg_alpha': (.02, .08), 
          'reg_lambda': (.04, .1), 
          'min_split_gain': (.01, .1),
          'min_child_weight': (20, 45)}
    
    
    bo = BayesianOptimization(lgbm_evaluate, params)
    bo.maximize(init_points = 5, n_iter = 25)
    best_params = bo.res['max']['max_params']
    best_params['num_leaves'] = int(best_params['num_leaves'])
    best_params['max_depth'] = int(best_params['max_depth'])
    bo.res['max']['max_val']
    print('best params for ', input_files[i])
    print(best_params)
    
    feature_importance, scor = cv_scores(df, 5, lgbm_params, test_prediction_file_name = output_files[i])

After removing empty features there are 267 features
After removing features with the same distribution on 0 and 1 classes there are 160 features
After removing features with not the same distribution on train and test datasets there are 160 features
After removing features not interesting for classifier there are 136 features
[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   min_child_weight |   min_split_gain |   num_leaves |   reg_alpha |   reg_lambda |   subsample | 
    1 | 03m13s | [35m   0.76516[0m | [32m            0.9367[0m | [32m         0.0066[0m | [32m     8.2649[0m | [32m           37.6679[0m | [32m          0.0861[0m | [32m     38.1333[0m | [32m     0.0655[0m | [32m      0.0581[0m | [32m     0.6034[0m | 
   

Early stopping, best iteration is:
[1138]	training's auc: 0.835702	valid_1's auc: 0.772617
Fold  3 AUC : 0.772617
Fold 4 started at Sat Aug 11 04:45:23 2018
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1071]	training's auc: 0.833603	valid_1's auc: 0.767933
Fold  4 AUC : 0.767933
Full AUC score 0.769153
After removing empty features there are 2082 features
After removing features with the same distribution on 0 and 1 classes there are 1345 features
After removing features with not the same distribution on train and test datasets there are 1289 features
After removing features not interesting for classifier there are 712 features
[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   min_child_w

Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1242]	training's auc: 0.862614	valid_1's auc: 0.772647
Fold  2 AUC : 0.772647
Fold 3 started at Sat Aug 11 11:59:14 2018
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1214]	training's auc: 0.858846	valid_1's auc: 0.782601
Fold  3 AUC : 0.782601
Fold 4 started at Sat Aug 11 12:06:55 2018
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1764]	training's auc: 0.883562	valid_1's auc: 0.778305
Fold  4 AUC : 0.778305
Full AUC score 0.778281
After removing empty features there are 1126 features
After removing features with the same distribution on 0 and 1 classes there are 710 features
After removing features with not the same distribution on train and test datasets there are 710 features
After removing features not interesting for classifier there are 614 features
[31mInitialization[0m
[94m

Starting LightGBM. Train shape: (307500, 614), test shape: (48744, 614)
Fold 0 started at Sat Aug 11 20:00:29 2018
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1202]	training's auc: 0.863745	valid_1's auc: 0.77883
Fold  0 AUC : 0.778830
Fold 1 started at Sat Aug 11 20:07:04 2018
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1096]	training's auc: 0.859728	valid_1's auc: 0.781387
Fold  1 AUC : 0.781387
Fold 2 started at Sat Aug 11 20:13:10 2018
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1672]	training's auc: 0.885699	valid_1's auc: 0.775671
Fold  2 AUC : 0.775671
Fold 3 started at Sat Aug 11 20:21:25 2018
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1545]	training's auc: 0.880287	valid_1's auc: 0.781152
Fold  3 AUC : 0.781152
Fold 4 started at Sat Aug 11 20:29:20 2018
Traini

Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[997]	training's auc: 0.84082	valid_1's auc: 0.774642
Fold  0 AUC : 0.774642
Fold 1 started at Sat Aug 11 23:10:12 2018
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1287]	training's auc: 0.8548	valid_1's auc: 0.779336
Fold  1 AUC : 0.779336
Fold 2 started at Sat Aug 11 23:12:10 2018
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1180]	training's auc: 0.850061	valid_1's auc: 0.77245
Fold  2 AUC : 0.772450
Fold 3 started at Sat Aug 11 23:14:03 2018
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1400]	training's auc: 0.859278	valid_1's auc: 0.778225
Fold  3 AUC : 0.778225
Fold 4 started at Sat Aug 11 23:16:11 2018
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1187]	training's auc: 0.84