In [1]:
import os
import pickle
import warnings
import pandas as pd
import numpy as np
import lightgbm as lgb

from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from sklearn.metrics import roc_auc_score
import pickle
from sklearn.model_selection import StratifiedKFold

In [2]:
warnings.filterwarnings('ignore')
random_state = 42
folds = 5

In [3]:
# path = "/content/drive/MyDrive/Pred Project/"
path = ""

In [4]:
df_train = pd.read_csv(path + "train.csv")
df_test = pd.read_csv(path + "test.csv")

In [5]:
# Real/Synthetic data
test = df_test.drop(["ID_code"], axis = 1)
unique_count = np.zeros_like(test)
for feature in range(test.shape[1]):
    _, index, count = np.unique(test.iloc[:, feature], return_counts=True, return_index=True)
    unique_count[index[count == 1], feature] += 1
real_samples = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
synth_samples = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]
print('Number of real samples in test set is {}'.format(len(real_samples)))
print('Number of synthetic samples in test set is {}'.format(len(synth_samples)))
features = [col for col in df_train.columns if col.startswith('var')]
df_all = pd.concat([df_train, df_test.iloc[real_samples]])
for feature in features:
    temp = df_all[feature].value_counts(dropna=True)
    df_all[feature + 'vc'] = df_all[feature].map(temp).map(lambda x: min(10, x)).astype(np.uint8)
    df_all[feature + 'sum'] = ((df_all[feature]) * df_all[feature + 'vc'] \
                                  .map(lambda x: int(x > 1))).astype(np.float32)
    df_all[feature + 'sum2'] = ((df_all[feature]) * df_all[feature + 'vc'] \
                                 .map(lambda x: int(x > 2))).astype(np.float32)
    df_all[feature + 'sum3'] = ((df_all[feature]) * df_all[feature + 'vc'] \
                                 .map(lambda x: int(x > 4))).astype(np.float32)
    df_all[feature + "_var"] = df_all.groupby([feature])[feature].transform("var")
    df_all[feature + "plus_"] = df_all[feature] + df_all[feature + "_var"]
    # df_all[feature + "minus_"] = df_all[feature] - df_all[feature + "_var"]
    df_all.drop([feature + "_var"], axis = 1, inplace = True)
df_train = df_all.iloc[:df_train.shape[0]]
df_test = df_all.iloc[df_train.shape[0]:]


Number of real samples in test set is 100000
Number of synthetic samples in test set is 100000


In [6]:
df_train.drop(["ID_code"], axis = 1 , inplace=True)

In [7]:
y = df_train["target"]
train = df_train.drop(["target"], axis = 1)

space  = [
          Real(0.001, 0.2, name ='learning_rate'),
          Integer(100, 1000, name='max_bin'),
          Integer(1, 50, name='max_depth'),
          Integer(2, 500, name='num_leaves'),
          Integer(3, 200, name='min_data_in_leaf'),
          Real(0.1, 0.90, name='subsample'),
          Real(0.0001, 100, name = 'reg_alpha'),
          Real(0.0001, 100, name = 'reg_lambda'),
          Real(0.001, 1, name='min_split_gain'),
          Real(0.0001, 1, name='min_sum_hessian_in_leaf'),
          Categorical([True, False], name='boost_from_average')
         ]

def objective(values):
    params = {
            'learning_rate':values[0],
            'max_bin':values[1],
            'max_depth': values[2], 
            'num_leaves': values[3], 
            'min_data_in_leaf': values[4],
            'subsample': values[5], 
            'reg_alpha': values[6], 
            'reg_lambda': values[7],
            'min_split_gain': values[8],
            'min_sum_hessian_in_leaf': values[9],
            'boost_from_average': values[10],
            'is_unbalance': 'true',
            'boosting_type': 'gbdt', 
            'n_estimators': 5000, 
            'num_class': 2, 
            'objective': 'multiclass', 
            'metric': 'multi_logloss', 
            'device': 'cpu', 
            'n_jobs': -1, 
            'verbose':-1
            }

    print('\nNext set of params.....')
    print(params)
    oof_preds = np.zeros((len(train), np.unique(y).shape[0]))
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

    oof_preds = np.zeros((len(train), np.unique(y).shape[0]))
    
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        for feature in features:

          X_tr = lgb.Dataset(train[[feature, feature + "plus_", feature + "vc", feature + "sum", feature + "sum2", feature + "sum3"]].iloc[trn_], y.iloc[trn_])
          X_va = lgb.Dataset(train[[feature, feature + "plus_", feature + "vc", feature + "sum", feature + "sum2", feature + "sum3"]].iloc[val_], y.iloc[val_])

          model = lgb.train(params, X_tr, valid_sets=X_va, num_boost_round=1000, verbose_eval=None, early_stopping_rounds=10)
          oof_preds[val_, :] += model.predict(train[[feature, feature + "plus_", feature + "vc", feature + "sum", feature + "sum2", feature + "sum3"]].iloc[val_]) / len(features)
          print('no {}-fold auc: {}'.format(fold_ + 1, 
                roc_auc_score(y.iloc[val_], oof_preds[val_,:][:, 1])))

    score =  roc_auc_score(y, oof_preds[:, 1])
    print('Score: {:.5f}'.format(score))
    return  -score

res_gp = gp_minimize(objective, space, n_calls = 60 , random_state = random_state, n_random_starts = 15)

print(res_gp)

with open('res_bayesian.pkl', 'wb') as f :
    pickle.dump(res_gp,f)


Next set of params.....
{'learning_rate': 0.1595120543851864, 'max_bin': 265, 'max_depth': 40, 'num_leaves': 299, 'min_data_in_leaf': 91, 'subsample': 0.17997993265440235, 'reg_alpha': 45.92494327169753, 'reg_lambda': 33.37092774304108, 'min_split_gain': 0.14372395110401887, 'min_sum_hessian_in_leaf': 0.6509233841015581, 'boost_from_average': False, 'is_unbalance': 'true', 'boosting_type': 'gbdt', 'n_estimators': 5000, 'num_class': 2, 'objective': 'multiclass', 'metric': 'multi_logloss', 'device': 'cpu', 'n_jobs': -1, 'verbose': -1}
no 1-fold auc: 0.5454511600594109
no 1-fold auc: 0.5729687805930194
no 1-fold auc: 0.5946754413221012
no 1-fold auc: 0.5946754413221012
no 1-fold auc: 0.5946754413221012
no 1-fold auc: 0.6025625046499398
no 1-fold auc: 0.6233504911132979
no 1-fold auc: 0.6233504911132979
no 1-fold auc: 0.6233504911132979
no 1-fold auc: 0.6233504911132979
no 1-fold auc: 0.6233504911132979
no 1-fold auc: 0.6233504911132979
no 1-fold auc: 0.6447237713226126
no 1-fold auc: 0.6

In [None]:
res_gp.x

[26,
 2,
 63,
 0.8145256334205867,
 0.44705736309891225,
 0.2,
 0.0001,
 30.075849202382976,
 'gbdt']