In [1]:
import pandas as pd
import numpy as np
import os

import seaborn as sns
import matplotlib.pyplot as plt
import random


import os
import time
import datetime
import calendar

from sklearn.metrics import mean_squared_error


import sklearn
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.model_selection import KFold, StratifiedKFold


from sklearn import clone
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# from utils_model import * # expand later

# Preprocess

In [2]:
data = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

data['gender'] = data['gender'].astype('str')
data_test['gender'] = data_test['gender'].astype('str')

data['Achievement_above_100%_during3quartal'] = data['Achievement_above_100%_during3quartal'].astype(str)
data_test['Achievement_above_100%_during3quartal'] = data_test['Achievement_above_100%_during3quartal'].astype(str)


data = data.rename(columns={'annual leave':'annual_leave'})
data_test = data_test.rename(columns={'annual leave':'annual_leave'})

data = data.rename(columns={'Last_achievement_%':'Last_achievement'})
data_test = data_test.rename(columns={'Last_achievement_%':'Last_achievement'})


In [3]:
# get test data (for final evaluation)
X = data.drop(columns=['Best Performance'])
y = data['Best Performance']
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)

In [4]:
# len(X_train),len(X_val)

In [6]:
num_cols = list(X.select_dtypes(exclude=['object']))
cat_cols = list(X.select_dtypes(include=['object']))
features = list(X.columns)
len(num_cols),len(cat_cols),len(features)

(14, 7, 21)

In [7]:

a = '''boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    subsample_for_bin=200000,
    objective=None,
    class_weight=None,
    min_split_gain=0.0,
    min_child_weight=0.001,
    min_child_samples=20,
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,
    reg_alpha=0.0,
    reg_lambda=0.0,
    random_state=None
    '''
# [x.strip() for x in a.split(',\n')]
[x.split('=')[0].strip() for x in a.split('\n')]

['boosting_type',
 'num_leaves',
 'max_depth',
 'learning_rate',
 'n_estimators',
 'subsample_for_bin',
 'objective',
 'class_weight',
 'min_split_gain',
 'min_child_weight',
 'min_child_samples',
 'subsample',
 'subsample_freq',
 'colsample_bytree',
 'reg_alpha',
 'reg_lambda',
 'random_state',
 '']

# Bayesian Optimization

## FE

In [8]:
class Feature_Engineering:
    def __init__(self,parameters):
        self.parameters = parameters
        self.target = parameters['target']
    
    @staticmethod  
    def check_col(col):
        if len(col.split(' '))>1:
            col2 = '_'.join(col.split(' '))
        else:
            col2 = col
        return col2
    
    @staticmethod
    def get_bin(data,col,n_bin,mode='cut'):
        while True:
            try:
                if mode=='cut':
                    _,bin_dummy = pd.cut(data[col],n_bin,retbins=True)
                else:
                    _,bin_dummy = pd.qcut(data[col],n_bin,retbins=True)
            except:
                n_bin -= 1
                continue
            break
        return bin_dummy
        
    def fit(self,data_ori):
        target = self.target
        data = data_ori.copy()
        for param in self.parameters['bin_numer_qcut']:
            col = param[0]
            n_bin = param[1]
            bin_dummy = self.get_bin(data,col,n_bin,mode='qcut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_numer_qcut',bin_dummy)
        for param in self.parameters['bin_numer_cut']:
            col = param[0]
            n_bin = param[1]
            bin_dummy = self.get_bin(data,col,n_bin,mode='cut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_numer_cut',bin_dummy)
            
            
        for param in self.parameters['bin_add_categ_numer_bin_qcut']:
            col = param[1]
            n_bin = param[2]
            bin_dummy = self.get_bin(data,col,n_bin,mode='qcut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_qcut_add_categ',bin_dummy)
        
        for param in self.parameters['bin_target_encoding_cut']:
            col = param[0]
            n_bin = param[1]
            bin_dummy = self.get_bin(data,col,n_bin,mode='cut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_cut',bin_dummy)
            
            data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy)
            data_dummy = data.groupby([f'{col}_bin_target_encoding_cut'])[target].mean().reset_index(drop=False)
            setattr(self,f'{col}_bin_target_encoding_cut',data_dummy)
            
        for param in self.parameters['bin_target_encoding_qcut']:
            col = param[0]
            n_bin = param[1]
            bin_dummy = self.get_bin(data,col,n_bin,mode='qcut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_qcut',bin_dummy)
            
            data[f'{col}_bin_target_encoding_qcut'] = pd.cut(data[col],bins=bin_dummy)
            data_dummy = data.groupby([f'{col}_bin_target_encoding_qcut'])[target].mean().reset_index(drop=False)
            setattr(self,f'{col}_bin_target_encoding_qcut',data_dummy)
           
        for param in self.parameters['bin_target_encoding_custom_bin']:
            col = param[0]
            bins = param[1]
            setattr(self,f'{col}_bin_custom_bin',bins)
            
            data[f'{col}_bin_target_encoding_custom_bin'] = pd.cut(data[col],bins=bins)
            data_dummy = data.groupby([f'{col}_bin_target_encoding_custom_bin'])[target].mean().reset_index(drop=False)
            setattr(self,f'{col}_bin_target_encoding_custom_bin',data_dummy)
        
        for param in self.parameters['categorical_mean_encoding']:
            col = param
            data[f'{col}_categorical_mean_encoding'] = data[col].copy().values
            data_dummy = data.groupby([f'{col}_categorical_mean_encoding'])[target].mean().reset_index(drop=False)
            setattr(self,f'{col}_categorical_mean_encoding',data_dummy)
        
        
        
        self.fit = True
        return data
        
    
    def transform(self,X,mode='train'):
        data = X.copy()
        target = self.target
        if mode!='train':
            target_encode = self.target
        else:
            target_encode = self.target +"_y"
            
        if self.fit==False:
            raise Exception("Fit to train data first")
        
        for param in self.parameters['bin_numer_qcut']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_numer_qcut')
            data[f'{col}_bin_numer_qcut'] = pd.cut(data[col],bins=bin_dummy).astype(str).values
        for param in self.parameters['bin_numer_cut']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_numer_cut')
            data[f'{col}_bin_numer_cut'] = pd.cut(data[col],bins=bin_dummy).astype(str).values
            
        for cols in self.parameters['bin_add_categ_numer_bin_qcut']:
            col_add = cols[0] + '_' + cols[1]
            bin_dummy = eval(f'self.{cols[1]}_bin_qcut_add_categ')
            data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
            data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = (data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)).values
        
        for param in self.parameters['bin_target_encoding_cut']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_cut')
            data_dummy = eval(f'self.{col}_bin_target_encoding_cut')
            data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy).values
            data[f'{col}_bin_target_encoding_cut'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_cut'])[f'{target_encode}'].values
        
        for param in self.parameters['bin_target_encoding_qcut']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_qcut')
            data_dummy = eval(f'self.{col}_bin_target_encoding_qcut')
            data[f'{col}_bin_target_encoding_qcut'] = pd.cut(data[col],bins=bin_dummy).values
            data[f'{col}_bin_target_encoding_qcut'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_qcut'])[f'{target_encode}'].values
        
        for param in self.parameters['bin_target_encoding_custom_bin']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_custom_bin')
            data_dummy = eval(f'self.{col}_bin_target_encoding_custom_bin')
            data[f'{col}_bin_target_encoding_custom_bin'] = pd.cut(data[col],bins=bin_dummy).values
            data[f'{col}_bin_target_encoding_custom_bin'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_custom_bin'])[f'{target_encode}'].values
        
        for param in self.parameters['categorical_mean_encoding']:
            col = param
            data_dummy = eval(f'self.{col}_categorical_mean_encoding')
            data[f'{col}_categorical_mean_encoding'] = data[col].copy().values
            data[f'{col}_categorical_mean_encoding'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_categorical_mean_encoding'])[f'{target_encode}'].values
        
        
        for cols in self.parameters['multiply']:
            data[cols[0] + 'x' +cols[1]] = (data[cols[0]] * data[cols[1]]).values
        for cols in self.parameters['add']:
            data[cols[0] + '+' +cols[1]] = (data[cols[0]] + data[cols[1]]).values
        for cols in self.parameters['add_str']:
            data[cols[0] + '+' +cols[1]] = (data[cols[0]].astype(str)+'_' + data[cols[1]].astype(str)).values
            
        for cols in self.parameters['substract']:
            data[cols[0] + '-' +cols[1]] = (data[cols[0]] - data[cols[1]]).values
        for cols in self.parameters['divide']:
            data[cols[0] + '/' +cols[1]] = (data[cols[0]] / np.where(data[cols[1]]==0,0.0001,data[cols[1]])).values
        
        
        
        
        return data
    
    

## Data Transform

In [10]:
data_skf = {'train':[],'val':[]}
skf = StratifiedKFold(n_splits=2,random_state = 3,shuffle = True)
parameters = {'multiply':[['GPA','number_of_dependences']],
              'add':[['annual_leave','sick_leaves'],['assign_of_otherposition','branch_rotation']],
              'add_str':[['Education_level','job_level']],
              'substract':[],'divide':[],
              'bin_numer_qcut':[],
              'bin_numer_cut':[['GPA',30]],
              'bin_add_categ_numer_bin_qcut':[['job_level','GPA',5],['Education_level','GPA',5]],
            'bin_target_encoding_cut':[],
             'bin_target_encoding_qcut':[['year_graduated',5],['GPA',5],['annual_leave',5]],
             'bin_target_encoding_custom_bin':[],
              'categorical_mean_encoding':['job_level','person_level','Employee_type','Education_level'],
             'target':'Best Performance'}


for train_index,val_index in skf.split(X,y):
    add_fe = Feature_Engineering(parameters)
    add_fe.fit(data.iloc[train_index,:])
    data_skf['train'].append([add_fe.transform(X.iloc[train_index,:],mode='val'),y.iloc[train_index]])
    data_skf['val'].append([add_fe.transform(X.iloc[val_index,:],mode='val'),y.iloc[val_index]])

num_cols_fe = list(data_skf['train'][0][0].select_dtypes(exclude='object').columns)
cat_cols_fe = list(data_skf['train'][0][0].select_dtypes(include='object').columns)
len(num_cols_fe),len(cat_cols_fe)

(24, 11)

## LGBM

In [12]:
from bayes_opt import BayesianOptimization

num_transformer = Pipeline(steps=[
                                ('imputer', SimpleImputer(strategy = 'median')),
                                ('scaler', RobustScaler())
                                ])

cat_transformer = Pipeline(steps=[
                                ('imputer', SimpleImputer(strategy='most_frequent')),
                                ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                ])
    
def search_model_lgbm(num_leaves,max_depth,n_estimators,reg_alpha,reg_lambda):
    params = {
        'boosting_type':'gbdt',
         'num_leaves':int(num_leaves),
         'max_depth':int(max_depth),
         'n_estimators':int(n_estimators),
         'objective':'binary',
#          'class_weight':'balanced',
         'reg_alpha':reg_alpha,
         'reg_lambda':reg_lambda,
         'random_state':0}
    lgbm = LGBMClassifier(**params)
    transformer = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num_cols_fe),
            ('cat', cat_transformer, cat_cols_fe)
        ])
    main_pipeline = Pipeline(steps=[('transformer', transformer),
                      ('classifier', lgbm)])
    
    aucs = []
    for i in range(len(data_skf['train'])):
        model = clone(main_pipeline)
        model.fit(data_skf['train'][i][0],data_skf['train'][i][1].values)
        pred_proba = model.predict_proba(data_skf['val'][i][0])[:,1]
        
        aucs.append(roc_auc_score(data_skf['val'][i][1].values, pred_proba,average='weighted'))
    return np.mean(aucs)

lgbBO = BayesianOptimization(search_model_lgbm, {'num_leaves': (25, 100),
                                        'max_depth': (5, 100),
                                        'n_estimators':(50,300),
                                        'reg_alpha': (0.0, 0.05),
                                        'reg_lambda': (0.0, 0.05),
                                        },random_state=0)

lgbBO.maximize(n_iter=100, init_points=2)
        
        

|   iter    |  target   | max_depth | n_esti... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.5466  [0m | [0m 57.14   [0m | [0m 228.8   [0m | [0m 70.21   [0m | [0m 0.02724 [0m | [0m 0.02118 [0m |
| [95m 2       [0m | [95m 0.5486  [0m | [95m 66.36   [0m | [95m 159.4   [0m | [95m 91.88   [0m | [95m 0.04818 [0m | [95m 0.01917 [0m |
| [0m 3       [0m | [0m 0.5475  [0m | [0m 69.46   [0m | [0m 158.9   [0m | [0m 91.43   [0m | [0m 0.01489 [0m | [0m 0.01442 [0m |
| [0m 4       [0m | [0m 0.5451  [0m | [0m 69.15   [0m | [0m 105.8   [0m | [0m 80.83   [0m | [0m 0.02445 [0m | [0m 0.0194  [0m |
| [0m 5       [0m | [0m 0.5478  [0m | [0m 64.45   [0m | [0m 164.3   [0m | [0m 93.63   [0m | [0m 0.03876 [0m | [0m 0.004643[0m |
| [95m 6       [0m | [95m 0.5497  [0m | [95m 61.39   [0m | [95m 153.5   [0m | [95m 91.73   [0m | [95

| [0m 58      [0m | [0m 0.5482  [0m | [0m 78.67   [0m | [0m 216.3   [0m | [0m 96.3    [0m | [0m 0.01304 [0m | [0m 0.03063 [0m |
| [0m 59      [0m | [0m 0.5511  [0m | [0m 74.62   [0m | [0m 138.6   [0m | [0m 42.67   [0m | [0m 0.0249  [0m | [0m 0.02317 [0m |
| [0m 60      [0m | [0m 0.5455  [0m | [0m 27.67   [0m | [0m 202.6   [0m | [0m 43.5    [0m | [0m 0.009518[0m | [0m 0.03645 [0m |
| [0m 61      [0m | [0m 0.5476  [0m | [0m 25.59   [0m | [0m 244.0   [0m | [0m 99.6    [0m | [0m 0.01085 [0m | [0m 0.02173 [0m |
| [0m 62      [0m | [0m 0.5488  [0m | [0m 22.98   [0m | [0m 81.21   [0m | [0m 47.78   [0m | [0m 0.01837 [0m | [0m 0.0465  [0m |
| [0m 63      [0m | [0m 0.5526  [0m | [0m 37.51   [0m | [0m 71.04   [0m | [0m 82.11   [0m | [0m 0.009463[0m | [0m 0.02728 [0m |
| [0m 64      [0m | [0m 0.5521  [0m | [0m 65.44   [0m | [0m 62.93   [0m | [0m 94.63   [0m | [0m 0.01357 [0m | [0m 0.04303 [0m |
| [0m

In [13]:
lgbBO.max


{'target': 0.5573626004632181,
 'params': {'max_depth': 71.14463645613029,
  'n_estimators': 64.46799939822702,
  'num_leaves': 32.02057967524939,
  'reg_alpha': 0.00910889806421355,
  'reg_lambda': 0.008254388572148113}}

In [20]:
# params = {
#     'boosting_type':'gbdt',
#      'num_leaves':int(60.80441469032743),
#      'max_depth':int(5.663953027572844),
#      'n_estimators':int(52.29955231632742),
#      'objective':'binary',
#      'class_weight':'balanced',
#      'reg_alpha':0.0070860708219850025,
#      'reg_lambda':0.04598746020415847,
#      'random_state':0}

params = {
    'boosting_type':'gbdt',
     'num_leaves':int(32.02057967524939),
     'max_depth':int(71.14463645613029),
     'n_estimators':int(64.46799939822702),
     'objective':'binary',
#      'class_weight':'balanced',
     'reg_alpha':0.00910889806421355,
     'reg_lambda':0.008254388572148113,
     'random_state':0}

In [21]:

lgbm = LGBMClassifier(**params)
transformer = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols_fe),
        ('cat', cat_transformer, cat_cols_fe)
    ])
main_pipeline = Pipeline(steps=[('transformer', transformer),
                  ('classifier', lgbm)])

aucs = []
for i in range(len(data_skf['train'])):
    model = clone(main_pipeline)
    model.fit(data_skf['train'][i][0],data_skf['train'][i][1].values)
    pred_proba = model.predict_proba(data_skf['val'][i][0])[:,1]

    aucs.append(roc_auc_score(data_skf['val'][i][1].values, pred_proba,average='samples'))

np.mean(aucs)

0.5573626004632181

In [95]:
add_fe = Feature_Engineering(parameters)
add_fe.fit(data.iloc[train_index,:])

X_train, y_train = add_fe.transform(X,mode='val'),y.copy()
X_test = add_fe.transform(data_test,mode='val')


model = clone(main_pipeline)
model.fit(X_train,y_train.values)

pred_proba = model.predict_proba(X_test)[:,1]
df_submission = pd.DataFrame({'index':data_test.index,'Best Performance':pred_proba})
df_submission

df_submission.to_csv('df_submission_15feb_LGBM1CVTUNE_FE.csv',index=False)

In [100]:
len(pred_proba[pred_proba>0.5])/len(pred_proba)

0.36966666666666664

## XGB

In [16]:
from bayes_opt import BayesianOptimization

num_transformer = Pipeline(steps=[
                                ('imputer', SimpleImputer(strategy = 'median')),
                                ('scaler', RobustScaler())
                                ])

cat_transformer = Pipeline(steps=[
                                ('imputer', SimpleImputer(strategy='most_frequent')),
                                ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                ])
    
def search_model_xgb(num_leaves,max_depth,n_estimators,reg_alpha,reg_lambda):
    params = {
         'max_depth':int(max_depth),
         'n_estimators':int(n_estimators),
         'reg_alpha':reg_alpha,
         'reg_lambda':reg_lambda,
#         'scale_pos_weight':scale_pos_weight,
         'random_state':0,
        'use_label_encoder':False,
        'verbosity':0
    }
    xgb = XGBClassifier(**params)
    transformer = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num_cols_fe),
            ('cat', cat_transformer, cat_cols_fe)
        ])
    main_pipeline = Pipeline(steps=[('transformer', transformer),
                      ('classifier', xgb)])
    
    aucs = []
    for i in range(len(data_skf['train'])):
        model = clone(main_pipeline)
        model.fit(data_skf['train'][i][0],data_skf['train'][i][1].values)
        pred_proba = model.predict_proba(data_skf['val'][i][0])[:,1]
        
        aucs.append(roc_auc_score(data_skf['val'][i][1].values, pred_proba,average='weighted'))
    return np.mean(aucs)

xgbBO = BayesianOptimization(search_model_xgb, {'num_leaves': (25, 100),
                                        'max_depth': (5, 100),
                                        'n_estimators':(50,300),
                                        'reg_alpha': (0.0, 0.05),
                                        'reg_lambda': (0.0, 0.05),
#                                          'scale_pos_weight':(1,9)
                                        },random_state=0)
xgbBO.maximize(n_iter=100, init_points=2)
        
        

|   iter    |  target   | max_depth | n_esti... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.5438  [0m | [0m 57.14   [0m | [0m 228.8   [0m | [0m 70.21   [0m | [0m 0.02724 [0m | [0m 0.02118 [0m |
| [95m 2       [0m | [95m 0.5543  [0m | [95m 66.36   [0m | [95m 159.4   [0m | [95m 91.88   [0m | [95m 0.04818 [0m | [95m 0.01917 [0m |
| [0m 3       [0m | [0m 0.5467  [0m | [0m 69.46   [0m | [0m 158.9   [0m | [0m 91.43   [0m | [0m 0.01489 [0m | [0m 0.01442 [0m |
| [0m 4       [0m | [0m 0.5482  [0m | [0m 69.15   [0m | [0m 105.8   [0m | [0m 80.83   [0m | [0m 0.02445 [0m | [0m 0.0194  [0m |
| [0m 5       [0m | [0m 0.5421  [0m | [0m 8.968   [0m | [0m 54.52   [0m | [0m 32.94   [0m | [0m 0.00111 [0m | [0m 0.003873[0m |
| [0m 6       [0m | [0m 0.5453  [0m | [0m 75.28   [0m | [0m 89.26   [0m | [0m 93.39   [0m | [0m 0.03

| [0m 58      [0m | [0m 0.5487  [0m | [0m 25.28   [0m | [0m 240.6   [0m | [0m 41.54   [0m | [0m 0.004665[0m | [0m 0.02199 [0m |
| [95m 59      [0m | [95m 0.5553  [0m | [95m 74.62   [0m | [95m 138.6   [0m | [95m 42.67   [0m | [95m 0.0249  [0m | [95m 0.02317 [0m |
| [0m 60      [0m | [0m 0.5459  [0m | [0m 27.67   [0m | [0m 202.6   [0m | [0m 43.5    [0m | [0m 0.009518[0m | [0m 0.03645 [0m |
| [0m 61      [0m | [0m 0.5495  [0m | [0m 25.59   [0m | [0m 244.0   [0m | [0m 99.6    [0m | [0m 0.01085 [0m | [0m 0.02173 [0m |
| [0m 62      [0m | [0m 0.5544  [0m | [0m 22.98   [0m | [0m 81.21   [0m | [0m 47.78   [0m | [0m 0.01837 [0m | [0m 0.0465  [0m |
| [0m 63      [0m | [0m 0.5448  [0m | [0m 66.12   [0m | [0m 163.5   [0m | [0m 93.86   [0m | [0m 0.045   [0m | [0m 0.004859[0m |
| [0m 64      [0m | [0m 0.5496  [0m | [0m 65.44   [0m | [0m 62.93   [0m | [0m 94.63   [0m | [0m 0.01357 [0m | [0m 0.04303 [0m |

In [83]:
xgbBO.max


{'target': 0.5559541550075702,
 'params': {'max_depth': 52.3160713621035,
  'n_estimators': 278.30132378597926,
  'num_leaves': 96.9938470728089,
  'reg_alpha': 0.04800396971863674,
  'reg_lambda': 0.047213768862403666,
  'scale_pos_weight': 3.8022455273872264}}

In [87]:
params = {
    'boosting_type':'gbdt',
     'max_depth': int(52.3160713621035),
      'n_estimators': int(278.30132378597926),
      'num_leaves': int(96.9938470728089),
      'reg_alpha': 0.04800396971863674,
      'reg_lambda': 0.047213768862403666,
      'scale_pos_weight': 3.8022455273872264,
     'random_state':0,
        'use_label_encoder':False,
}

In [88]:

xgb = XGBClassifier(**params)
transformer = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols_fe),
        ('cat', cat_transformer, cat_cols_fe)
    ])
main_pipeline = Pipeline(steps=[('transformer', transformer),
                  ('classifier', xgb)])

aucs = []
for i in range(len(data_skf['train'])):
    model = clone(main_pipeline)
    model.fit(data_skf['train'][i][0],data_skf['train'][i][1].values)
    pred_proba = model.predict_proba(data_skf['val'][i][0])[:,1]

    aucs.append(roc_auc_score(data_skf['val'][i][1].values, pred_proba,average='weighted'))

np.mean(aucs)

0.5559541550075702