# Library

In [1]:
import pandas as pd
import numpy as np
import os

import seaborn as sns
import matplotlib.pyplot as plt
import random


import os
import time
import datetime
import calendar

from sklearn.metrics import mean_squared_error


import sklearn
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.model_selection import KFold, StratifiedKFold


from sklearn import clone
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# from utils_model import * # expand later

In [2]:
# np.set_seed(0)

# Preprocess

In [27]:
data = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

data['gender'] = data['gender'].astype('str')
data_test['gender'] = data_test['gender'].astype('str')

data['Achievement_above_100%_during3quartal'] = data['Achievement_above_100%_during3quartal'].astype(str)
data_test['Achievement_above_100%_during3quartal'] = data_test['Achievement_above_100%_during3quartal'].astype(str)


data = data.rename(columns={'annual leave':'annual_leave'})
data_test = data_test.rename(columns={'annual leave':'annual_leave'})

data = data.rename(columns={'Last_achievement_%':'Last_achievement'})
data_test = data_test.rename(columns={'Last_achievement_%':'Last_achievement'})


In [28]:
# get test data (for final evaluation)
X = data.drop(columns=['Best Performance'])
y = data['Best Performance']
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)

In [29]:
# len(X_train),len(X_val)

In [30]:
num_cols = list(X.select_dtypes(exclude=['object']))
cat_cols = list(X.select_dtypes(include=['object']))
features = list(X.columns)
len(num_cols),len(cat_cols),len(features)

(14, 7, 21)

# FE

In [11]:
class Feature_Engineering:
    def __init__(self,parameters):
        self.parameters = parameters
        self.target = parameters['target']
    
    @staticmethod  
    def check_col(col):
        if len(col.split(' '))>1:
            col2 = '_'.join(col.split(' '))
        else:
            col2 = col
        return col2
    
    @staticmethod
    def get_bin(data,col,n_bin,mode='cut'):
        while True:
            try:
                if mode=='cut':
                    _,bin_dummy = pd.cut(data[col],n_bin,retbins=True)
                else:
                    _,bin_dummy = pd.qcut(data[col],n_bin,retbins=True)
            except:
                n_bin -= 1
                continue
            break
        return bin_dummy
        
    def fit(self,data_ori):
        target = self.target
        data = data_ori.copy()
        for param in self.parameters['bin_numer_qcut']:
            col = param[0]
            n_bin = param[1]
            bin_dummy = self.get_bin(data,col,n_bin,mode='qcut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_numer_qcut',bin_dummy)
        for param in self.parameters['bin_numer_cut']:
            col = param[0]
            n_bin = param[1]
            bin_dummy = self.get_bin(data,col,n_bin,mode='cut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_numer_cut',bin_dummy)
            
            
        for param in self.parameters['bin_add_categ_numer_bin_qcut']:
            col = param[1]
            n_bin = param[2]
            bin_dummy = self.get_bin(data,col,n_bin,mode='qcut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_qcut_add_categ',bin_dummy)
        
        for param in self.parameters['bin_target_encoding_cut']:
            col = param[0]
            n_bin = param[1]
            bin_dummy = self.get_bin(data,col,n_bin,mode='cut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_cut',bin_dummy)
            
            data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy)
            data_dummy = data.groupby([f'{col}_bin_target_encoding_cut'])[target].mean().reset_index(drop=False)
            setattr(self,f'{col}_bin_target_encoding_cut',data_dummy)
            
        for param in self.parameters['bin_target_encoding_qcut']:
            col = param[0]
            n_bin = param[1]
            bin_dummy = self.get_bin(data,col,n_bin,mode='qcut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_qcut',bin_dummy)
            
            data[f'{col}_bin_target_encoding_qcut'] = pd.cut(data[col],bins=bin_dummy)
            data_dummy = data.groupby([f'{col}_bin_target_encoding_qcut'])[target].mean().reset_index(drop=False)
            setattr(self,f'{col}_bin_target_encoding_qcut',data_dummy)
           
        for param in self.parameters['bin_target_encoding_custom_bin']:
            col = param[0]
            bins = param[1]
            setattr(self,f'{col}_bin_custom_bin',bins)
            
            data[f'{col}_bin_target_encoding_custom_bin'] = pd.cut(data[col],bins=bins)
            data_dummy = data.groupby([f'{col}_bin_target_encoding_custom_bin'])[target].mean().reset_index(drop=False)
            setattr(self,f'{col}_bin_target_encoding_custom_bin',data_dummy)
        
        for param in self.parameters['categorical_mean_encoding']:
            col = param
            data[f'{col}_categorical_mean_encoding'] = data[col].copy().values
            data_dummy = data.groupby([f'{col}_categorical_mean_encoding'])[target].mean().reset_index(drop=False)
            setattr(self,f'{col}_categorical_mean_encoding',data_dummy)
        
        
        
        self.fit = True
        return data
        
    
    def transform(self,X,mode='train'):
        data = X.copy()
        target = self.target
        if mode!='train':
            target_encode = self.target
        else:
            target_encode = self.target +"_y"
            
        if self.fit==False:
            raise Exception("Fit to train data first")
        
        for param in self.parameters['bin_numer_qcut']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_numer_qcut')
            data[f'{col}_bin_numer_qcut'] = pd.cut(data[col],bins=bin_dummy).astype(str).values
        for param in self.parameters['bin_numer_cut']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_numer_cut')
            data[f'{col}_bin_numer_cut'] = pd.cut(data[col],bins=bin_dummy).astype(str).values
            
        for cols in self.parameters['bin_add_categ_numer_bin_qcut']:
            col_add = cols[0] + '_' + cols[1]
            bin_dummy = eval(f'self.{cols[1]}_bin_qcut_add_categ')
            data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
            data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = (data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)).values
        
        for param in self.parameters['bin_target_encoding_cut']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_cut')
            data_dummy = eval(f'self.{col}_bin_target_encoding_cut')
            data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy).values
            data[f'{col}_bin_target_encoding_cut'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_cut'])[f'{target_encode}'].values
        
        for param in self.parameters['bin_target_encoding_qcut']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_qcut')
            data_dummy = eval(f'self.{col}_bin_target_encoding_qcut')
            data[f'{col}_bin_target_encoding_qcut'] = pd.cut(data[col],bins=bin_dummy).values
            data[f'{col}_bin_target_encoding_qcut'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_qcut'])[f'{target_encode}'].values
        
        for param in self.parameters['bin_target_encoding_custom_bin']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_custom_bin')
            data_dummy = eval(f'self.{col}_bin_target_encoding_custom_bin')
            data[f'{col}_bin_target_encoding_custom_bin'] = pd.cut(data[col],bins=bin_dummy).values
            data[f'{col}_bin_target_encoding_custom_bin'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_custom_bin'])[f'{target_encode}'].values
        
        for param in self.parameters['categorical_mean_encoding']:
            col = param
            data_dummy = eval(f'self.{col}_categorical_mean_encoding')
            data[f'{col}_categorical_mean_encoding'] = data[col].copy().values
            data[f'{col}_categorical_mean_encoding'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_categorical_mean_encoding'])[f'{target_encode}'].values
        
        
        for cols in self.parameters['multiply']:
            data[cols[0] + 'x' +cols[1]] = (data[cols[0]] * data[cols[1]]).values
        for cols in self.parameters['add']:
            data[cols[0] + '+' +cols[1]] = (data[cols[0]] + data[cols[1]]).values
        for cols in self.parameters['add_str']:
            data[cols[0] + '+' +cols[1]] = (data[cols[0]].astype(str)+'_' + data[cols[1]].astype(str)).values
            
        for cols in self.parameters['substract']:
            data[cols[0] + '-' +cols[1]] = (data[cols[0]] - data[cols[1]]).values
        for cols in self.parameters['divide']:
            data[cols[0] + '/' +cols[1]] = (data[cols[0]] / np.where(data[cols[1]]==0,0.0001,data[cols[1]])).values
        
        
        
        
        return data
    
    

In [14]:
num_cols

['job_duration_in_current_job_level',
 'job_duration_in_current_person_level',
 'job_duration_in_current_branch',
 'age',
 'number_of_dependences',
 'GPA',
 'year_graduated',
 'job_duration_from_training',
 'branch_rotation',
 'job_rotation',
 'assign_of_otherposition',
 'annual_leave',
 'sick_leaves',
 'Last_achievement']

In [15]:
cat_cols

['job_level',
 'person_level',
 'Employee_type',
 'gender',
 'marital_status_maried(Y/N)',
 'Education_level',
 'Achievement_above_100%_during3quartal']

In [12]:


parameters = {'multiply':[['GPA','number_of_dependences']],
              'add':[['annual_leave','sick_leaves'],['assign_of_otherposition','branch_rotation']],
              'add_str':[['Education_level','job_level']],
              'substract':[],'divide':[],
              'bin_numer_qcut':[['GPA',10]],
              'bin_numer_cut':[],
              'bin_add_categ_numer_bin_qcut':[['job_level','GPA',5],['Education_level','GPA',5]],
            'bin_target_encoding_cut':[],
             'bin_target_encoding_qcut':[['year_graduated',5],['GPA',5],['annual_leave',5]],
             'bin_target_encoding_custom_bin':[],
              'categorical_mean_encoding':['job_level','person_level','Employee_type','Education_level'],
             'target':'Best Performance'
             
            
            }
data2 = data.loc[X_train.index].copy()
data2 = data2.rename(columns={'annual leave':'annual_leave'})

add_fe = Feature_Engineering(parameters)
add_fe.fit(data2)
data2_fe = add_fe.transform(data2)




In [13]:
data2_fe.iloc[:,-15:]

Unnamed: 0,Best Performance,GPA_bin_numer_qcut,job_level_GPA_bin_add_categ_numer_bin_qcut,Education_level_GPA_bin_add_categ_numer_bin_qcut,year_graduated_bin_target_encoding_qcut,GPA_bin_target_encoding_qcut,annual_leave_bin_target_encoding_qcut,job_level_categorical_mean_encoding,person_level_categorical_mean_encoding,Employee_type_categorical_mean_encoding,Education_level_categorical_mean_encoding,GPAxnumber_of_dependences,annual_leave+sick_leaves,assign_of_otherposition+branch_rotation,Education_level+job_level
0,0,"(2.66, 3.0]","JG04_(2.66, 3.0]","level_0_(2.66, 3.0]",0.149194,0.148230,0.147887,0.146259,0.147281,0.141348,0.000000,2.86,3,4,level_0_JG04
1,0,"(3.32, inf]","JG04_(3.32, inf]","level_1_(3.32, inf]",0.146625,0.137610,0.154563,0.146259,0.147281,0.141348,0.192857,0.00,1,4,level_1_JG04
2,0,"(-0.001, 2.66]","JG05_(-0.001, 2.66]","level_1_(-0.001, 2.66]",0.146625,0.156013,0.154563,0.150568,0.187831,0.141975,0.192857,0.00,3,16,level_1_JG05
3,1,"(-0.001, 2.66]","JG05_(-0.001, 2.66]","level_1_(-0.001, 2.66]",0.146625,0.156013,0.143847,0.150568,0.187831,0.141348,0.192857,0.00,14,11,level_1_JG05
4,0,"(3.32, inf]","JG05_(3.32, inf]","level_1_(3.32, inf]",0.146625,0.137610,0.147887,0.150568,0.187831,0.141348,0.192857,21.60,3,11,level_1_JG05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11148,0,"(3.32, inf]","JG04_(3.32, inf]","level_5_(3.32, inf]",0.160437,0.137610,0.143847,0.146259,0.147281,0.141348,0.139535,3.33,5,3,level_5_JG04
11149,0,"(2.66, 3.0]","JG04_(2.66, 3.0]","level_5_(2.66, 3.0]",0.160437,0.148230,0.132110,0.146259,0.147281,0.159172,0.139535,6.00,6,1,level_5_JG04
11150,0,"(3.32, inf]","JG04_(3.32, inf]","level_5_(3.32, inf]",0.160437,0.137610,0.143847,0.146259,0.147281,0.159172,0.139535,0.00,4,1,level_5_JG04
11151,0,"(3.32, inf]","JG04_(3.32, inf]","level_5_(3.32, inf]",0.130695,0.137610,0.143847,0.146259,0.147281,0.141348,0.139535,3.53,5,5,level_5_JG04


In [24]:
data2_fe.isnull().sum()

job_level                                           0
job_duration_in_current_job_level                   0
person_level                                        0
job_duration_in_current_person_level                0
job_duration_in_current_branch                      0
Employee_type                                       0
gender                                              0
age                                                 0
marital_status_maried(Y/N)                          0
number_of_dependences                               0
Education_level                                     0
GPA                                                 0
year_graduated                                      0
job_duration_from_training                          0
branch_rotation                                     0
job_rotation                                        0
assign_of_otherposition                             0
annual_leave                                        0
sick_leaves                 

In [25]:
data2_fe.shape

(11153, 36)

## Model with FE

In [141]:
def fast_build_model_FE(X,y,cv,Feature_Engineering,parameters,model_base=LogisticRegression(class_weight='balanced')):

    num_transformer = Pipeline(steps=[
                                    ('imputer', SimpleImputer(strategy = 'median')),
                                    ('scaler', RobustScaler())
                                    ])

    cat_transformer = Pipeline(steps=[
                                    ('imputer', SimpleImputer(strategy='most_frequent')),
                                    ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                    ])

    


    
    
    skf = StratifiedKFold(n_splits=2,random_state = 3,shuffle = True)


    # oof validation
    oof_y_valid = []
    oof_y_valid_pred = []
    oof_y_valid_pred_proba = []
    pipelines = []
    add_fes = []
    data = pd.concat([X,y],axis=1)
    aucs=[]
#     print(data.columns)
    for cv,(train_index, val_index) in enumerate(skf.split(X,y)):
        start_fit = time.time()
        data_train = data.iloc[train_index,:].copy()
#         data_val = data.iloc[val_index,:][features]
        
        add_fe = Feature_Engineering(parameters)
        add_fe.fit(data_train)
        
        X_train = add_fe.transform(data_train).drop(columns=[parameters['target']])
        num_cols_fe = list(X_train.select_dtypes(exclude='object').columns)
        cat_cols_fe = list(X_train.select_dtypes(include='object').columns)
        
        print(X_train.shape,data.shape)
        y_train = y.iloc[train_index]
        
        X_val = add_fe.transform(X.iloc[val_index,:],mode='val')
        y_val = y.iloc[val_index]
        print(X_val.shape,data.shape)
        
        transformer = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num_cols_fe),
            ('cat', cat_transformer, cat_cols_fe)
        ])
        
        main_pipeline = Pipeline(steps=[('transformer', transformer),
                          ('classifier', model_base)])
        

        
        add_fes.append(add_fe)
        model = clone(main_pipeline)
        model.fit(X_train,y_train.values.ravel())
        pred = model.predict(X_val)
        pred_proba = model.predict_proba(X_val)[:,1]
        oof_y_valid_pred.extend(pred)
        oof_y_valid_pred_proba.extend(pred_proba)
        oof_y_valid.extend(y_val.values)
        aucs.append(roc_auc_score(y_val.values, pred_proba,average='weighted'))
        pipelines.append(model)
        print(f'Fit iteration {cv} done in : {str(time.time()-start_fit)}')

    prec,rec,f1, _ = precision_recall_fscore_support(oof_y_valid,oof_y_valid_pred)
    auc = roc_auc_score(oof_y_valid, oof_y_valid_pred_proba,average='weighted')
    print(f'PRec Rec AUC average : {prec} {rec} <==> {auc}')
    print(aucs)
    print(np.mean(aucs[:2]))
    return add_fes,pipelines


def fast_predict_FE(data,add_fes,pipelines):
    X = data.copy()
#     pred = np.zeros(1,len(X))
    pred_proba = np.zeros((len(X)))
    for i in range(len(pipelines)):
        
        pred_proba += pipelines[i].predict_proba(add_fes[i].transform(X,mode='test'))[:,1] / len(pipelines)
    
    return pred_proba
        

In [99]:
num_cols

['job_duration_in_current_job_level',
 'job_duration_in_current_person_level',
 'job_duration_in_current_branch',
 'age',
 'number_of_dependences',
 'GPA',
 'year_graduated',
 'job_duration_from_training',
 'branch_rotation',
 'job_rotation',
 'assign_of_otherposition',
 'annual_leave',
 'sick_leaves',
 'Last_achievement']

In [100]:
cat_cols

['job_level',
 'person_level',
 'Employee_type',
 'gender',
 'marital_status_maried(Y/N)',
 'Education_level',
 'Achievement_above_100%_during3quartal']

In [16]:
data['job_duration_in_current_branch'].describe()

count    11153.000000
mean         1.034646
std          0.416723
min          0.000000
25%          0.707107
50%          1.118034
75%          1.224745
max          2.677686
Name: job_duration_in_current_branch, dtype: float64

In [327]:
parameters = {'multiply':[['GPA','number_of_dependences']],
              'add':[['annual_leave','sick_leaves'],['assign_of_otherposition','branch_rotation']],
              'add_str':[['Education_level','job_level']],
              'substract':[],'divide':[],
              'bin_numer_qcut':[],
              'bin_numer_cut':[],
              'bin_add_categ_numer_bin_qcut':[['job_level','GPA',5],['Education_level','GPA',5]],
            'bin_target_encoding_cut':[],
             'bin_target_encoding_qcut':[['year_graduated',5],['GPA',5],['annual_leave',5]],
             'bin_target_encoding_custom_bin':[],
              'categorical_mean_encoding':['job_level','person_level','Employee_type','Education_level'],
             'target':'Best Performance'}


# parameters = {'multiply':[['GPA','number_of_dependences']],
#               'add':[],
#               'add_str':[],
#               'substract':[],
#               'divide':[],
#               'bin_numer_qcut':[],
#               'bin_numer_cut':[],
#               'bin_add_categ_numer_bin_qcut':[],
#             'bin_target_encoding_cut':[],
#              'bin_target_encoding_qcut':[['year_graduated',5],['GPA',5],['annual_leave',5]],
#              'bin_target_encoding_custom_bin':[],
#               'categorical_mean_encoding':[],
#              'target':'Best Performance'}


# parameters = {'multiply':[],
#               'add':[],
#               'add_str':[],
#               'substract':[],
#               'divide':[],
#               'bin_numer_qcut':[],
#               'bin_numer_cut':[],
#               'bin_add_categ_numer_bin_qcut':[],
#             'bin_target_encoding_cut':[],
#              'bin_target_encoding_qcut':[],
#              'bin_target_encoding_custom_bin':[],
#               'categorical_mean_encoding':[],
#              'target':'Best Performance'}


### XGB

In [321]:
# parameters = {'multiply':[['GPA','number_of_dependences']],
#               'add':[],
#               'add_str':[],
#               'substract':[],
#               'divide':[],
#               'bin_numer_qcut':[['year_graduated',10]],
#               'bin_numer_cut':[],
#               'bin_add_categ_numer_bin_qcut':[['job_level','year_graduated',10],['job_level','GPA',10]],
#             'bin_target_encoding_cut':[['GPA',10]],
#              'bin_target_encoding_qcut':[],
#              'bin_target_encoding_custom_bin':[],
#               'categorical_mean_encoding':['Education_level'],
#              'target':'Best Performance'}

In [328]:
cv=3
add_fes,pipelines = fast_build_model_FE(X_train,y_train,cv,
                    Feature_Engineering,parameters,model_base=XGBClassifier(scale_pos_weight=3,random_state=0,n_estimators=100))



Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.



GPA
GPA
year_graduated
GPA
annual_leave
(7435, 34) (11153, 22)
(3718, 34) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'year_graduated_bin_target_encoding_qcut', 'GPA_bin_target_encoding_qcut', 'annual_leave_bin_target_encoding_qcut', 'job_level_categorical_mean_encoding', 'person_level_categorical_mean_encoding', 'Employee_type_categorical_mean_encoding', 'Education_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'annual_leave+sick_leaves', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Achievement_above_100%_during3quartal', 'job_level_GPA_bin_add_categ_numer_bin_qcut', 'Education_level_GPA_bin



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Fit iteration 0 done in : 1.6581380367279053
GPA
GPA
year_graduated
GPA
annual_leave
(7435, 34) (11153, 22)
(3718, 34) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'year_graduated_bin_target_encoding_qcut', 'GPA_bin_target_encoding_qcut', 'annual_leave_bin_target_encoding_qcut', 'job_level_categorical_mean_encoding', 'person_level_categorical_mean_encoding', 'Employee_type_categorical_mean_encoding', 'Education_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'annual_leave+sick_leaves', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Achievement_above_100%_during3quartal', 'job_level_GPA_bin_add_ca



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Fit iteration 1 done in : 1.544999361038208
GPA
GPA
year_graduated
GPA
annual_leave
(7436, 34) (11153, 22)
(3717, 34) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'year_graduated_bin_target_encoding_qcut', 'GPA_bin_target_encoding_qcut', 'annual_leave_bin_target_encoding_qcut', 'job_level_categorical_mean_encoding', 'person_level_categorical_mean_encoding', 'Employee_type_categorical_mean_encoding', 'Education_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'annual_leave+sick_leaves', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Achievement_above_100%_during3quartal', 'job_level_GPA_bin_add_cat



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Fit iteration 2 done in : 1.4361348152160645
PRec Rec AUC average : [0.85712914 0.20189702] [0.93810425 0.09102016] <==> 0.5471945715706794
[0.5285005820157144, 0.5573888858094407, 0.5571277346506704]
0.5429447339125775


0.5542127485894572 Not yet deployed

In [150]:
params = {
    'boosting_type':'gbdt',
     'max_depth': int(52.3160713621035),
      'n_estimators': int(278.30132378597926),
      'num_leaves': int(96.9938470728089),
      'reg_alpha': 0.04800396971863674,
      'reg_lambda': 0.047213768862403666,
      'scale_pos_weight': 3.8022455273872264,
     'random_state':0,
        'use_label_encoder':False,
    'verbosity':0
}
cv=2
add_fes,pipelines = fast_build_model_FE(X_train,y_train,cv,
                    Feature_Engineering,parameters,model_base=XGBClassifier(**params))


(5576, 35) (11153, 22)
(5577, 35) (11153, 22)
Fit iteration 0 done in : 4.034760475158691
(5577, 35) (11153, 22)
(5576, 35) (11153, 22)
Fit iteration 1 done in : 3.8839995861053467
PRec Rec AUC average : [0.85617077 0.22117647] [0.96521648 0.05742211] <==> 0.555872943180543
[0.552213840990638, 0.5596944690245023]
0.5559541550075702


In [278]:
# pred_proba = fast_predict_FE(data_test,add_fes,pipelines)

# df_submission = pd.DataFrame({'index':data_test.index,'Best Performance':pred_proba})
# df_submission

# df_submission.to_csv('df_submission_14feb_XGB10CV_05702_FE.csv',index=False)

In [40]:
cv=5
add_fes_achiv_str,pipelines_achiv_str = fast_build_model_FE(X_train,y_train,cv,
                    Feature_Engineering,parameters,model_base=XGBClassifier(scale_pos_weight=3,random_state=0))



GPA
GPA
year_graduated
GPA
annual_leave
(8922, 34) (11153, 22)
(2231, 34) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'year_graduated_bin_target_encoding_qcut', 'GPA_bin_target_encoding_qcut', 'annual_leave_bin_target_encoding_qcut', 'job_level_categorical_mean_encoding', 'person_level_categorical_mean_encoding', 'Employee_type_categorical_mean_encoding', 'Education_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'annual_leave+sick_leaves', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Achievement_above_100%_during3quartal', 'job_level_GPA_bin_add_categ_numer_bin_qcut', 'Education_level_GPA_bin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Fit iteration 0 done in : 1.4946510791778564
GPA
GPA
year_graduated
GPA
annual_leave
(8922, 34) (11153, 22)
(2231, 34) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'year_graduated_bin_target_encoding_qcut', 'GPA_bin_target_encoding_qcut', 'annual_leave_bin_target_encoding_qcut', 'job_level_categorical_mean_encoding', 'person_level_categorical_mean_encoding', 'Employee_type_categorical_mean_encoding', 'Education_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'annual_leave+sick_leaves', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Achievement_above_100%_during3quartal', 'job_level_GPA_bin_add_ca

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Fit iteration 1 done in : 1.6289997100830078
GPA
GPA
year_graduated
GPA
annual_leave
(8922, 34) (11153, 22)
(2231, 34) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'year_graduated_bin_target_encoding_qcut', 'GPA_bin_target_encoding_qcut', 'annual_leave_bin_target_encoding_qcut', 'job_level_categorical_mean_encoding', 'person_level_categorical_mean_encoding', 'Employee_type_categorical_mean_encoding', 'Education_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'annual_leave+sick_leaves', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Achievement_above_100%_during3quartal', 'job_level_GPA_bin_add_ca

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Fit iteration 2 done in : 1.5420007705688477
GPA
GPA
year_graduated
GPA
annual_leave
(8923, 34) (11153, 22)
(2230, 34) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'year_graduated_bin_target_encoding_qcut', 'GPA_bin_target_encoding_qcut', 'annual_leave_bin_target_encoding_qcut', 'job_level_categorical_mean_encoding', 'person_level_categorical_mean_encoding', 'Employee_type_categorical_mean_encoding', 'Education_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'annual_leave+sick_leaves', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Achievement_above_100%_during3quartal', 'job_level_GPA_bin_add_ca

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Fit iteration 3 done in : 1.596224308013916
GPA
GPA
year_graduated
GPA
annual_leave
(8923, 34) (11153, 22)
(2230, 34) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'year_graduated_bin_target_encoding_qcut', 'GPA_bin_target_encoding_qcut', 'annual_leave_bin_target_encoding_qcut', 'job_level_categorical_mean_encoding', 'person_level_categorical_mean_encoding', 'Employee_type_categorical_mean_encoding', 'Education_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'annual_leave+sick_leaves', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Achievement_above_100%_during3quartal', 'job_level_GPA_bin_add_cat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Fit iteration 4 done in : 1.5029981136322021
PRec Rec AUC average : [0.85862966 0.21907216] [0.93631778 0.1038485 ] <==> 0.552193097668127
[0.528619934212217, 0.5741912641144278, 0.5637071760891019, 0.5402928901894803, 0.5605891871999948]
0.5514055991633224


In [41]:
b = fast_predict_FE(data_test,add_fes_achiv_str,pipelines_achiv_str)

In [152]:
cv=5
add_fes,pipelines = fast_build_model_FE(X_train,y_train,cv,
                    Feature_Engineering,parameters,model_base=XGBClassifier(scale_pos_weight=3,random_state=0))



GPA
GPA
year_graduated
GPA
annual_leave
(8922, 34) (11153, 22)
(2231, 34) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'year_graduated_bin_target_encoding_qcut', 'GPA_bin_target_encoding_qcut', 'annual_leave_bin_target_encoding_qcut', 'job_level_categorical_mean_encoding', 'person_level_categorical_mean_encoding', 'Employee_type_categorical_mean_encoding', 'Education_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'annual_leave+sick_leaves', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Achievement_above_100%_during3quartal', 'job_level_GPA_bin_add_categ_numer_bin_qcut', 'Education_level_GPA_bin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Fit iteration 0 done in : 1.2809991836547852
GPA
GPA
year_graduated
GPA
annual_leave
(8922, 34) (11153, 22)
(2231, 34) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'year_graduated_bin_target_encoding_qcut', 'GPA_bin_target_encoding_qcut', 'annual_leave_bin_target_encoding_qcut', 'job_level_categorical_mean_encoding', 'person_level_categorical_mean_encoding', 'Employee_type_categorical_mean_encoding', 'Education_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'annual_leave+sick_leaves', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Achievement_above_100%_during3quartal', 'job_level_GPA_bin_add_ca

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Fit iteration 1 done in : 1.3779993057250977
GPA
GPA
year_graduated
GPA
annual_leave
(8922, 34) (11153, 22)
(2231, 34) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'year_graduated_bin_target_encoding_qcut', 'GPA_bin_target_encoding_qcut', 'annual_leave_bin_target_encoding_qcut', 'job_level_categorical_mean_encoding', 'person_level_categorical_mean_encoding', 'Employee_type_categorical_mean_encoding', 'Education_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'annual_leave+sick_leaves', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Achievement_above_100%_during3quartal', 'job_level_GPA_bin_add_ca

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Fit iteration 2 done in : 1.3370018005371094
GPA
GPA
year_graduated
GPA
annual_leave
(8923, 34) (11153, 22)
(2230, 34) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'year_graduated_bin_target_encoding_qcut', 'GPA_bin_target_encoding_qcut', 'annual_leave_bin_target_encoding_qcut', 'job_level_categorical_mean_encoding', 'person_level_categorical_mean_encoding', 'Employee_type_categorical_mean_encoding', 'Education_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'annual_leave+sick_leaves', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Achievement_above_100%_during3quartal', 'job_level_GPA_bin_add_ca

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Fit iteration 3 done in : 1.3499984741210938
GPA
GPA
year_graduated
GPA
annual_leave
(8923, 34) (11153, 22)
(2230, 34) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'year_graduated_bin_target_encoding_qcut', 'GPA_bin_target_encoding_qcut', 'annual_leave_bin_target_encoding_qcut', 'job_level_categorical_mean_encoding', 'person_level_categorical_mean_encoding', 'Employee_type_categorical_mean_encoding', 'Education_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'annual_leave+sick_leaves', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Achievement_above_100%_during3quartal', 'job_level_GPA_bin_add_ca

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Fit iteration 4 done in : 1.2589998245239258
PRec Rec AUC average : [0.85862966 0.21907216] [0.93631778 0.1038485 ] <==> 0.552193097668127
[0.528619934212217, 0.5741912641144278, 0.5637071760891019, 0.5402928901894803, 0.5605891871999948]
0.5514055991633224


In [75]:
(0.5375661732582941+0.5459191520449098)/2

0.541742662651602

In [130]:
pred_proba_XGB = fast_predict_FE(data_test,add_fes,pipelines)

In [24]:
df_submission = pd.DataFrame({'index':data_test.index,'Best Performance':pred_proba})
df_submission

Unnamed: 0,index,Best Performance
0,0,0.101839
1,1,0.208248
2,2,0.555681
3,3,0.094393
4,4,0.228342
...,...,...
5995,5995,0.232385
5996,5996,0.035990
5997,5997,0.129282
5998,5998,0.163177


In [25]:
# df_submission.to_csv('df_submission_10feb_XGBclassifier_FE.csv',index=False)

### Logreg

In [76]:
# parameters = {'multiply':[['GPA','number_of_dependences']],
#               'add':[['annual_leave','sick_leaves'],['assign_of_otherposition','branch_rotation']],
#               'add_str':[['Education_level','job_level']],
#               'substract':[],'divide':[],
#               'bin_numer_qcut':[],
#               'bin_numer_cut':[['GPA',30]],
#               'bin_add_categ_numer_bin_qcut':[['job_level','GPA',5],['Education_level','GPA',5]],
#             'bin_target_encoding_cut':[],
#              'bin_target_encoding_qcut':[['year_graduated',5],['GPA',5],['annual_leave',5]],
#              'bin_target_encoding_custom_bin':[],
#               'categorical_mean_encoding':['job_level','person_level','Employee_type','Education_level'],
#              'target':'Best Performance'}


parameters = {'multiply':[['GPA','number_of_dependences']],
              'add':[['assign_of_otherposition','branch_rotation']],
              'add_str':[['Education_level','job_level']],
              'substract':[],
              'divide':[],
              'bin_numer_qcut':[],
              'bin_numer_cut':[],
              'bin_add_categ_numer_bin_qcut':[],
            'bin_target_encoding_cut':[['GPA',20]],
             'bin_target_encoding_qcut':[],
             'bin_target_encoding_custom_bin':[],
              'categorical_mean_encoding':['person_level'],
             'target':'Best Performance'}

In [77]:
cv=3
add_fes,pipelines = fast_build_model_FE(X_train,y_train,cv,
                    Feature_Engineering,parameters,model_base=LogisticRegression(class_weight='balanced',random_state=0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_cut'])[f'{target_encode}'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-vers

(7435, 26) (11153, 22)
(3718, 26) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'Achievement_above_100%_during3quartal', 'GPA_bin_target_encoding_cut', 'person_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Education_level+job_level']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_targ

Fit iteration 0 done in : 0.31099748611450195
(7435, 26) (11153, 22)
(3718, 26) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'Achievement_above_100%_during3quartal', 'GPA_bin_target_encoding_cut', 'person_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Education_level+job_level']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_targ

Fit iteration 1 done in : 0.31099939346313477
(7436, 26) (11153, 22)
(3717, 26) (11153, 22)
['job_duration_in_current_job_level', 'job_duration_in_current_person_level', 'job_duration_in_current_branch', 'age', 'number_of_dependences', 'GPA', 'year_graduated', 'job_duration_from_training', 'branch_rotation', 'job_rotation', 'assign_of_otherposition', 'annual_leave', 'sick_leaves', 'Last_achievement', 'Achievement_above_100%_during3quartal', 'GPA_bin_target_encoding_cut', 'person_level_categorical_mean_encoding', 'GPAxnumber_of_dependences', 'assign_of_otherposition+branch_rotation']
['job_level', 'person_level', 'Employee_type', 'gender', 'marital_status_maried(Y/N)', 'Education_level', 'Education_level+job_level']
Fit iteration 2 done in : 0.30699968338012695
PRec Rec AUC average : [0.86139748 0.15668386] [0.5531736 0.4825901] <==> 0.5310840014040591
[0.525867942482066, 0.5393622770671951, 0.5270804169510742]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
# cv=5
add_fes,pipelines = fast_build_model_FE(X_train,y_train,cv,
                    Feature_Engineering,parameters,model_base=LogisticRegression(class_weight='balanced'))



GPA
GPA
year_graduated
GPA
annual_leave
(8922, 34) (11153, 22)
(2231, 34) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Fit iteration 0 done in : 0.6580028533935547
GPA
GPA
year_graduated
GPA
annual_leave
(8922, 34) (11153, 22)
(2231, 34) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Fit iteration 1 done in : 0.6779980659484863
GPA
GPA
year_graduated
GPA
annual_leave
(8922, 34) (11153, 22)
(2231, 34) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Fit iteration 2 done in : 0.6340024471282959
GPA
GPA
year_graduated
GPA
annual_leave
(8923, 34) (11153, 22)
(2230, 34) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Fit iteration 3 done in : 0.6349983215332031
GPA
GPA
year_graduated
GPA
annual_leave
(8923, 34) (11153, 22)
(2230, 34) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Fit iteration 4 done in : 0.6180019378662109
PRec Rec AUC average : [0.86192246 0.15570494] [0.51166456 0.52351863] <==> 0.5251128344301582


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### LGBM

In [128]:
parameters = {'multiply':[['GPA','number_of_dependences']],
              'add':[['annual_leave','sick_leaves'],['assign_of_otherposition','branch_rotation']],
              'add_str':[['Education_level','job_level']],
              'substract':[],'divide':[],
              'bin_numer_qcut':[],
              'bin_numer_cut':[['GPA',30]],
              'bin_add_categ_numer_bin_qcut':[['job_level','GPA',5],['Education_level','GPA',5]],
            'bin_target_encoding_cut':[],
             'bin_target_encoding_qcut':[['year_graduated',5],['GPA',5],['annual_leave',5]],
             'bin_target_encoding_custom_bin':[],
              'categorical_mean_encoding':['job_level','person_level','Employee_type','Education_level'],
             'target':'Best Performance'}


parameters = {'multiply':[],
              'add':[],
              'add_str':[],
              'substract':[],
              'divide':[],
              'bin_numer_qcut':[],
              'bin_numer_cut':[],
              'bin_add_categ_numer_bin_qcut':[],
            'bin_target_encoding_cut':[],
             'bin_target_encoding_qcut':[],
             'bin_target_encoding_custom_bin':[],
              'categorical_mean_encoding':[],
             'target':'Best Performance'}

In [143]:
parameters = {'multiply':[['GPA','number_of_dependences']],
              'add':[['annual_leave','sick_leaves'],['assign_of_otherposition','branch_rotation']],
              'add_str':[['Education_level','job_level']],
              'substract':[],'divide':[],
              'bin_numer_qcut':[],
              'bin_numer_cut':[['GPA',30]],
              'bin_add_categ_numer_bin_qcut':[['job_level','GPA',5],['Education_level','GPA',5]],
            'bin_target_encoding_cut':[],
             'bin_target_encoding_qcut':[['year_graduated',5],['GPA',5],['annual_leave',5]],
             'bin_target_encoding_custom_bin':[],
              'categorical_mean_encoding':['job_level','person_level','Employee_type','Education_level'],
             'target':'Best Performance'}


# parameters = {'multiply':[['GPA','number_of_dependences']],
#               'add':[['annual_leave','sick_leaves'],['assign_of_otherposition','branch_rotation']],
#               'add_str':[],
#               'substract':[],
#               'divide':[],
#               'bin_numer_qcut':[['GPA',30]],
#               'bin_numer_cut':[],
#               'bin_add_categ_numer_bin_qcut':[['job_level','GPA',10]],
#             'bin_target_encoding_cut':[['year_graduated',20]],
#              'bin_target_encoding_qcut':[],
#              'bin_target_encoding_custom_bin':[],
#               'categorical_mean_encoding':['Employee_type'],
#              'target':'Best Performance'}


# parameters = {'multiply':[['GPA','number_of_dependences']],
#               'add':[['assign_of_otherposition','branch_rotation']],
#               'add_str':[['Education_level','job_level']],
#               'substract':[],
#               'divide':[],
#               'bin_numer_qcut':[],
#               'bin_numer_cut':[['job_duration_in_current_person_level',5],['job_duration_in_current_branch',10],
#                               ],
#               'bin_add_categ_numer_bin_qcut':[],
#             'bin_target_encoding_cut':[],
#              'bin_target_encoding_qcut':[],
#              'bin_target_encoding_custom_bin':[],
#               'categorical_mean_encoding':['Employee_type','Education_level'],
#              'target':'Best Performance'}

In [144]:
# num_data.describe()

In [136]:
str
cv=2
add_fes,pipelines = fast_build_model_FE(X_train,y_train,cv,
                    Feature_Engineering,parameters,model_base=LGBMClassifier(scale_pos_weight=3,random_state=0))

GPA
GPA
year_graduated
GPA
annual_leave
(5576, 37) (11153, 22)
(5577, 37) (11153, 22)
Fit iteration 0 done in : 0.8711376190185547
GPA
GPA
year_graduated
GPA
annual_leave
(5577, 37) (11153, 22)
(5576, 37) (11153, 22)
Fit iteration 1 done in : 0.794001579284668
PRec Rec AUC average : [0.85744076 0.22056385] [0.9506095  0.08124618] <==> 0.5520841598357447
[0.5522217962318845, 0.5514768075592157]
0.5518493018955501


In [152]:
model_params = {
        'boosting_type':'gbdt',
         'num_leaves':int(60.80441469032743),
         'max_depth':int(5.663953027572844),
         'n_estimators':int(52.29955231632742),
         'objective':'binary',
         'class_weight':'balanced',
         'reg_alpha':0.0070860708219850025,
         'reg_lambda':0.04598746020415847,
         'random_state':0}
str
cv=2
add_fes,pipelines = fast_build_model_FE(X_train,y_train,cv,
                    Feature_Engineering,parameters,model_base=LGBMClassifier(**model_params))

(5576, 35) (11153, 22)
(5577, 35) (11153, 22)
Fit iteration 0 done in : 0.6199672222137451
(5577, 35) (11153, 22)
(5576, 35) (11153, 22)
Fit iteration 1 done in : 0.5829989910125732
PRec Rec AUC average : [0.8690032  0.17922719] [0.68526692 0.3995113 ] <==> 0.5640364760068436
[0.5613977820787404, 0.5668567724311442]
0.5641272772549423


In [153]:
pred_proba = fast_predict_FE(data_test.copy(),add_fes,pipelines)

df_submission = pd.DataFrame({'index':data_test.index,'Best Performance':pred_proba})
df_submission

df_submission.to_csv('df_submission_15feb_LGBM2CV0564TUNE_FE.csv',index=False)

In [27]:
cv=5
add_fes,pipelines = fast_build_model_FE(X_train,y_train,cv,
                    Feature_Engineering,parameters,model_base=LGBMClassifier(scale_pos_weight=3,random_state=0))



GPA
GPA
year_graduated
GPA
annual_leave
(8922, 34) (11153, 22)
(2231, 34) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Fit iteration 0 done in : 0.7319755554199219
GPA
GPA
year_graduated
GPA
annual_leave
(8922, 34) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

(2231, 34) (11153, 22)
Fit iteration 1 done in : 0.6710000038146973
GPA
GPA
year_graduated
GPA
annual_leave
(8922, 34) (11153, 22)
(2231, 34) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Fit iteration 2 done in : 0.6429998874664307
GPA
GPA
year_graduated
GPA
annual_leave
(8923, 34) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

(2230, 34) (11153, 22)
Fit iteration 3 done in : 0.643000602722168
GPA
GPA
year_graduated
GPA
annual_leave
(8923, 34) (11153, 22)
(2230, 34) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Fit iteration 4 done in : 0.6579952239990234
PRec Rec AUC average : [0.85608267 0.20669291] [0.95765027 0.06414172] <==> 0.5499872189025177


# ensemble

In [269]:
class Ensemble_Models():
    def __init__(self,list_models):
        self.list_models = list_models
        
    def fit(self,X,y):
        
        num_transformer = Pipeline(steps=[
                                    ('imputer', SimpleImputer(strategy = 'median')),
                                    ('scaler', RobustScaler())
                                    ])

        cat_transformer = Pipeline(steps=[
                                        ('imputer', SimpleImputer(strategy='most_frequent')),
                                        ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                        ])
        num_cols_fe = list(X.select_dtypes(exclude='object').columns)
        cat_cols_fe = list(X.select_dtypes(include='object').columns)
        
        
        transformer = ColumnTransformer(
            transformers=[
                ('num', num_transformer, num_cols_fe),
                ('cat', cat_transformer, cat_cols_fe)
            ])
        pipelines=[]
        for mod in self.list_models:
            main_pipeline = Pipeline(steps=[('transformer', transformer),
                          ('classifier', mod)])
            model = clone(main_pipeline)
            model.fit(X,y)
            pipelines.append(model)
        
        self.pipelines = pipelines
        
    def predict_proba(self,X):
        predict_proba = np.zeros(len(X))
        for i,pipe in enumerate(self.pipelines):
            predict_proba += pipe.predict_proba(X)[:,1] / len(self.pipelines)
            
        return predict_proba
    
    def predict(self,X,threshold=0.5):
        predict_proba = np.zeros(len(X))
        for i,pipe in enumerate(self.pipelines):
            predict_proba += pipe.predict_proba(X)[:,1] / len(self.pipelines)
        
        predict = np.where(predict_proba>threshold,1,0)
        
        return predict
            
            
        

In [270]:
def fast_build_model_FE_ensemble(X,y,cv,Feature_Engineering,parameters,list_models=[]):

    
    skf = StratifiedKFold(n_splits=cv,random_state = 0)

    # oof validation
    oof_y_valid = []
    oof_y_valid_pred = []
    oof_y_valid_pred_proba = []
    pipelines = []
    add_fes = []
    data = pd.concat([X,y],axis=1)
    aucs = []
    
#     print(data.columns)
    for cv,(train_index, val_index) in enumerate(skf.split(X,y)):
        start_fit = time.time()
        data_train = data.iloc[train_index,:].copy()
#         data_val = data.iloc[val_index,:][features]
        
        add_fe = Feature_Engineering(parameters)
        add_fe.fit(data_train)
        
        X_train = add_fe.transform(data_train).drop(columns=[parameters['target']])
#         X_train = data_train.drop(columns=[parameters['target']])
    
        print(X_train.shape,data.shape)
        y_train = y.iloc[train_index]
        
        X_val = add_fe.transform(X.iloc[val_index,:],mode='val')
#         X_val = X.iloc[val_index,:]
    
        y_val = y.iloc[val_index]
        print(X_val.shape,data.shape)
        
        model = Ensemble_Models(list_models)

        
        add_fes.append(add_fe)
        model.fit(X_train,y_train.values.ravel())
        pred = model.predict(X_val)
        pred_proba = model.predict_proba(X_val)
        
        oof_y_valid_pred.extend(pred)
        oof_y_valid_pred_proba.extend(pred_proba)
        oof_y_valid.extend(y_val.values)

        pipelines.append(model)
        
        
        
        aucs.append(roc_auc_score(y_val.values, pred_proba,average='micro'))
        
        
        print(f'Fit iteration {cv} done in : {str(time.time()-start_fit)}')

    prec,rec,f1, _ = precision_recall_fscore_support(oof_y_valid,oof_y_valid_pred)
    auc = roc_auc_score(oof_y_valid, oof_y_valid_pred_proba,average='micro')
    print(f'PRec Rec AUC average : {prec} {rec} <==> {auc}')
    print(aucs)
    print(np.mean(aucs[:2]))
    
    return add_fes,pipelines


def fast_predict_FE_ensemble(X,add_fes,pipelines):
    data = X.copy()
#     pred = np.zeros(1,len(X))
    pred_proba = np.zeros((len(X)))
    for i in range(len(pipelines)):
        
        pred_proba += pipelines[i].predict_proba(add_fes[i].transform(data,mode='test')) / len(pipelines)
    
    return pred_proba
        

In [283]:
list_models = [
#     LogisticRegression(class_weight='balanced',random_state = 0),
    XGBClassifier(scale_pos_weight=3,random_state = 0),
    LGBMClassifier(scale_pos_weight=3,random_state = 0),
    RandomForestClassifier(class_weight='balanced',random_state = 0),
#     MLPClassifier(hidden_layer_sizes=(100,),activation='identity',random_state = 0)
]



In [284]:
parameters = {'multiply':[['GPA','number_of_dependences']],
              'add':[['annual_leave','sick_leaves'],['assign_of_otherposition','branch_rotation']],
              'add_str':[['Education_level','job_level']],
              'substract':[],'divide':[],
              'bin_numer_qcut':[],
              'bin_numer_cut':[],
              'bin_add_categ_numer_bin_qcut':[['job_level','GPA',5],['Education_level','GPA',5]],
            'bin_target_encoding_cut':[],
             'bin_target_encoding_qcut':[['year_graduated',5],['GPA',5],['annual_leave',5]],
             'bin_target_encoding_custom_bin':[],
              'categorical_mean_encoding':['job_level','person_level','Employee_type','Education_level'],
             'target':'Best Performance'}

# parameters = {'multiply':[],
#               'add':[],
#               'add_str':[],
#               'substract':[],'divide':[],
#               'bin_numer_qcut':[],
#               'bin_numer_cut':[],
#               'bin_add_categ_numer_bin_qcut':[],
#             'bin_target_encoding_cut':[['GPA',30]],
#              'bin_target_encoding_qcut':[],
#              'bin_target_encoding_custom_bin':[],
#               'categorical_mean_encoding':[],
#              'target':'Best Performance'}

In [285]:
cv=10
add_fes_str,pipelines_str = fast_build_model_FE_ensemble(X_train,y_train,cv,
                    Feature_Engineering,parameters,list_models=list_models)


Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.



GPA
GPA
year_graduated
GPA
annual_leave
(9033, 34) (10037, 22)
(1004, 34) (10037, 22)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Fit iteration 0 done in : 3.4611732959747314
GPA
GPA
year_graduated
GPA
annual_leave
(9033, 34) (10037, 22)
(1004, 34) (10037, 22)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Fit iteration 1 done in : 3.547661542892456
GPA
GPA
year_graduated
GPA
annual_leave
(9033, 34) (10037, 22)
(1004, 34) (10037, 22)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Fit iteration 2 done in : 3.696000337600708
GPA
GPA
year_graduated
GPA
annual_leave
(9033, 34) (10037, 22)
(1004, 34) (10037, 22)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Fit iteration 3 done in : 3.838998556137085
GPA
GPA
year_graduated
GPA
annual_leave
(9033, 34) (10037, 22)
(1004, 34) (10037, 22)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Fit iteration 4 done in : 3.9938435554504395
GPA
GPA
year_graduated
GPA
annual_leave
(9033, 34) (10037, 22)
(1004, 34) (10037, 22)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Fit iteration 5 done in : 4.347000360488892
GPA
GPA
year_graduated
GPA
annual_leave
(9033, 34) (10037, 22)
(1004, 34) (10037, 22)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Fit iteration 6 done in : 4.32400107383728
GPA
GPA
year_graduated
GPA
annual_leave
(9034, 34) (10037, 22)
(1003, 34) (10037, 22)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Fit iteration 7 done in : 4.467153310775757
GPA
GPA
year_graduated
GPA
annual_leave
(9034, 34) (10037, 22)
(1003, 34) (10037, 22)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Fit iteration 8 done in : 4.683999061584473
GPA
GPA
year_graduated
GPA
annual_leave
(9034, 34) (10037, 22)
(1003, 34) (10037, 22)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Fit iteration 9 done in : 5.395112752914429
PRec Rec AUC average : [0.85593476 0.25961538] [0.99102459 0.01851852] <==> 0.5718078774357457
[0.5900708880160935, 0.5850416706581091, 0.5772583580802758, 0.6289555193664783, 0.5740332726634096, 0.5449436408340518, 0.5569259507615671, 0.533936178763765, 0.5465798569246845, 0.579634276945701]
0.5875562793371013


In [286]:
pred_proba = fast_predict_FE_ensemble(X_val,add_fes_str,pipelines_str)
roc_auc_score(y_val.values, pred_proba,average='micro')


Given feature/column names or counts do not match the ones for the data given during fit. This will fail from v0.24.


Given feature/column names or counts do not match the ones for the data given during fit. This will fail from v0.24.


Given feature/column names or counts do not match the ones for the data given during fit. This will fail from v0.24.


Given feature/column names or counts do not match the ones for the data given during fit. This will fail from v0.24.


Given feature/column names or counts do not match the ones for the data given during fit. This will fail from v0.24.


Given feature/column names or counts do not match the ones for the data given during fit. This will fail from v0.24.


Given feature/column names or counts do not match the ones for the data given during fit. This will fail from v0.24.


Given feature/column names or counts do not match the ones for the data given during fit. This will fail from v0.24.


Given feature/column names or counts do not mat

0.5762417796008895

In [136]:
pred_proba = fast_predict_FE_ensemble(data_test,add_fes_str,pipelines_str)

# df_submission = pd.DataFrame({'index':data_test.index,'Best Performance':pred_proba})
# df_submission

# df_submission.to_csv('df_submission_13feb_ENSEMBLESTR5CV_FE.csv',index=False)

In [138]:
(pred_proba_ENSE==pred_proba_XGB).all()

False

In [148]:
(pred_proba_ENSE.round(4)==pred_proba_XGB.round(4)).all()

True

In [145]:
pred_proba_XGB[0]

0.17933059739880264

In [146]:
pred_proba_ENSE[0]

0.17933059819042685

In [192]:
cv=5
add_fes,pipelines = fast_build_model_FE_ensemble(X_train,y_train,cv,
                    Feature_Engineering,parameters,list_models=list_models)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_cut'])[f'{target_encode}'].values


(8922, 22) (11153, 22)
(2231, 22) (11153, 22)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fit iteration 0 done in : 3.1989974975585938
(8922, 22) (11153, 22)
(2231, 22) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_cut'])[f'{target_encode}'].values
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver option

Fit iteration 1 done in : 3.2701711654663086
(8922, 22) (11153, 22)
(2231, 22) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_cut'])[f'{target_encode}'].values
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver option

Fit iteration 2 done in : 3.7990024089813232
(8923, 22) (11153, 22)
(2230, 22) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_cut'])[f'{target_encode}'].values
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver option

Fit iteration 3 done in : 3.40883731842041
(8923, 22) (11153, 22)
(2230, 22) (11153, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{col}_bin_target_encoding_cut'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_cut'])[f'{target_encode}'].values
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver option

Fit iteration 4 done in : 3.3530004024505615
PRec Rec AUC average : [0.85363661 0.3       ] [0.99779319 0.00549786] <==> 0.5593385721068307
[0.5534365122195667, 0.574695922997065, 0.5622765082091179, 0.5345414692076409, 0.5722752261438161]
0.5640662176083158


0.5548748462799754

In [117]:
# import pickle
# with open("pipelines_0567_ensembleFE.pkl", "wb") as f:
#     pickle.dump(pipelines, f)

In [121]:
# pipelines[0].pipelines[0]['transformer'].transform(data2_fe.drop(columns=['Best Performance']).copy()).shape

In [122]:
pipelines[0].pipelines[0]['transformer']

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', RobustScaler())]),
                                 ['job_duration_in_current_job_level',
                                  'job_duration_in_current_person_level',
                                  'job_duration_in_current_branch', 'age',
                                  'number_of_dependences', 'GPA',
                                  'year_graduated',
                                  'job_duration_from_training',
                                  'branch_rotation', 'job_rotation',
                                  'assign_of_otherposition', 'annual_leave',
                                  'sick_leaves', 'Last_achievement',
                                  'Achievement_above_100%_during3quartal',
                                  'GPA_bin_

In [123]:
data2_fe.shape

(11153, 36)

In [124]:
# cv=10
# add_fes,pipelines = fast_build_model_FE_ensemble(X_train,y_train,cv,
#                     Feature_Engineering,parameters,list_models=list_models)

In [125]:
len(add_fes)

5

In [126]:
pred_proba = fast_predict_FE_ensemble(data_test,add_fes,pipelines)

In [127]:
df_submission = pd.DataFrame({'index':data_test.index,'Best Performance':pred_proba})
df_submission

Unnamed: 0,index,Best Performance
0,0,0.230511
1,1,0.273958
2,2,0.386696
3,3,0.220155
4,4,0.253016
...,...,...
5995,5995,0.240473
5996,5996,0.168555
5997,5997,0.247632
5998,5998,0.196402


In [45]:
# df_submission.to_csv('df_submission_12feb_Ensemble_FE.csv',index=False)

In [14]:
15+22

37

# Embeding

In [15]:
cat_cols

['job_level',
 'person_level',
 'Employee_type',
 'gender',
 'marital_status_maried(Y/N)',
 'Education_level',
 'Achievement_above_100%_during3quartal']

In [19]:
import pandas as pd
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras.layers import Embedding, Dense, Flatten
import os

In [21]:
for col in cat_cols:
    print(pd.crosstab(data[col],data['Best Performance']))
    
    print('===========\n')

Best Performance     0     1
job_level                   
JG03                28     8
JG04              8890  1523
JG05               598   106

Best Performance     0     1
person_level                
PG01                14     5
PG02                13     3
PG03              8233  1422
PG04               656   101
PG05               276    32
PG06               307    71
PG07                17     2
PG08                 0     1

Best Performance     0     1
Employee_type               
RM_type_A         6257  1030
RM_type_B         2842   538
RM_type_C          417    69

Best Performance     0     1
gender                      
1                 2483   466
2                 7033  1171

Best Performance               0     1
marital_status_maried(Y/N)            
N                           1392   260
Y                           8124  1377

Best Performance     0     1
Education_level             
level_0              1     0
level_1            113    27
level_2              4     

In [35]:
def create_model():
    embedding_size = 5

    model = models.Sequential()
    model.add(Embedding(input_dim = 8, output_dim = embedding_size, input_length = 1, name="embedding"))
    model.add(Flatten())
    model.add(Dense(50))
    model.add(Dense(15))
    model.add(Dense(1, activation='softmax'))
    return model

In [36]:
data_skf = {'train':[],'val':[]}
skf = StratifiedKFold(n_splits=2,random_state = 3,shuffle = True)
parameters = {'multiply':[['GPA','number_of_dependences']],
              'add':[['annual_leave','sick_leaves'],['assign_of_otherposition','branch_rotation']],
              'add_str':[['Education_level','job_level']],
              'substract':[],'divide':[],
              'bin_numer_qcut':[],
              'bin_numer_cut':[['GPA',30]],
              'bin_add_categ_numer_bin_qcut':[['job_level','GPA',5],['Education_level','GPA',5]],
            'bin_target_encoding_cut':[],
             'bin_target_encoding_qcut':[['year_graduated',5],['GPA',5],['annual_leave',5]],
             'bin_target_encoding_custom_bin':[],
              'categorical_mean_encoding':['job_level','person_level','Employee_type','Education_level'],
             'target':'Best Performance'}








In [43]:
X['person_level']

0        PG03
1        PG03
2        PG06
3        PG06
4        PG06
         ... 
11148    PG03
11149    PG03
11150    PG03
11151    PG03
11152    PG04
Name: person_level, Length: 11153, dtype: object

['job_level',
 'person_level',
 'Employee_type',
 'gender',
 'marital_status_maried(Y/N)',
 'Education_level',
 'Achievement_above_100%_during3quartal']

In [45]:
from sklearn.preprocessing import LabelEncoder


for train_index,val_index in skf.split(X,y):
    X_train = X.iloc[train_index,:]['person_level'].values
    y_train = y.iloc[train_index]

    X_val = X.iloc[val_index,:]['person_level'].values
    y_val = y.iloc[val_index]
    
    LE = LabelEncoder()
    X_train = LE.fit_transform(X_train)
    X_val = LE.transform(X_val)
    
    model = create_model()
    model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics=["accuracy"])
    model.fit(x = X_train, y=y_train.values ,
              validation_data = (X_val,y_val.values), epochs = 5, batch_size = 32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50

KeyboardInterrupt: 

- Add str to some numeric
    - Try XGB with str achiv how is the score
    
- yeo johnson transformation
- Kmeans / DBSCAn
- Remove Outlier
- Embedding
- Tuning
- Rule based for more specialized model
- Using no Add fe and ensemble of add _fe etc