# Module and Function

In [87]:
import pandas as pd
import numpy as np
import os

import seaborn as sns
import matplotlib.pyplot as plt
import random


import os
import time
import datetime
import calendar

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.model_selection import KFold, StratifiedKFold


from sklearn import clone
from sklearn.linear_model import LogisticRegression


from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_recall_fscore_support, roc_auc_score


# from utils_model import * # expand later

In [88]:
def fast_build_model_FE(X,y,cv,Feature_Engineering,parameters, model_base=LogisticRegression(class_weight='balanced'),random_state=0,shuffle=False):

    num_transformer = Pipeline(steps=[
                                    ('imputer', SimpleImputer(strategy = 'median')),
                                    ('scaler', RobustScaler())
                                    ])

    cat_transformer = Pipeline(steps=[
                                    ('imputer', SimpleImputer(strategy='most_frequent')),
                                    ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                    ])

    


    
    
    skf = StratifiedKFold(n_splits=cv,random_state = random_state,shuffle = shuffle)


    # oof validation
    oof_y_valid = []
    oof_y_valid_pred = []
    oof_y_valid_pred_proba = []
    pipelines = []
    add_fes = []
    data = pd.concat([X,y],axis=1)
    aucs=[]
    
    for cv,(train_index, val_index) in enumerate(skf.split(X,y)):
        start_fit = time.time()
        data_train = data.iloc[train_index,:].copy()
        
        add_fe = Feature_Engineering(parameters)
        add_fe.fit(data_train)
        
        X_train = add_fe.transform(data_train).drop(columns=[parameters['target']])
        y_train = y.iloc[train_index]
        X_val = add_fe.transform(X.iloc[val_index,:],mode='val')
        y_val = y.iloc[val_index]
        
        num_cols_fe = list(X_train.select_dtypes(exclude='object').columns)
        cat_cols_fe = list(X_train.select_dtypes(include='object').columns)        
        
#         print(X_train.shape,X_val.shape,data.shape)
        
        transformer = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num_cols_fe),
            ('cat', cat_transformer, cat_cols_fe)
        ])
        
        main_pipeline = Pipeline(steps=[('transformer', transformer),
                          ('classifier', model_base)])
        

        
        add_fes.append(add_fe)
        model = clone(main_pipeline)
        model.fit(X_train,y_train.values.ravel())
        pred = model.predict(X_val)
        pred_proba = model.predict_proba(X_val)[:,1]
        oof_y_valid_pred.extend(pred)
        oof_y_valid_pred_proba.extend(pred_proba)
        oof_y_valid.extend(y_val.values)
        aucs.append(roc_auc_score(y_val.values, pred_proba,average='weighted'))
        pipelines.append(model)
        print(f'Fit iteration {cv} done in : {str(time.time()-start_fit)}')

    prec,rec,f1, _ = precision_recall_fscore_support(oof_y_valid,oof_y_valid_pred)
    auc = roc_auc_score(oof_y_valid, oof_y_valid_pred_proba,average='weighted')
    print(f'PRec Rec AUC average : {prec} {rec} <==> {auc}')
    print(aucs)
    print(np.mean(aucs[:2]))
    return add_fes,pipelines

In [89]:
def fast_predict_FE(data,add_fes,pipelines):
    X = data.copy()
#     pred = np.zeros(1,len(X))
    pred_proba = np.zeros((len(X)))
    dude={}
    for i in range(len(pipelines)):
        
        pred_proba += pipelines[i].predict_proba(add_fes[i].transform(X,mode='test'))[:,1] / len(pipelines)
        dude[i] = pipelines[i].predict_proba(add_fes[i].transform(X,mode='test'))[:,1]
    
    return pred_proba,dude



        

In [90]:
class Feature_Engineering:
    def __init__(self,parameters):
        self.parameters = parameters
        self.target = parameters['target']
    
    @staticmethod  
    def check_col(col):
        if len(col.split(' '))>1:
            col2 = '_'.join(col.split(' '))
        else:
            col2 = col
        return col2
    
    @staticmethod
    def get_bin(data,col,n_bin,mode='cut'):
        while True:
            try:
                if mode=='cut':
                    _,bin_dummy = pd.cut(data[col],n_bin,retbins=True)
                else:
                    _,bin_dummy = pd.qcut(data[col],n_bin,retbins=True)
            except:
                n_bin -= 1
                continue
            break
        return bin_dummy
        
    def fit(self,data_ori):
        target = self.target
        data = data_ori.copy()
        for param in self.parameters['bin_numer_qcut']:
            col = param[0]
            n_bin = param[1]
            bin_dummy = self.get_bin(data,col,n_bin,mode='qcut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_numer_qcut',bin_dummy)
        for param in self.parameters['bin_numer_cut']:
            col = param[0]
            n_bin = param[1]
            bin_dummy = self.get_bin(data,col,n_bin,mode='cut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_numer_cut',bin_dummy)
            
            
        for param in self.parameters['bin_add_categ_numer_bin_cut']:
            col = param[1]
            n_bin = param[2]
            bin_dummy = self.get_bin(data,col,n_bin,mode='cut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_cut_add_categ',bin_dummy)
            
        for param in self.parameters['bin_add_categ_numer_bin_qcut']:
            col = param[1]
            n_bin = param[2]
            bin_dummy = self.get_bin(data,col,n_bin,mode='qcut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_qcut_add_categ',bin_dummy)
        
        for param in self.parameters['bin_target_encoding_cut']:
            col = param[0]
            n_bin = param[1]
            bin_dummy = self.get_bin(data,col,n_bin,mode='cut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_cut',bin_dummy)
            
            data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy)
            data_dummy = data.groupby([f'{col}_bin_target_encoding_cut'])[target].mean().reset_index(drop=False)
            setattr(self,f'{col}_bin_target_encoding_cut',data_dummy)
            
        for param in self.parameters['bin_target_encoding_qcut']:
            col = param[0]
            n_bin = param[1]
            bin_dummy = self.get_bin(data,col,n_bin,mode='qcut')
            bin_dummy[0] = bin_dummy[0]-0.001
            bin_dummy[-1] = np.inf
            setattr(self,f'{col}_bin_qcut',bin_dummy)
            
            data[f'{col}_bin_target_encoding_qcut'] = pd.cut(data[col],bins=bin_dummy)
            data_dummy = data.groupby([f'{col}_bin_target_encoding_qcut'])[target].mean().reset_index(drop=False)
            setattr(self,f'{col}_bin_target_encoding_qcut',data_dummy)
           
        for param in self.parameters['bin_target_encoding_custom_bin']:
            col = param[0]
            bins = param[1]
            setattr(self,f'{col}_bin_custom_bin',bins)
            
            data[f'{col}_bin_target_encoding_custom_bin'] = pd.cut(data[col],bins=bins)
            data_dummy = data.groupby([f'{col}_bin_target_encoding_custom_bin'])[target].mean().reset_index(drop=False)
            setattr(self,f'{col}_bin_target_encoding_custom_bin',data_dummy)
        
        for param in self.parameters['categorical_mean_encoding']:
            col = param
            data[f'{col}_categorical_mean_encoding'] = data[col].copy().values
            data_dummy = data.groupby([f'{col}_categorical_mean_encoding'])[target].mean().reset_index(drop=False)
            setattr(self,f'{col}_categorical_mean_encoding',data_dummy)
        
        
        
        self.fit = True
        return data
        
    
    def transform(self,X,mode='train'):
        data = X.copy()
        target = self.target
        if mode!='train':
            target_encode = self.target
        else:
            target_encode = self.target +"_y"
            
        if self.fit==False:
            raise Exception("Fit to train data first")
        
        for param in self.parameters['bin_numer_qcut']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_numer_qcut')
            data[f'{col}_bin_numer_qcut'] = pd.cut(data[col],bins=bin_dummy).astype(str).values
        for param in self.parameters['bin_numer_cut']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_numer_cut')
            data[f'{col}_bin_numer_cut'] = pd.cut(data[col],bins=bin_dummy).astype(str).values
            
        for cols in self.parameters['bin_add_categ_numer_bin_cut']:
            col_add = cols[0] + '_' + cols[1]
            bin_dummy = eval(f'self.{cols[1]}_bin_cut_add_categ')
            data[f'{col_add}_bin_add_categ_numer_bin_cut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
            data[f'{col_add}_bin_add_categ_numer_bin_cut'] = (data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_cut'].astype(str)).values
            
        for cols in self.parameters['bin_add_categ_numer_bin_qcut']:
            col_add = cols[0] + '_' + cols[1]
            bin_dummy = eval(f'self.{cols[1]}_bin_qcut_add_categ')
            data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = pd.cut(data[cols[1]],bins=bin_dummy).values
            data[f'{col_add}_bin_add_categ_numer_bin_qcut'] = (data[cols[0]].astype(str)+'_' + data[f'{col_add}_bin_add_categ_numer_bin_qcut'].astype(str)).values
        
        for param in self.parameters['bin_target_encoding_cut']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_cut')
            data_dummy = eval(f'self.{col}_bin_target_encoding_cut')
            data[f'{col}_bin_target_encoding_cut'] = pd.cut(data[col],bins=bin_dummy).values
            data[f'{col}_bin_target_encoding_cut'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_cut'])[f'{target_encode}'].values
        
        for param in self.parameters['bin_target_encoding_qcut']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_qcut')
            data_dummy = eval(f'self.{col}_bin_target_encoding_qcut')
            data[f'{col}_bin_target_encoding_qcut'] = pd.cut(data[col],bins=bin_dummy).values
            data[f'{col}_bin_target_encoding_qcut'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_qcut'])[f'{target_encode}'].values
        
        for param in self.parameters['bin_target_encoding_custom_bin']:
            col = param[0]
            bin_dummy = eval(f'self.{col}_bin_custom_bin')
            data_dummy = eval(f'self.{col}_bin_target_encoding_custom_bin')
            data[f'{col}_bin_target_encoding_custom_bin'] = pd.cut(data[col],bins=bin_dummy).values
            data[f'{col}_bin_target_encoding_custom_bin'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_bin_target_encoding_custom_bin'])[f'{target_encode}'].values
        
        for param in self.parameters['categorical_mean_encoding']:
            col = param
            data_dummy = eval(f'self.{col}_categorical_mean_encoding')
            data[f'{col}_categorical_mean_encoding'] = data[col].copy().values
            data[f'{col}_categorical_mean_encoding'] = pd.merge(data,data_dummy,how='left',on=[f'{col}_categorical_mean_encoding'])[f'{target_encode}'].values
        
        
        for cols in self.parameters['multiply']:
            data[cols[0] + '_times_' +cols[1]] = (data[cols[0]] * data[cols[1]]).values
        for cols in self.parameters['add']:
            data[cols[0] + '_plus_' +cols[1]] = (data[cols[0]] + data[cols[1]]).values
        for cols in self.parameters['add_str']:
            data[cols[0] + '_plus_' +cols[1]] = (data[cols[0]].astype(str)+'_' + data[cols[1]].astype(str)).values
            
        for cols in self.parameters['substract']:
            data[cols[0] + '_minus_' +cols[1]] = (data[cols[0]] - data[cols[1]]).values
        for cols in self.parameters['divide']:
            data[cols[0] + '_divide_' +cols[1]] = (data[cols[0]] / np.where(data[cols[1]]==0,0.0001,data[cols[1]])).values
        
        
        
        
        return data
    
    

# Data


In [133]:
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()

data = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

data = data.rename(columns={'Last_achievement_%':'Last_achievement','marital_status_maried(Y/N)':'marital_status_maried',
                           'Achievement_above_100%_during3quartal':'Achievement_above_100_during3quartal'})
data_test = data_test.rename(columns={'Last_achievement_%':'Last_achievement','marital_status_maried(Y/N)':'marital_status_maried',
                           'Achievement_above_100%_during3quartal':'Achievement_above_100_during3quartal'})

data['gender_str'] = data['gender'].astype('str')
data_test['gender_str'] = data_test['gender'].astype('str')


data['Achievement_above_100_during3quartal_str'] = data['Achievement_above_100_during3quartal'].astype(str)
data_test['Achievement_above_100_during3quartal_str'] = data_test['Achievement_above_100_during3quartal'].astype(str)

data = data.rename(columns={'annual leave':'annual_leave'})
data_test = data_test.rename(columns={'annual leave':'annual_leave'})

data['person_level_ordinary'] = LE.fit_transform(data['person_level'])
data.drop
data['job_level_ordinary'] = LE.fit_transform(data['job_level'])
data['Education_level_ordinary'] = LE.fit_transform(data['Education_level'])

data_test['person_level_ordinary'] = LE.fit_transform(data_test['person_level'])
data_test['job_level_ordinary'] = LE.fit_transform(data_test['job_level'])
data_test['Education_level_ordinary'] = LE.fit_transform(data_test['Education_level'])



drop = ['age', 'job_duration_from_training', 'person_level_ordinary', 'job_level']


data = data.drop(columns=drop)
data_test = data_test.drop(columns=drop)



In [134]:
# get test data (for final evaluation)
X_train = data.drop(columns=['Best Performance'])
y_train = data['Best Performance']
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)

In [135]:
# len(X_train),len(X_val)

In [136]:
num_cols = list(X_train.select_dtypes(exclude=['object']))
cat_cols = list(X_train.select_dtypes(include=['object']))
features = list(X_train.columns)
len(num_cols),len(cat_cols),len(features)

(16, 6, 22)

# training

In [137]:
parameters = {'multiply':[['job_duration_in_current_job_level','number_of_dependences'],
                         ['job_duration_in_current_job_level','job_rotation'],
                         ['job_duration_in_current_job_level','job_level_ordinary'],
                         ['gender','GPA'],['gender','year_graduated'],['gender','sick_leaves'],['gender','job_level_ordinary'],
                         ['number_of_dependences','year_graduated'],['number_of_dependences','annual_leave'],['number_of_dependences','job_level_ordinary'],
                         ['GPA','branch_rotation'],['year_graduated','job_level_ordinary']],
              'add':[],
              'add_str':[['person_level','Education_level'],['Employee_type','Education_level']],
              'substract':[],
              'divide':[],
              'bin_numer_qcut':[['assign_of_otherposition',10],['Last_achievement',10]],
              'bin_numer_cut':[['Last_achievement',20]],
              'bin_add_categ_numer_bin_qcut':[],
              'bin_add_categ_numer_bin_cut':[],
            'bin_target_encoding_cut':[['job_duration_in_current_job_level',10],['assign_of_otherposition',20]],
             'bin_target_encoding_qcut':[['job_duration_in_current_branch',10]
                                        ],
             'bin_target_encoding_custom_bin':[],
              'categorical_mean_encoding':[],
             'target':'Best Performance'}


In [141]:
cv=5
add_fes,pipelines = fast_build_model_FE(X_train,y_train,cv,
                    Feature_Engineering,parameters,model_base=LogisticRegression(class_weight='balanced',random_state=0,max_iter=3000,C=1),shuffle=True,random_state=3)

Fit iteration 0 done in : 5.138310432434082
Fit iteration 1 done in : 7.602585792541504
Fit iteration 2 done in : 4.447324752807617
Fit iteration 3 done in : 6.385558843612671
Fit iteration 4 done in : 5.9272685050964355
PRec Rec AUC average : [0.87854624 0.1824266 ] [0.60203867 0.51618815] <==> 0.5798468733365636
[0.6027211343252896, 0.58635594632352, 0.5845904412801354, 0.575187093933448, 0.5546047525153428]
0.5945385403244048


In [132]:
pred_proba,_ = fast_predict_FE(data_test.copy(),add_fes,pipelines)

df_submission = pd.DataFrame({'index':data_test.index,'Best Performance':pred_proba})
df_submission


Unnamed: 0,index,Best Performance
0,0,0.537903
1,1,0.666084
2,2,0.770454
3,3,0.555947
4,4,0.702825
...,...,...
5995,5995,0.587593
5996,5996,0.572165
5997,5997,0.537532
5998,5998,0.537482


In [142]:
df_submission.to_csv('answer_submission_1.csv',index=False)