In [1]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)

In [2]:
# Load the training sample (contains the is_promoted target) and the test set (no target).
train = pd.read_csv('Train_Sample.csv')
test=pd.read_csv('HR test.csv')


In [3]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [4]:
# Preview the first rows of the test data.
test.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [5]:
# (rows, columns) of the training data.
train.shape

(54808, 14)

In [6]:
# (rows, columns) of the test data.
test.shape

(23490, 13)

In [7]:
# Number of missing values in each training column.
train.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [8]:
# Number of missing values in each test column.
test.isnull().sum()

employee_id                0
department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    1812
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64

In [9]:
# Fraction of missing values in each training column.
train.isnull().mean()

employee_id             0.000000
department              0.000000
region                  0.000000
education               0.043953
gender                  0.000000
recruitment_channel     0.000000
no_of_trainings         0.000000
age                     0.000000
previous_year_rating    0.075244
length_of_service       0.000000
KPIs_met >80%           0.000000
awards_won?             0.000000
avg_training_score      0.000000
is_promoted             0.000000
dtype: float64

In [10]:
# Fraction of missing values in each test column.
test.isnull().mean()

employee_id             0.000000
department              0.000000
region                  0.000000
education               0.044019
gender                  0.000000
recruitment_channel     0.000000
no_of_trainings         0.000000
age                     0.000000
previous_year_rating    0.077139
length_of_service       0.000000
KPIs_met >80%           0.000000
awards_won?             0.000000
avg_training_score      0.000000
dtype: float64

In [11]:
# Distinct education levels in the training data (NaN is reported as its own entry).
train['education'].unique()

array(["Master's & above", "Bachelor's", nan, 'Below Secondary'],
      dtype=object)

In [12]:
# Distinct education levels in the test data (NaN is reported as its own entry).
test['education'].unique()

array(["Bachelor's", "Master's & above", nan, 'Below Secondary'],
      dtype=object)

In [13]:
# Distinct previous-year ratings in the training data (NaN is reported as its own entry).
train['previous_year_rating'].unique()

array([ 5.,  3.,  1.,  4., nan,  2.])

In [14]:
# Distinct previous-year ratings in the test data (NaN is reported as its own entry).
test['previous_year_rating'].unique()

array([nan,  3.,  1.,  2.,  4.,  5.])

In [15]:
# Count rows per education level, separately for each value of is_promoted.
# Group by the bare column name rather than a one-element list: with a list key,
# pandas >= 2.0 yields 1-tuples such as (0,) as the group key, which would change
# the printed label. count() ignores NaN, so missing education values are not listed.
for k, df in train.groupby('is_promoted'):
    print("is_promoted: ", k)
    print(df.groupby('education')['education'].count())

is_promoted:  0
education
Bachelor's          33661
Below Secondary       738
Master's & above    13454
Name: education, dtype: int64
is_promoted:  1
education
Bachelor's          3008
Below Secondary       67
Master's & above    1471
Name: education, dtype: int64


In [16]:
# Count rows per previous_year_rating, separately for each value of is_promoted.
# Group by the bare column name rather than a one-element list: with a list key,
# pandas >= 2.0 yields 1-tuples such as (0,) as the group key, which would change
# the printed label. count() ignores NaN, so missing ratings are not listed.
for k, df in train.groupby('is_promoted'):
    print("is_promoted: ", k)
    print(df.groupby('previous_year_rating')['previous_year_rating'].count())

is_promoted:  0
previous_year_rating
1.0     6135
2.0     4044
3.0    17263
4.0     9093
5.0     9820
Name: previous_year_rating, dtype: int64
is_promoted:  1
previous_year_rating
1.0      88
2.0     181
3.0    1355
4.0     784
5.0    1921
Name: previous_year_rating, dtype: int64


In [17]:
def impute_na(df_train, df_test, variable):
    """Fill missing values of ``variable`` with the training set's most frequent value.

    The most frequent value is computed from ``df_train`` only and is then used to
    fill both frames, so no statistic is derived from the test data.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that both contain ``variable``. Both are modified in place.
    variable : str
        Name of the column to impute.

    Raises
    ------
    IndexError
        If ``variable`` has no non-null value in ``df_train``.
    """
    # groupby drops NaN keys, so the counts cover observed values only.
    most_frequent_category = (
        df_train.groupby(variable)[variable]
        .count()
        .sort_values(ascending=False)
        .index[0]
    )
    # Assign the filled column back instead of calling fillna(inplace=True) on the
    # selected column: that form mutates an intermediate object, triggers a
    # chained-assignment warning on recent pandas, and leaves the frame unchanged
    # when copy-on-write is enabled.
    df_train[variable] = df_train[variable].fillna(most_frequent_category)
    df_test[variable] = df_test[variable].fillna(most_frequent_category)

In [18]:
# Fill missing education values in both frames with the most frequent training value.
impute_na(train, test, 'education')

In [19]:
# Fill missing previous_year_rating values in both frames with the most frequent training value.
impute_na(train, test, 'previous_year_rating')

In [20]:
# Re-check missing values in the training data after imputation.
train.isnull().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

In [21]:
# Re-check missing values in the test data after imputation.
test.isnull().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
dtype: int64

In [22]:
# Print the number of distinct values in every training column.
for col in train.columns:
    print("Unique values in ",col," : ",train[col].nunique()) 

Unique values in  employee_id  :  54808
Unique values in  department  :  9
Unique values in  region  :  34
Unique values in  education  :  3
Unique values in  gender  :  2
Unique values in  recruitment_channel  :  3
Unique values in  no_of_trainings  :  10
Unique values in  age  :  41
Unique values in  previous_year_rating  :  5
Unique values in  length_of_service  :  35
Unique values in  KPIs_met >80%  :  2
Unique values in  awards_won?  :  2
Unique values in  avg_training_score  :  61
Unique values in  is_promoted  :  2


In [23]:
# Print the number of distinct values in every test column.
for col in test.columns:
    print("Unique values in ",col," : ",test[col].nunique())

Unique values in  employee_id  :  23490
Unique values in  department  :  9
Unique values in  region  :  34
Unique values in  education  :  3
Unique values in  gender  :  2
Unique values in  recruitment_channel  :  3
Unique values in  no_of_trainings  :  9
Unique values in  age  :  41
Unique values in  previous_year_rating  :  5
Unique values in  length_of_service  :  34
Unique values in  KPIs_met >80%  :  2
Unique values in  awards_won?  :  2
Unique values in  avg_training_score  :  61


In [24]:
# Summary statistics for the numeric training columns.
train.describe()

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
count,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,39195.830627,1.253011,34.803915,3.304481,5.865512,0.351974,0.023172,63.38675,0.08517
std,22586.581449,0.609264,7.660169,1.21477,4.265094,0.47759,0.15045,13.371559,0.279137
min,1.0,1.0,20.0,1.0,1.0,0.0,0.0,39.0,0.0
25%,19669.75,1.0,29.0,3.0,3.0,0.0,0.0,51.0,0.0
50%,39225.5,1.0,33.0,3.0,5.0,0.0,0.0,60.0,0.0
75%,58730.5,1.0,39.0,4.0,7.0,1.0,0.0,76.0,0.0
max,78298.0,10.0,60.0,5.0,37.0,1.0,1.0,99.0,1.0


In [25]:
# Summary statistics for the numeric test columns.
test.describe()

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
count,23490.0,23490.0,23490.0,23490.0,23490.0,23490.0,23490.0,23490.0
mean,39041.399149,1.254236,34.782929,3.312984,5.810387,0.358834,0.022776,63.263133
std,22640.809201,0.60091,7.679492,1.216959,4.207917,0.479668,0.149191,13.41175
min,3.0,1.0,20.0,1.0,1.0,0.0,0.0,39.0
25%,19370.25,1.0,29.0,3.0,3.0,0.0,0.0,51.0
50%,38963.5,1.0,33.0,3.0,5.0,0.0,0.0,60.0
75%,58690.0,1.0,39.0,4.0,7.0,1.0,0.0,76.0
max,78295.0,9.0,60.0,5.0,34.0,1.0,1.0,99.0


In [26]:
# Work on a copy without the identifier column; `train` itself is left untouched.
train_df = train.drop(columns=['employee_id'])

In [27]:
# Work on a copy without the identifier column; `test` keeps employee_id for the submission.
test_df = test.drop(columns=['employee_id'])

In [28]:
# Columns kept as raw numeric features; every other feature column is encoded by cat_enc.
num_vars = ['age','length_of_service','avg_training_score']

In [29]:
# All remaining feature columns are treated as categorical. The list is built from
# test_df, which has no is_promoted column, so the target is never included.
cat_vars = [col for col in test_df.columns if col not in num_vars]

In [30]:
def cat_enc(df_train,df_test,variable,target):
    """Replace each category of ``variable`` with its log-odds of ``target``.

    For every category observed in ``df_train`` the encoding is
    ``log(p / (1 - p))`` where ``p`` is the mean of ``target`` within that
    category. A ``p`` (or ``1 - p``) of exactly 0 is replaced by ``0.00001`` so
    the logarithm stays finite. The mapping is learned from ``df_train`` only and
    applied to both frames.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing ``variable`` and ``target``; ``variable`` is overwritten
        in place with the encoded values.
    df_test : pandas.DataFrame
        Frame containing ``variable``; overwritten in place with the same mapping.
    variable : str
        Name of the categorical column to encode.
    target : str
        Name of the binary (0/1) target column in ``df_train``.

    Notes
    -----
    Categories that occur in ``df_test`` but not in ``df_train`` have no learned
    value. Instead of leaving them as NaN (which would propagate into any model
    fed with the frame), they receive the log-odds of the overall training
    target rate. Values that were already missing in ``df_test`` stay NaN.
    """
    floor = 0.00001

    # Per-category event rate and its complement; the complement is taken before
    # the zero-floor is applied to the event rate.
    event_rate = df_train.groupby(variable)[target].mean()
    non_event_rate = 1 - event_rate
    event_rate = event_rate.where(event_rate != 0, floor)
    non_event_rate = non_event_rate.where(non_event_rate != 0, floor)

    woe_labels = np.log(event_rate / non_event_rate).to_dict()

    # Overall log-odds used as the neutral fallback for unseen test categories.
    overall_rate = min(max(df_train[target].mean(), floor), 1 - floor)
    fallback = np.log(overall_rate / (1 - overall_rate))

    raw_test = df_test[variable]
    encoded_test = raw_test.map(woe_labels)
    # Unmapped but originally non-missing values are the unseen categories.
    unseen = encoded_test.isnull() & raw_test.notnull()
    if unseen.any():
        encoded_test.loc[unseen] = fallback

    df_train[variable] = df_train[variable].map(woe_labels)
    df_test[variable] = encoded_test

In [31]:
# Encode every categorical column of train_df and test_df in place, using the
# per-category statistics of is_promoted learned from train_df.
for col in cat_vars:
    cat_enc(train_df,test_df,col,'is_promoted')

In [32]:
# Inspect the training frame after encoding.
train_df.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,-2.555904,-2.126523,-2.213334,-2.314444,-2.376107,-2.336974,35,-1.631575,8,-1.592059,-2.487358,49,0
1,-2.311829,-2.048659,-2.44104,-2.400278,-2.389825,-2.336974,30,-1.631575,4,-3.188871,-2.487358,60,0
2,-2.555904,-2.740231,-2.44104,-2.400278,-2.376107,-2.336974,34,-2.519713,7,-3.188871,-2.487358,50,0
3,-2.555904,-2.02507,-2.44104,-2.400278,-2.389825,-2.501571,39,-4.244429,10,-3.188871,-2.487358,50,0
4,-2.115565,-2.694911,-2.44104,-2.400278,-2.389825,-2.336974,45,-2.519713,2,-3.188871,-2.487358,73,0


In [33]:
# Split the encoded training frame into a feature matrix and a target vector (NumPy arrays).
y = train_df['is_promoted'].values
X = train_df.drop(columns=['is_promoted']).values

In [34]:
# Oversample with SMOTE so both classes are equally represented in the resampled data.
sm = SMOTE(random_state=101)
# fit_resample is the supported entry point; fit_sample was only a deprecated
# alias for it and no longer exists in current imbalanced-learn releases.
X_res, y_res = sm.fit_resample(X, y.ravel())

In [35]:
# Hold out 30% of the resampled data for validation (fixed seed for reproducibility).
# NOTE(review): the split is taken from X_res/y_res, i.e. after SMOTE, so synthetic
# samples interpolated from training rows can end up in the validation part.
# Validation scores are therefore likely optimistic — consider splitting before
# oversampling and confirm the impact on the metrics.
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, 
                                                    test_size=0.3, 
                                                    random_state=101)

In [36]:
# Importing the Keras libraries and packages
import keras
from keras.utils import plot_model
from keras.models import Model,Sequential,load_model
from keras.layers import Input, Flatten, Dense, Dropout
from keras.layers.merge import concatenate
from keras import backend as K
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [37]:
def f1(y_true, y_pred):
    """Batch-wise F1 score expressed with Keras backend ops.

    Precision and recall are computed on the tensors of the current batch only,
    after clipping values to [0, 1] and rounding them to hard 0/1 decisions, and
    are then combined into their harmonic mean. ``K.epsilon()`` is added to each
    denominator to avoid division by zero.

    Parameters
    ----------
    y_true : tensor
        Ground-truth labels.
    y_pred : tensor
        Predicted scores.

    Returns
    -------
    tensor
        Scalar F1 value for the batch.
    """
    def _count_positives(tensor):
        # Clip to [0, 1], round to 0/1 and count the ones.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    true_positives = _count_positives(y_true * y_pred)

    # Recall: share of actual positives that were predicted positive.
    recall = true_positives / (_count_positives(y_true) + K.epsilon())
    # Precision: share of predicted positives that are actual positives.
    precision = true_positives / (_count_positives(y_pred) + K.epsilon())

    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [38]:
import tensorflow as tf
from sklearn.metrics import roc_auc_score

def auroc(y_true, y_pred):
    """Keras metric: ROC AUC of the current batch via scikit-learn.

    Wraps ``roc_auc_score`` in ``tf.py_func`` so it can be evaluated inside the
    TensorFlow graph; the result is returned as a ``tf.double`` tensor.

    NOTE(review): the score is computed per batch, and roc_auc_score is undefined
    when a batch holds a single class — confirm behaviour for small final batches.
    """
    return tf.py_func(func=roc_auc_score, inp=[y_true, y_pred], Tout=tf.double)

In [39]:
# Number of feature columns; used as the width of the network's input layer.
input_shape = X_train.shape[1]


In [40]:
# Network: input -> Dense(n_features, relu) -> Dense(6, relu) -> Dropout(0.2) -> Dense(1, sigmoid)
input_layer = Input(shape=(input_shape,))

hidden1 = Dense(units=input_shape,
                kernel_initializer='uniform',
                activation='relu')(input_layer)
hidden2 = Dense(units=6,
                kernel_initializer='uniform',
                activation='relu')(hidden1)
dropout = Dropout(rate=0.2)(hidden2)
# Single sigmoid unit: the output is the predicted probability of the positive class.
output_layer = Dense(units=1,
                     kernel_initializer='uniform',
                     activation='sigmoid')(dropout)

model = Model(inputs=input_layer, outputs=output_layer)

# Compiling the ANN; the custom auroc metric is logged as `auroc` / `val_auroc`.
model.compile(optimizer='adamax',
              loss='binary_crossentropy',
              metrics=[auroc])

# summarize layers — summary() prints the table itself and returns None, so
# wrapping it in print() would only add a stray "None" line to the output.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_function, which takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 12)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 12)                156       
_________________________________________________

In [41]:
# checkpoint
# checkpoint: write the model to disk only when the validation auroc improves
# (mode='max'), so Best_Model.h5 ends up holding the best epoch.
filepath="Best_Model.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_auroc', verbose=1, 
                             save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [42]:
# Train for 20 epochs, evaluating on the held-out split after each epoch; the
# checkpoint callback saves the best epoch to disk.
model.fit(X_train, 
          y_train,
          batch_size = 32,
          epochs = 20,
          validation_data=(X_test, y_test),
          callbacks=callbacks_list, 
          verbose=2)

Instructions for updating:
Use tf.cast instead.
Train on 70196 samples, validate on 30084 samples
Epoch 1/20
 - 24s - loss: 0.5951 - auroc: 0.7409 - val_loss: 0.5146 - val_auroc: 0.7933

Epoch 00001: val_auroc improved from -inf to 0.79330, saving model to Best_Model.h5
Epoch 2/20
 - 20s - loss: 0.5135 - auroc: 0.7903 - val_loss: 0.5053 - val_auroc: 0.7968

Epoch 00002: val_auroc improved from 0.79330 to 0.79680, saving model to Best_Model.h5
Epoch 3/20
 - 20s - loss: 0.5049 - auroc: 0.7927 - val_loss: 0.4929 - val_auroc: 0.7993

Epoch 00003: val_auroc improved from 0.79680 to 0.79927, saving model to Best_Model.h5
Epoch 4/20
 - 25s - loss: 0.4975 - auroc: 0.7958 - val_loss: 0.4842 - val_auroc: 0.8014

Epoch 00004: val_auroc improved from 0.79927 to 0.80142, saving model to Best_Model.h5
Epoch 5/20
 - 23s - loss: 0.4953 - auroc: 0.7970 - val_loss: 0.4802 - val_auroc: 0.8018

Epoch 00005: val_auroc improved from 0.80142 to 0.80181, saving model to Best_Model.h5
Epoch 6/20
 - 29s - loss:

<keras.callbacks.History at 0x15853d28748>

In [43]:
# Rebuild the same architecture as the trained network so the checkpointed
# weights can be loaded into it. The model is left uncompiled: it is only used
# for inference from here on.
input_layer = Input(shape=(input_shape,))

hidden1 = Dense(
    units=input_shape,
    kernel_initializer='uniform',
    activation='relu',
)(input_layer)
hidden2 = Dense(
    units=6,
    kernel_initializer='uniform',
    activation='relu',
)(hidden1)
dropout = Dropout(rate=0.2)(hidden2)
output_layer = Dense(
    units=1,
    kernel_initializer='uniform',
    activation='sigmoid',
)(dropout)

model = Model(inputs=input_layer, outputs=output_layer)

In [44]:
# Load the weights stored in Best_Model.h5, the file written by the checkpoint callback.
model.load_weights('Best_Model.h5')

In [45]:
# Inspect the encoded test frame before predicting.
test_df.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,-2.115565,-2.694911,-2.44104,-2.400278,-2.376107,-2.336974,24,-2.519713,1,-1.592059,-2.487358,77
1,-2.820153,-1.778803,-2.44104,-2.314444,-2.389825,-2.336974,31,-2.519713,5,-3.188871,-2.487358,51
2,-2.555904,-2.352617,-2.44104,-2.400278,-2.389825,-2.336974,31,-4.244429,4,-3.188871,-2.487358,47
3,-2.238047,-2.440631,-2.44104,-2.314444,-2.389825,-2.606931,31,-3.106493,9,-3.188871,-2.487358,65
4,-2.425747,-3.096314,-2.44104,-2.400278,-2.376107,-2.336974,30,-2.450851,7,-3.188871,-2.487358,61


In [51]:
# Turn the model's sigmoid outputs into hard labels: below 0.5 -> 0, otherwise 1.
y_predictions = np.where(model.predict(test_df.values)<0.5,0,1)

In [52]:
# Pair every test employee_id with its predicted label for the submission file.
# The predicted labels are held in `y_predictions`; no `y_preds` variable is
# defined in this notebook, so referencing it fails on a fresh kernel.
submission_df = pd.concat([test['employee_id'],
                           pd.DataFrame(y_predictions, columns=['is_promoted'])],
                          axis=1)

In [53]:
# Write the submission to CSV without the DataFrame index column.
submission_df.to_csv(r'HR-Analytics submission.csv', index=False)