In [259]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from sklearn import preprocessing
import datetime
from functools import reduce


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier

In [323]:
x_train = pd.read_csv("D:/History/AV/attrition_data_hackathon/data/x_train.csv")
x_val = pd.read_csv("D:/History/AV/attrition_data_hackathon/data/x_val.csv")
y_train = pd.read_csv("D:/History/AV/attrition_data_hackathon/data/y_train.csv")
y_val = pd.read_csv("D:/History/AV/attrition_data_hackathon/data/y_val.csv")

In [321]:
y_train['target'].value_counts()

0    947
1    221
Name: target, dtype: int64

In [327]:
# combine x and y train datasets
train_data = pd.concat([x_train, y_train], axis=1)
train_data['target'].value_counts()
# Separate majority and minority classes
df_majority = train_data[train_data["target"]==0]
df_minority = train_data[train_data["target"]==1]
 
# # Downsample majority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,    # sample without replacement 
                                 n_samples=900,   # to match minority class
                                 random_state=123) # reproducible results
 
# # Combine minority class with downsampled majority class
df_upsampled = pd.concat([df_minority_upsampled, df_majority])

# # Display new class counts
# df_upsampled["target"].value_counts()

# Splitting df_downsampled into x and y
target ='target'
x_train = df_upsampled.drop([target],1)
y_train = df_upsampled[target]



In [328]:
# RandomForest classifier

# model development

params = {'n_estimators':[100, 500, 1000],
          'min_samples_split':[ 5, 10, 15],
          'min_samples_leaf':[10,20]}

model = RandomForestClassifier()
rf_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
rf_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'min_samples_leaf': [10, 20],
                                        'min_samples_split': [5, 10, 15],
                                        'n_estimators': [100, 500, 1000]},
                   scoring='roc_auc', verbose=20)

In [329]:
best_estimator = rf_model.best_estimator_
best_estimator

RandomForestClassifier(min_samples_leaf=10, min_samples_split=10,
                       n_estimators=500)

In [330]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset

rf_model = RandomForestClassifier(min_samples_leaf = 10, min_samples_split = 10, n_estimators = 1000)
rf_model.fit(x_train,
             y_train.values.ravel())

predicted_train = rf_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = rf_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.7239966082532504
confusion_matrix: 
 [[164  68]
 [ 26  35]]
classification_report: 
               precision    recall  f1-score   support

           0       0.86      0.71      0.78       232
           1       0.34      0.57      0.43        61

    accuracy                           0.68       293
   macro avg       0.60      0.64      0.60       293
weighted avg       0.75      0.68      0.70       293



In [302]:
# GradientBoostingClassifier classifier

# model development

params = {'n_estimators':[100, 500],
        'learning_rate':[0.1, 0.01],
        'min_samples_leaf':[1,10,100],
        'min_samples_split': [2, 5],
         'tol': [0.001, 0.0001]}

model = GradientBoostingClassifier()
gb_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
gb_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

RandomizedSearchCV(cv=10, estimator=GradientBoostingClassifier(), n_jobs=-1,
                   param_distributions={'learning_rate': [0.1, 0.01],
                                        'min_samples_leaf': [1, 10, 100],
                                        'min_samples_split': [2, 5],
                                        'n_estimators': [100, 500],
                                        'tol': [0.001, 0.0001]},
                   scoring='roc_auc', verbose=20)

In [303]:
gbt_model_best = gb_model.best_estimator_
gbt_model_best

GradientBoostingClassifier(min_samples_leaf=10, min_samples_split=5,
                           n_estimators=500)

In [304]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = gb_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = gb_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.7721170152628604
confusion_matrix: 
 [[203  29]
 [ 35  26]]
classification_report: 
               precision    recall  f1-score   support

           0       0.85      0.88      0.86       232
           1       0.47      0.43      0.45        61

    accuracy                           0.78       293
   macro avg       0.66      0.65      0.66       293
weighted avg       0.77      0.78      0.78       293



In [305]:
feats = {} 
for feature, importance in zip(x_train.columns, gbt_model_best.feature_importances_):
    feats[feature] = importance 

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
feature_importances = importances.sort_values(by='Gini-importance')


In [306]:
feature_importances

Unnamed: 0,Gini-importance
Increment,0.0
Joining Designation,0.005822
Designation,0.011467
promotion,0.016398
avg_quartely_rating,0.035838
Salary_min,0.062539
Salary_max,0.065744
avg_salary,0.065935
total_business_value,0.069786
avg_business_value,0.073407


In [331]:
# Logistic regression on balanced dataset

# Normalize x data 

scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
# x_test_scaled = scaler.transform(x_test)

# Model development

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

params = {'penalty':['l1','l2'],
          'class_weight':['balanced', None],
          'C':np.linspace(0.01,10,100)}
model = LogisticRegression()
lr_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
lr_model.fit(x_train_scaled, 
             y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

RandomizedSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
                   param_distributions={'C': array([ 0.01      ,  0.11090909,  0.21181818,  0.31272727,  0.41363636,
        0.51454545,  0.61545455,  0.71636364,  0.81727273,  0.91818182,
        1.01909091,  1.12      ,  1.22090909,  1.32181818,  1.42272727,
        1.52363636,  1.62454545,  1.72545455,  1.82636364,  1.92727273,
        2.02818182,  2.12909091,  2.23      ,  2.33090909,  2.4318...
        7.07363636,  7.17454545,  7.27545455,  7.37636364,  7.47727273,
        7.57818182,  7.67909091,  7.78      ,  7.88090909,  7.98181818,
        8.08272727,  8.18363636,  8.28454545,  8.38545455,  8.48636364,
        8.58727273,  8.68818182,  8.78909091,  8.89      ,  8.99090909,
        9.09181818,  9.19272727,  9.29363636,  9.39454545,  9.49545455,
        9.59636364,  9.69727273,  9.79818182,  9.89909091, 10.        ]),
                                        'class_weight': ['balanced', None],
                  

In [332]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = lr_model.predict_proba(x_train_scaled)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = lr_model.predict_proba(x_val_scaled)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:",roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.6823770491803278
confusion_matrix: 
 [[ 72 160]
 [  5  56]]
classification_report: 
               precision    recall  f1-score   support

           0       0.94      0.31      0.47       232
           1       0.26      0.92      0.40        61

    accuracy                           0.44       293
   macro avg       0.60      0.61      0.44       293
weighted avg       0.79      0.44      0.45       293



In [333]:
from tpot import TPOTClassifier
GENERATIONS = 3
POP_SIZE = 100
CV = 5

tpot = TPOTClassifier(
    generations=GENERATIONS,
    population_size=POP_SIZE,
    random_state=123,
    n_jobs=10,
    cv=CV,
    verbosity=2,
)

tpot.fit(x_train,
             y_train.values.ravel())


Optimization Progress:   0%|          | 0/400 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9550516370028564

Generation 2 - Current best internal CV score: 0.9550516370028564

Generation 3 - Current best internal CV score: 0.9550516370028564

Best pipeline: RandomForestClassifier(RBFSampler(input_matrix, gamma=0.8500000000000001), bootstrap=True, criterion=gini, max_features=0.05, min_samples_leaf=1, min_samples_split=2, n_estimators=100)


TPOTClassifier(generations=3, n_jobs=10, random_state=123, verbosity=2)

In [334]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = tpot.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = tpot.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.602211701526286
confusion_matrix: 
 [[ 76 156]
 [ 19  42]]
classification_report: 
               precision    recall  f1-score   support

           0       0.80      0.33      0.46       232
           1       0.21      0.69      0.32        61

    accuracy                           0.40       293
   macro avg       0.51      0.51      0.39       293
weighted avg       0.68      0.40      0.44       293



In [292]:
estimators = [('gb', gb_model),
              ('rf', rf_model)]
final_estimator = lr_model
              
stacking_model = StackingClassifier(estimators = estimators, 
                                  final_estimator = final_estimator)
stacking_model.fit(x_train,
                   y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1999s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  20 tas

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1830s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  22 tasks      | elap

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1556s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  22 tasks      | elap

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.9s
[Paralle

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.9s
[Paralle

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0129s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0170s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jo

StackingClassifier(estimators=[('gb',
                                RandomizedSearchCV(cv=10,
                                                   estimator=GradientBoostingClassifier(),
                                                   n_jobs=-1,
                                                   param_distributions={'learning_rate': [0.1,
                                                                                          0.01],
                                                                        'min_samples_leaf': [1,
                                                                                             10,
                                                                                             100],
                                                                        'min_samples_split': [2,
                                                                                              5],
                                                                        

In [293]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = stacking_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = stacking_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.8997309855710443
confusion_matrix: 
 [[114  31]
 [ 24 117]]
classification_report: 
               precision    recall  f1-score   support

           0       0.83      0.79      0.81       145
           1       0.79      0.83      0.81       141

    accuracy                           0.81       286
   macro avg       0.81      0.81      0.81       286
weighted avg       0.81      0.81      0.81       286



In [105]:
# Predictions

In [274]:
def data_preprocess(df, start_date, cut_off_date):
    '''
    inputs:
    dataset to be processed
    start_date: the start date of the 'LastWorkingDate' 
    cut_off_date: the end date of the training dataset 
    last_date: end date of validation/test dataset
    
    output:
    data: a merge of several dataframes engineered from raw dataset
    1. df_demograph: dataframe of unique values for demographic data
    2. df_salary_change: dataframe engineered from 'Salary'to derive the 'increment' column
    3. df_promotion: dataframe engineered from 'Joining Designation' and 'Designation' to derive 'Promotion'
    4. df_total: dataframe engineered to derive the total business-value of the employee
    5. df_average: dataframe engineered to derive the total business-value of the employee
    6. df_salary_change:
    7. df_reporting:                                                                    
    8. df_target:                                                                   
                                                                        
    
    
    Promotion forms a significant part in establishing job performance for
    the employer and job satisfaction for the employee. Also if any increase in the designation was calculated based
    on joining designation and the designation.
    4. df_total: A part of Feature Engineering. It is the total business value generated by the hardworking employee
    throughout his tenure or the cutoff date
    5. df_average:  A part of Feature Engineering. average_business_value, average_salary and average_quarterly_rating 
    generated by the employee throughout his tenure or the cutoff date
    6. df_reporting: A part of Feature Engineering. Total reporting count of each employee
    7. df_working_days: A part of Feature Engineering. Total number of employment days for each employee. If the employee
    has last working day, then the days were counted from joining date to last working day else joining date to cutoff date
    
    Target varibale is created using cutoff date, to prepare the training data, cutoff date will be 1 July 2017 that is
    all employees who had attrition from 1 July 2017 to 31 Dec 2017 will be labeled as 1. Those who have not yet given
    their resignation till 1 July 2017 will be tagged as 0. 
       
    
    '''
    # Filtering data suitable for creating training data
    df = df[(df['MMM-YY']>start_date) & (df['MMM-YY']<cut_off_date)]    
    df = df[(df['Dateofjoining']<cut_off_date)]  
    
    # Demographic dataset creation for each employee
    df_demograph = df[['Emp_ID', 
                   'Age', 
                   "Gender", 
                   "City", 
                   "Education_Level"]].groupby('Emp_ID').max().reset_index()
    
    # Feature Engineer - min_salary, max_salary and increment
    df_salary_min = df.groupby('Emp_ID').min()['Salary'].reset_index()
    df_salary_min.columns =['Emp_ID', 
                            'Salary_min']
    df_salary_max = df.groupby('Emp_ID').max()['Salary'].reset_index()
    df_salary_max.columns =['Emp_ID', 
                            'Salary_max']    
    df_salary_change = pd.merge(df_salary_min, 
                                df_salary_max, 
                                how = 'inner', 
                                on = 'Emp_ID')
    df_salary_change['Increment'] = ((df_salary_change['Salary_max'] - df_salary_change['Salary_min'])/df_salary_change['Salary_min'] *100).astype(int)
    
    # Feature Engineering - Promotion if there is any increase in the designation. The more the number, higher the promotion
    df_promotion = df
    df_promotion["promotion"] = np.where(df_promotion['Joining Designation']==df_promotion['Designation'], 0, 
                                     df_promotion['Designation'] - df_promotion['Joining Designation'])
    df_promotion = df_promotion[['Emp_ID',
                                 'Joining Designation', 
                                 'Designation', 
                                 'promotion']].groupby("Emp_ID").max().reset_index()
    
    # Feature Engineering - total business value generated by the employee throughout his tenure or the cutoff date
    df_total = df.groupby('Emp_ID').sum()
    df_total = df_total["Total Business Value"].reset_index()
    df_total = df_total.set_axis(['Emp_ID', 
                                  'total_business_value'], 
                                 axis=1, 
                                 inplace=False)
    
    # Feature Engineering - average business value, average salary and average quarterly rating generated by the 
    #employee throughout his tenure or the cutoff date
    
    df_average = df.groupby('Emp_ID').mean()
    df_average = df_average[['Salary', 
                             'Total Business Value', 
                             'Quarterly Rating']].reset_index()
    df_average = df_average.set_axis(['Emp_ID', 
                                      'avg_salary', 
                                      'avg_business_value', 
                                      'avg_quartely_rating'], 
                                     axis=1, 
                                     inplace=False)
    
    # Feature Engineering - total reporting count by each employee
    df_reporting = df.groupby(['Emp_ID']).count()[['MMM-YY']].reset_index()
    df_reporting.columns = ["Emp_ID", 
                            "total_reportings"]
    
    # Feature Engineering - total working days by each employee
    df_working_days = df.groupby(['Emp_ID']).max().reset_index()
    df_working_days['number_employment_days'] = np.where(df_working_days['LastWorkingDate']<cut_off_date, 
                                                          df_working_days['LastWorkingDate'] - df_working_days["Dateofjoining"],
                                                          cut_off_date - df_working_days["Dateofjoining"])
    df_working_days = df_working_days[['Emp_ID',  
                                       "number_employment_days"]]
    
    df_target = df
    df_target['target'] = np.where(df['LastWorkingDate']<cut_off_date, 1, 0)
#     df_target['churn'] = np.where(df['LastWorkingDate']<last_date, 1, 0)
    df_target = df_target[['Emp_ID',
                            'Dateofjoining',
                            'LastWorkingDate',
                            'target']]
    df_target = df_target.groupby('Emp_ID').max()
    
    data = reduce(lambda x,y: pd.merge(x,y, on='Emp_ID', how='inner'), [df_demograph, 
                                                                        df_average, 
                                                                        df_working_days, 
                                                                        df_promotion, 
                                                                        df_salary_change, 
                                                                        df_total, 
                                                                        df_reporting, 
                                                                        df_target])    
    data["business_value_index"] = round(data["total_business_value"]/data["number_employment_days"].dt.days.astype('int16'),2)
    return data

In [275]:
df = pd.read_csv('D:/History/AV/attrition_data_hackathon/data/train_MpHjUjU.csv')
df[['MMM-YY','Dateofjoining', 'LastWorkingDate']] = df[['MMM-YY',
                                                        'Dateofjoining', 
                                                        'LastWorkingDate']].apply(pd.to_datetime, 
                                                                                  format='%Y-%m-%d')
start_date = np.datetime64(datetime.datetime(2017,1,1))
cut_off_date = np.datetime64(datetime.datetime(2018,1,1))

In [276]:
test_data = pd.read_csv('D:/History/AV/attrition_data_hackathon/data/test_hXY9mYw.csv')
# test_data.head()

In [277]:
df = df.merge(test_data, how = 'inner', on = 'Emp_ID')

In [278]:
data = data_preprocess(df, start_date, cut_off_date)

In [218]:
#data = pd.get_dummies(data, columns = ["City"])


In [279]:
data['number_employment_days'] = data['number_employment_days'].dt.days

In [254]:
data.drop(['Age', 'Gender', 'Education_Level', 'Dateofjoining', 'LastWorkingDate', 'target', 'City'], axis = 1, inplace = True)

In [280]:
data['business_value_index'] = data['business_value_index'].fillna(0)

In [283]:
data = data[['Emp_ID', 'number_employment_days', 'Designation', 'promotion', 'Increment', 'business_value_index']]

In [294]:
target='target'
primary_key = "Emp_ID"

x_pk_predict = data[primary_key]
x_predict = data.drop([primary_key],1)

predicted = stacking_model.predict(x_predict)
predict_df = pd.DataFrame(predicted, 
                          columns = ["Target"])

output_dataframe = pd.merge(x_pk_predict, 
                            predict_df, 
                            how="left", 
                            left_index=True, 
                            right_index=True)



In [295]:
output_dataframe.to_csv('D:/History/AV/attrition_data_hackathon/output/prediction10.csv')