In [1]:
import pandas as pd
import os
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_recall_curve, f1_score
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_excel('EmployeeAttritionPredictionData.xlsx', sheet_name='Data')
df.head()

Unnamed: 0,ID,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,Gender,JobLevel,JobRole,...,MonthlyIncome,NumCompaniesWorked,Overtime,MostRecentPerfEval,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,training
0,100774131,35,No,Travel_Rarely,Research & Development,1,3,Male,1,Research Scientist,...,2859,1,0,6,6,6,4,0,4,0
1,101330756,30,No,Travel_Rarely,Research & Development,8,2,Female,3,Research Director,...,11416,0,1,5,9,8,7,1,7,1
2,101910614,27,Yes,Travel_Frequently,Human Resources,22,3,Female,1,Human Resources,...,2863,1,0,4,1,1,0,0,0,1
3,102548783,39,No,Travel_Rarely,Research & Development,18,2,Male,2,Manufacturing Director,...,4534,0,0,6,9,8,7,1,7,1
4,103053549,36,No,Travel_Rarely,Research & Development,5,2,Male,2,Laboratory Technician,...,5914,8,0,5,16,13,11,3,7,1


In [3]:
for col in df.columns:
    print(col, df[col].dtype)


cat_vars = [  col for col in df.columns if df[col].dtype == 'O']
cat_vars.append('JobLevel')
cat_vars.remove('Attrition')
cat_vars

ID int64
Age int64
Attrition object
BusinessTravel object
Department object
DistanceFromHome int64
Education int64
Gender object
JobLevel int64
JobRole object
MaritalStatus object
MonthlyIncome int64
NumCompaniesWorked int64
Overtime int64
MostRecentPerfEval int64
TotalWorkingYears int64
YearsAtCompany int64
YearsInCurrentRole int64
YearsSinceLastPromotion int64
YearsWithCurrManager int64
training int64


['BusinessTravel',
 'Department',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'JobLevel']

In [4]:
df_final = pd.get_dummies(df, columns = cat_vars, drop_first=True)
df_final.head()

Unnamed: 0,ID,Age,Attrition,DistanceFromHome,Education,MonthlyIncome,NumCompaniesWorked,Overtime,MostRecentPerfEval,TotalWorkingYears,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,JobLevel_2,JobLevel_3,JobLevel_4,JobLevel_5
0,100774131,35,No,1,3,2859,1,0,6,6,...,False,True,False,False,False,True,False,False,False,False
1,101330756,30,No,8,2,11416,0,1,5,9,...,True,False,False,False,True,False,False,True,False,False
2,101910614,27,Yes,22,3,2863,1,0,4,1,...,False,False,False,False,True,False,False,False,False,False
3,102548783,39,No,18,2,4534,0,0,6,9,...,False,False,False,False,False,True,True,False,False,False
4,103053549,36,No,5,2,5914,8,0,5,16,...,False,False,False,False,False,True,True,False,False,False


In [5]:
df_train = df_final[df_final.training == 1]
df_test = df_final[df_final.training == 0]

print(df_train.shape, df_test.shape)

(1170, 34) (300, 34)


In [6]:
df_train.Attrition.value_counts()

Attrition
No     985
Yes    185
Name: count, dtype: int64

In [7]:
df_test.Attrition.value_counts()

Attrition
No     248
Yes     52
Name: count, dtype: int64

In [8]:
df_test_copy = df_test.copy()

In [9]:
df_final.columns

Index(['ID', 'Age', 'Attrition', 'DistanceFromHome', 'Education',
       'MonthlyIncome', 'NumCompaniesWorked', 'Overtime', 'MostRecentPerfEval',
       'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'training',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'Department_Research & Development', 'Department_Sales', 'Gender_Male',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scientist',
       'JobRole_Sales Executive', 'JobRole_Sales Representative',
       'MaritalStatus_Married', 'MaritalStatus_Single', 'JobLevel_2',
       'JobLevel_3', 'JobLevel_4', 'JobLevel_5'],
      dtype='object')

In [10]:
X_train = df_train.drop(columns=['ID', 'Attrition', 'training'], axis=1)
y_train = df_train['Attrition']

X_test = df_test.drop(columns=['ID', 'Attrition', 'training'], axis=1)
y_test = df_test['Attrition']

In [11]:
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)

# Create a dictionary mapping class labels to their respective weights
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
class_weight_dict

{0: 0.5939086294416244, 1: 3.1621621621621623}

In [12]:
class_weight_dict = {'No' :  0.5939086294416244, 'Yes' : 3.1621621621621623}

In [47]:
lr = LogisticRegression(penalty='l2', class_weight=class_weight_dict)
# 
lr.fit(X_train, y_train)

In [48]:
y_pred = lr.predict_proba(X_test)[:,1]

In [49]:
df_test_copy['pred'] = y_pred

In [50]:
pred_df = df_test_copy[['ID', 'Attrition', 'pred']]

In [51]:
pred_df.head()

Unnamed: 0,ID,Attrition,pred
0,100774131,No,0.346211
10,111394395,No,0.322466
14,112858667,No,0.487446
17,113394055,No,0.208048
22,117129049,No,0.094617


In [52]:
pred_df.Attrition.value_counts()

Attrition
No     248
Yes     52
Name: count, dtype: int64

In [53]:
pred_df = pred_df.sort_values(by='pred', ascending=False)

In [54]:
pred_df.head()

Unnamed: 0,ID,Attrition,pred
229,242290168,Yes,0.914259
903,660629240,No,0.857928
592,457372479,No,0.852494
1431,978765582,Yes,0.85025
887,651020637,Yes,0.848291


In [56]:
pred_df.reset_index()

Unnamed: 0,index,ID,Attrition,pred
0,229,242290168,Yes,0.914259
1,903,660629240,No,0.857928
2,592,457372479,No,0.852494
3,1431,978765582,Yes,0.850250
4,887,651020637,Yes,0.848291
...,...,...,...,...
295,707,532353412,No,0.036060
296,218,235609727,No,0.031214
297,267,268229877,No,0.024698
298,974,696742078,No,0.024323


In [57]:
batch_size = 30
cum_true_values = 0
cum_pred_values = 0

decile_list = []
total_true_list = []
cum_true_list = []
base_line_list = []
base_line = 0


for idx in range(0, pred_df.shape[0], batch_size):
    start_idx = idx
    end_idx = min(start_idx + batch_size, pred_df.shape[0])
    batch_df = pred_df.iloc[start_idx:end_idx,:]

    gt = batch_df[batch_df.Attrition == 'Yes'].shape[0]

    base_line+=5.2
    base_line_list.append(base_line)
    
    decile_list.append(idx)
    total_true_list.append(gt)

    cum_true_values+= gt
    cum_true_list.append(cum_true_values)

In [58]:
result = pd.DataFrame(
    {
        "decile" : decile_list,
        "Total GT Churned" : total_true_list,
        "Cum GT Churned" : cum_true_list,
        "Base Line" : base_line_list
    }
)

In [59]:
result

Unnamed: 0,decile,Total GT Churned,Cum GT Churned,Base Line
0,0,17,17,5.2
1,30,6,23,10.4
2,60,4,27,15.6
3,90,5,32,20.8
4,120,4,36,26.0
5,150,2,38,31.2
6,180,2,40,36.4
7,210,6,46,41.6
8,240,4,50,46.8
9,270,2,52,52.0


In [60]:
sum(result['Cum GT Churned'] - result['Base Line'])

75.0