In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score, f1_score
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("../WA_Fn-UseC_-HR-Employee-Attrition.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
monthlyIncomeBins = [0,2000,4000,6000,8000,10000,20000]
monthlyIncomeLabels = [0,1,2,3,4,5]
df['MonthlyIncomeBins'] = pd.cut(df['MonthlyIncome'], bins=monthlyIncomeBins, labels=monthlyIncomeLabels)
df = df.dropna()
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'MonthlyIncomeBins'],
      dtype='object')

In [3]:
# Adding new variables
df['Time_in_each_comp'] = (df['Age']-20) / ((df)['NumCompaniesWorked']+1)
df['TotalSatisfaction_mean'] = (df['RelationshipSatisfaction']  + df['EnvironmentSatisfaction'] + df['JobSatisfaction'] + df['JobInvolvement'] + df['WorkLifeBalance'])/5
df['Income_YearsComp'] = df['MonthlyIncome'] / df['YearsAtCompany']
df['Income_YearsComp'] = df['Income_YearsComp'].replace(np.Inf, 0)
df['Income_Distance'] = df['MonthlyIncome'] / df['DistanceFromHome']
df['Fidelity'] = (df['NumCompaniesWorked']) / df['TotalWorkingYears']
df['Fidelity'] = df['Fidelity'].replace(np.Inf, 0)
df['Stability'] = df['YearsInCurrentRole'] / df['YearsAtCompany']
df['Stability'].fillna((df['Stability'].mean()), inplace=True)
df['Hrate_Mrate'] = df['HourlyRate'] / df['MonthlyRate']
def SalesDpt(df) :
    if df['Department'] == 'Sales':
        return 1
    else:
        return 0
df['SalesDpt'] = df.apply(lambda df:SalesDpt(df) ,axis = 1)
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'MonthlyIncomeBins', 'Time_in_each_comp',
       'TotalSatisfaction_mean', 'Income_YearsComp', 'Income_Distance',
       'Fidelity', 'Stability', 'Hrate_Mrate', 'SalesDpt'],
      dtype='object')

In [4]:
#'Age','Attrition','DistanceFromHome','EnvironmentSatisfaction','Gender','HourlyRate','JobLevel','JobSatisfaction','MaritalStatus',
#'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
#'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager'

features=['Gender','JobLevel','JobSatisfaction','MaritalStatus',
'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager']

features=['Department','JobRole','Gender', 'StockOptionLevel','WorkLifeBalance',
          'Education','JobInvolvement','JobLevel','YearsAtCompany','DistanceFromHome',
          'JobSatisfaction','NumCompaniesWorked','PerformanceRating','MonthlyIncome',
          'YearsInCurrentRole','MaritalStatus']

features=['DistanceFromHome', 'NumCompaniesWorked', 'StockOptionLevel', 'MonthlyIncomeBins', 'YearsAtCompany', 'MaritalStatus', 'YearsInCurrentRole', 'Education', 'Department', 'JobInvolvement']

features = ['OverTime', 'Time_in_each_comp', 'TotalSatisfaction_mean', 'Income_YearsComp', 'Income_Distance',
           'Fidelity', 'DailyRate', 'Stability', 'YearsWithCurrManager', 'StockOptionLevel',
           'Hrate_Mrate', 'HourlyRate', 'MonthlyRate', 'TotalWorkingYears', 'SalesDpt']

features

['OverTime',
 'Time_in_each_comp',
 'TotalSatisfaction_mean',
 'Income_YearsComp',
 'Income_Distance',
 'Fidelity',
 'DailyRate',
 'Stability',
 'YearsWithCurrManager',
 'StockOptionLevel',
 'Hrate_Mrate',
 'HourlyRate',
 'MonthlyRate',
 'TotalWorkingYears',
 'SalesDpt']

In [5]:
#Select features
data=df[features]
data_binary_encoded = pd.get_dummies(data)
X=data_binary_encoded
y=df["Attrition"].replace({'Yes': 1, 'No': 0})
X

Unnamed: 0,Time_in_each_comp,TotalSatisfaction_mean,Income_YearsComp,Income_Distance,Fidelity,DailyRate,Stability,YearsWithCurrManager,StockOptionLevel,Hrate_Mrate,HourlyRate,MonthlyRate,TotalWorkingYears,SalesDpt,OverTime_No,OverTime_Yes
0,2.333333,2.2,998.833333,5993.000000,1.000000,1102,0.666667,5,0,0.004826,94,19479,8,1,0,1
1,14.500000,2.8,513.000000,641.250000,0.100000,279,0.700000,7,1,0.002449,61,24907,10,0,1,0
2,2.428571,2.8,0.000000,1045.000000,0.857143,1373,0.596061,0,0,0.038397,92,2396,7,0,0,1
3,6.500000,3.2,363.625000,969.666667,0.125000,1392,0.875000,0,0,0.002418,56,23159,8,0,0,1
4,0.700000,2.6,1734.000000,1734.000000,1.500000,591,1.000000,2,1,0.002405,40,16632,6,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,3.200000,3.4,514.200000,111.782609,0.235294,884,0.400000,3,1,0.003336,41,12290,17,0,1,0
1466,3.800000,2.2,1427.285714,1665.166667,0.444444,613,1.000000,7,1,0.001957,42,21457,9,0,1,0
1467,3.500000,2.6,1023.666667,1535.500000,0.166667,155,0.333333,3,1,0.016815,87,5174,6,0,0,1
1468,9.666667,2.8,598.888889,2695.000000,0.117647,1023,0.666667,8,0,0.004757,63,13243,17,1,1,0


In [6]:
#Function to create Model
def createModel(X):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    X_scaler = MinMaxScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    model = LogisticRegression()
    model = model.fit(X_train_scaled, y_train)
    trainScore = model.score(X_train_scaled,y_train)
    testScore = model.score(X_test_scaled,y_test)
    test = model.predict(X)
    precisionScore = precision_score(y,test)
    accuracyScore = accuracy_score(y,test)
    f1Score = f1_score(y,test)
    recallScore = recall_score(y,test)
    classRep = classification_report(y,test)
    retList = [trainScore, testScore, precisionScore, accuracyScore, f1Score, recallScore, classRep]
    return retList

createModel(X)

[0.8756805807622504,
 0.8505434782608695,
 0.18639455782312925,
 0.5251700680272109,
 0.28189300411522633,
 0.5780590717299579,
 '              precision    recall  f1-score   support\n\n           0       0.86      0.52      0.65      1233\n           1       0.19      0.58      0.28       237\n\n    accuracy                           0.53      1470\n   macro avg       0.53      0.55      0.46      1470\nweighted avg       0.75      0.53      0.59      1470\n']

In [7]:
# This will create different combinations of features and will iterate through the newModel function. 
minCombinations = 7
newFeatures = []
for i in range(minCombinations, len(features)):
    newFeatures += list(combinations(features, i))

# Uncomment the next line to create a fast example
#newFeatures = list(combinations(features, len(features)-1))
scores = []
counter = 1

for i in newFeatures:
    print(f'Progress {round(counter/len(newFeatures)*100,1)}% {counter}/{len(newFeatures)}')
    counter += 1
    newDF = df.filter(i, axis=1)
    X = pd.get_dummies(newDF)
    retList = createModel(X)
    retList.append(i)
    scores.append(retList)

print("Models processing complete!")

Progress 6.7% 1/15
Progress 13.3% 2/15
Progress 20.0% 3/15
Progress 26.7% 4/15


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Progress 33.3% 5/15
Progress 40.0% 6/15
Progress 46.7% 7/15
Progress 53.3% 8/15
Progress 60.0% 9/15
Progress 66.7% 10/15
Progress 73.3% 11/15
Progress 80.0% 12/15
Progress 86.7% 13/15
Progress 93.3% 14/15
Progress 100.0% 15/15
Models processing complete!


In [8]:
# Converting to Dataframe and Sorting
scoresDF = pd.DataFrame(scores)
scoresDF.columns = ["trainScore", "testScore", "precisionScore","accuracyScore", "f1Score","recallScore", "classificationReport", "Features"]
scoresDF = scoresDF.sort_values(["f1Score"], ascending=False)
scoresDF

Unnamed: 0,trainScore,testScore,precisionScore,accuracyScore,f1Score,recallScore,classificationReport,Features
11,0.874773,0.850543,0.189805,0.44966,0.301984,0.738397,precision recall f1-score ...,"(OverTime, Time_in_each_comp, TotalSatisfactio..."
7,0.874773,0.855978,0.195918,0.534694,0.296296,0.607595,precision recall f1-score ...,"(OverTime, Time_in_each_comp, TotalSatisfactio..."
6,0.875681,0.850543,0.186101,0.47619,0.290976,0.666667,precision recall f1-score ...,"(OverTime, Time_in_each_comp, TotalSatisfactio..."
14,0.862069,0.842391,0.198718,0.582993,0.288037,0.523207,precision recall f1-score ...,"(Time_in_each_comp, TotalSatisfaction_mean, In..."
13,0.875681,0.845109,0.185283,0.512925,0.282565,0.594937,precision recall f1-score ...,"(OverTime, TotalSatisfaction_mean, Income_Year..."
0,0.871143,0.839674,0.179122,0.470748,0.27963,0.637131,precision recall f1-score ...,"(OverTime, Time_in_each_comp, TotalSatisfactio..."
4,0.874773,0.850543,0.179707,0.482313,0.278673,0.620253,precision recall f1-score ...,"(OverTime, Time_in_each_comp, TotalSatisfactio..."
3,0.875681,0.850543,0.1834,0.517007,0.278455,0.578059,precision recall f1-score ...,"(OverTime, Time_in_each_comp, TotalSatisfactio..."
9,0.872958,0.839674,0.180124,0.488435,0.278311,0.611814,precision recall f1-score ...,"(OverTime, Time_in_each_comp, TotalSatisfactio..."
5,0.868421,0.845109,0.178744,0.476871,0.277934,0.624473,precision recall f1-score ...,"(OverTime, Time_in_each_comp, TotalSatisfactio..."


In [9]:
# Uncomment the following line to export dataframe to csv file
scoresDF.to_csv('scoresDF2.csv', index = False, header=True)

In [10]:
# Example of printing out a Classification Report
print(scoresDF.loc[7][7])

('OverTime', 'Time_in_each_comp', 'TotalSatisfaction_mean', 'Income_YearsComp', 'Income_Distance', 'Fidelity', 'DailyRate', 'YearsWithCurrManager', 'StockOptionLevel', 'Hrate_Mrate', 'HourlyRate', 'MonthlyRate', 'TotalWorkingYears', 'SalesDpt')


In [11]:
print(scoresDF.loc[1][7])

('OverTime', 'Time_in_each_comp', 'TotalSatisfaction_mean', 'Income_YearsComp', 'Income_Distance', 'Fidelity', 'DailyRate', 'Stability', 'YearsWithCurrManager', 'StockOptionLevel', 'Hrate_Mrate', 'HourlyRate', 'MonthlyRate', 'SalesDpt')


In [12]:
# Used to calculate amount of combinations
testCombinations = []
for i in range(6, len(features)):
    testCombinations += list(combinations(features, i))

len(testCombinations)

27823

In [13]:
filterDF = pd.read_csv("scoresDF2.csv")
filterDF = filterDF.loc[scoresDF["f1Score"]>.25].loc[scoresDF["precisionScore"]>.2].loc[scoresDF["accuracyScore"]>.2].loc[scoresDF["recallScore"]>.3]
countDict = {} 
for feature in features:
    countDict[feature] = len(filterDF[filterDF['Features'].str.contains(feature)])

sortedDict = sorted(countDict.items(), key=lambda x: x[1], reverse=True)
for i in sortedDict:
    print(i[0], i[1])

OverTime 0
Time_in_each_comp 0
TotalSatisfaction_mean 0
Income_YearsComp 0
Income_Distance 0
Fidelity 0
DailyRate 0
Stability 0
YearsWithCurrManager 0
StockOptionLevel 0
Hrate_Mrate 0
HourlyRate 0
MonthlyRate 0
TotalWorkingYears 0
SalesDpt 0


In [14]:
filterDF

Unnamed: 0,trainScore,testScore,precisionScore,accuracyScore,f1Score,recallScore,classificationReport,Features
