In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, recall_score
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [12]:
df = pd.read_csv("../WA_Fn-UseC_-HR-Employee-Attrition.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [3]:
#'Age','Attrition','DistanceFromHome','EnvironmentSatisfaction','Gender','HourlyRate','JobLevel','JobSatisfaction','MaritalStatus',
#'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
#'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager'

features=['Gender','JobLevel','JobSatisfaction','MaritalStatus',
'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager']

features=['Department','JobRole','Gender', 'StockOptionLevel','WorkLifeBalance',
          'Education','JobInvolvement','JobLevel','YearsAtCompany','DistanceFromHome',
          'JobSatisfaction','NumCompaniesWorked','PerformanceRating',
          'MaritalStatus']

#features=['Department', 'JobRole', 'Education', 'NumCompaniesWorked', 'MaritalStatus']

features

['Department',
 'JobRole',
 'Gender',
 'StockOptionLevel',
 'WorkLifeBalance',
 'Education',
 'JobInvolvement',
 'JobLevel',
 'YearsAtCompany',
 'DistanceFromHome',
 'JobSatisfaction',
 'NumCompaniesWorked',
 'PerformanceRating',
 'MaritalStatus']

In [4]:
#Select features
data=df[features]
data_binary_encoded = pd.get_dummies(data)
X=data_binary_encoded
y=df["Attrition"].replace({'Yes': 1, 'No': 0})
X

Unnamed: 0,StockOptionLevel,WorkLifeBalance,Education,JobInvolvement,JobLevel,YearsAtCompany,DistanceFromHome,JobSatisfaction,NumCompaniesWorked,PerformanceRating,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,Gender_Female,Gender_Male,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,0,1,2,3,2,6,1,4,8,3,...,0,0,0,1,0,1,0,0,0,1
1,1,3,1,2,2,10,8,2,1,4,...,0,0,1,0,0,0,1,0,1,0
2,0,3,2,2,1,0,2,3,6,3,...,0,0,0,0,0,0,1,0,0,1
3,0,3,4,3,1,8,3,3,1,3,...,0,0,1,0,0,1,0,0,1,0
4,1,3,1,3,1,2,2,2,9,3,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,1,3,2,4,2,5,23,4,4,3,...,0,0,0,0,0,0,1,0,1,0
1466,1,3,1,2,3,7,6,1,4,3,...,0,0,0,0,0,0,1,0,1,0
1467,1,3,3,4,2,6,4,2,1,4,...,1,0,0,0,0,0,1,0,1,0
1468,0,2,3,2,2,9,2,2,2,3,...,0,0,0,1,0,0,1,0,1,0


In [5]:
#Function to create Model
def createModel(X):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    X_scaler = MinMaxScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    model = LogisticRegression()
    model = model.fit(X_train_scaled, y_train)
    trainScore = model.score(X_train_scaled,y_train)
    testScore = model.score(X_test_scaled,y_test)
    test=model.predict(X)
    cr = classification_report(y,test)
    rs = recall_score(y,test)
    retList = [trainScore, testScore, rs, cr]
    return retList

createModel(X)

[0.8584392014519057,
 0.8070652173913043,
 0.4345991561181435,
 '              precision    recall  f1-score   support\n\n           0       0.88      0.77      0.82      1233\n           1       0.26      0.43      0.33       237\n\n    accuracy                           0.71      1470\n   macro avg       0.57      0.60      0.57      1470\nweighted avg       0.78      0.71      0.74      1470\n']

In [6]:
# This will create different combinations of features and will iterate through the newModel function. 
minCombinations = 5
newFeatures = []
for i in range(minCombinations, len(features)):
    newFeatures += list(combinations(features, i))

# Uncomment the next line to create a fast example
newFeatures = list(combinations(features, len(features)-1))
scores = []
counter = 1

for i in newFeatures:
    print(f'Progress {round(counter/len(newFeatures)*100,1)}% {counter}/{len(newFeatures)}')
    counter += 1
    newDF = df.filter(i, axis=1)
    X = pd.get_dummies(newDF)
    retList = createModel(X)
    retList.append(i)
    scores.append(retList)

print("Models processing complete!")

Progress 7.1% 1/14
Progress 14.3% 2/14
Progress 21.4% 3/14
Progress 28.6% 4/14
Progress 35.7% 5/14


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Progress 42.9% 6/14
Progress 50.0% 7/14
Progress 57.1% 8/14
Progress 64.3% 9/14
Progress 71.4% 10/14
Progress 78.6% 11/14
Progress 85.7% 12/14
Progress 92.9% 13/14
Progress 100.0% 14/14
Models processing complete!


In [7]:
# Converting to Dataframe and Sorting
scoresDF = pd.DataFrame(scores)
scoresDF.columns = ["trainScore", "testScore", "recallScore", "classificationReport", "Features"]
scoresDF = scoresDF.sort_values(["recallScore"], ascending=False)
scoresDF

Unnamed: 0,trainScore,testScore,recallScore,classificationReport,Features
5,0.854809,0.807065,0.56962,precision recall f1-score ...,"(Department, JobRole, Gender, StockOptionLevel..."
9,0.852995,0.815217,0.535865,precision recall f1-score ...,"(Department, JobRole, Gender, StockOptionLevel..."
7,0.854809,0.817935,0.527426,precision recall f1-score ...,"(Department, JobRole, Gender, StockOptionLevel..."
3,0.849365,0.815217,0.468354,precision recall f1-score ...,"(Department, JobRole, Gender, StockOptionLevel..."
1,0.858439,0.807065,0.451477,precision recall f1-score ...,"(Department, JobRole, Gender, StockOptionLevel..."
6,0.857532,0.807065,0.434599,precision recall f1-score ...,"(Department, JobRole, Gender, StockOptionLevel..."
10,0.857532,0.807065,0.434599,precision recall f1-score ...,"(Department, JobRole, Gender, WorkLifeBalance,..."
13,0.857532,0.807065,0.434599,precision recall f1-score ...,"(JobRole, Gender, StockOptionLevel, WorkLifeBa..."
0,0.856624,0.820652,0.42616,precision recall f1-score ...,"(Department, JobRole, Gender, StockOptionLevel..."
8,0.858439,0.809783,0.42616,precision recall f1-score ...,"(Department, JobRole, Gender, StockOptionLevel..."


In [8]:
# Uncomment the following line to export dataframe to csv file
scoresDF.to_csv('scoresDF.csv', index = True, header=True)

In [9]:
# Example of printing out a Classification Report
#print(scoresDF.loc[5][3])

In [10]:
# Used to calculate amount of combinations
testCombinations = []
for i in range(5, len(features)):
    testCombinations += list(combinations(features, i))

len(testCombinations)

14912