In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [3]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
factors = df.drop(['Attrition', 'BusinessTravel', 'Department',
                   'EducationField', 'EmployeeCount', 'EmployeeNumber',
                  'Gender', 'JobRole', 'MaritalStatus', 'Over18',
                  'OverTime', 'HourlyRate', 'MonthlyRate',
                  'PerformanceRating', 'StandardHours'], axis=1
                 )

In [5]:
factors['BusinessTravelFreq'] = np.where(df['BusinessTravel']=='Non-Travel', 0, 
         (np.where(df['BusinessTravel']=='Travel_Frequently', 1, 0.5)))

In [6]:
factors['OverTime'] = np.where(df['OverTime']=='Yes', 1, 0)

In [7]:
dept = pd.get_dummies(df['Department'], prefix='Dept')

In [8]:
ef = pd.get_dummies(df['EducationField'], prefix='Degree')

In [9]:
jr = pd.get_dummies(df['JobRole'], prefix='Role')

In [10]:
ms = pd.get_dummies(df['MaritalStatus'])

In [11]:
factors = pd.concat([factors, dept, ef, jr, ms], axis=1)

In [12]:
df['Retained'] = np.where(df['Attrition']=='No', 1, 0)

# Logistic Regression without Regularization

In [13]:
# Declare a logistic regression classifier
lr = LogisticRegression()
y = df['Retained']
X = factors

# Fit the model
fit = lr.fit(X, y)

# Display
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr.predict(X)

print('Accuracy by retention status')
print(pd.crosstab(pred_y_sklearn, y))

print('Percentage accuracy')
print(lr.score(X, y))

Coefficients
[[ 1.64840985e-02  1.83368251e-04 -4.31866452e-02 -1.92322736e-02
   3.64957452e-01  3.81916406e-01 -1.04345572e-02  3.52763708e-01
   4.57088106e-05 -1.85963478e-01 -1.53440070e-02  1.67274529e-01
   1.21058002e-01  7.32014741e-02  1.08216255e-01  2.03974001e-01
  -9.59979795e-02  1.33096314e-01 -1.63468784e-01  1.31035636e-01
  -1.59534833e+00 -1.80051625e+00 -2.60525150e-01  7.81396656e-03
  -5.23299709e-01 -2.83223499e-01  1.15589751e-01 -2.49768483e-01
   2.16071907e-01  1.52259957e-01 -7.26940525e-01  3.35741597e-01
  -3.32106881e-01 -8.89258810e-01  7.26747379e-02  2.77299431e-01
   2.90963336e-01  6.47693352e-02  4.53705958e-02 -6.41464234e-01
   2.21288382e-01 -1.30382635e-01 -8.66916638e-01]]
[-0.77601089]
Accuracy by retention status
Retained    0     1
row_0              
0          97    18
1         140  1215
Percentage accuracy
0.8925170068027211


# Ridge Logistic Regression

In [14]:
# Adding L2 Regularization
lr2 = LogisticRegression(penalty='l2', C=.41)

# Fit the model
fit = lr2.fit(X, y)

# Display
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr2.predict(X)

print('Accuracy by retention status')
print(pd.crosstab(pred_y_sklearn, y))

print('Percentage accuracy')
print(lr2.score(X, y))

Coefficients
[[ 1.22116197e-02  1.40672787e-04 -4.10989424e-02 -7.33065474e-02
   2.99746546e-01  3.21818016e-01  2.55498158e-02  2.60722676e-01
   6.02213141e-05 -1.70099227e-01 -3.09359999e-02  1.57759898e-01
   2.35085859e-01  7.06965582e-02  1.41316541e-01  1.21819234e-01
  -1.12264446e-01  1.56139406e-01 -1.41880241e-01  1.21345076e-01
  -1.06976507e+00 -1.68359370e+00 -1.36112087e-01  9.63922053e-02
  -3.96633034e-01 -1.55599334e-01  1.14417363e-01 -2.14529544e-01
   2.25997896e-01  9.00849786e-02 -4.96724275e-01  2.52463405e-01
  -1.71037490e-01 -6.48722865e-01  3.68195977e-02  2.10845431e-01
   1.45630488e-01  1.62183883e-01  2.55459512e-02 -4.50081316e-01
   2.36248336e-01  2.10392166e-02 -6.93640468e-01]]
[-0.43635292]
Accuracy by retention status
Retained    0     1
row_0              
0          97    19
1         140  1214
Percentage accuracy
0.8918367346938776


# Lasso Logistic Regression

In [15]:
# Switching to L1 Regularization
lr1 = LogisticRegression(penalty='l1', C=.5)

# Fit the model
fit = lr1.fit(X, y)

# Display
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr1.predict(X)

print('Accuracy by retention status')
print(pd.crosstab(pred_y_sklearn, y))

print('Percentage accuracy')
print(lr1.score(X, y))

Coefficients
[[ 2.03062859e-02  2.10262934e-04 -4.21353100e-02 -1.76060124e-02
   3.56765720e-01  4.02137438e-01  0.00000000e+00  3.32180161e-01
   5.96347678e-05 -1.76548848e-01 -6.88561694e-03  1.81991980e-01
   1.81816425e-01  6.29362147e-02  1.48375398e-01  2.18976537e-01
  -8.32983758e-02  1.25256222e-01 -1.58103234e-01  1.25348161e-01
  -1.61214092e+00 -1.77437935e+00  0.00000000e+00  0.00000000e+00
  -6.85423210e-01 -5.33862134e-02  0.00000000e+00 -2.52479459e-01
   7.70272568e-02  0.00000000e+00 -7.27452476e-01  2.69221087e-02
  -7.27790007e-01 -8.87498042e-01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00 -5.45982964e-01
   0.00000000e+00 -1.86722781e-01 -9.56600171e-01]]
[-1.03951317]
Accuracy by retention status
Retained    0     1
row_0              
0          98    16
1         139  1217
Percentage accuracy
0.8945578231292517


# Cross Validation

In [16]:
# Validating logistic regression without regularization
print('Vanilla Logistic Regression')
print(cross_val_score(lr, X, y, cv=10))
# Validating ridge regression model
print('Ridge Regression')
print(cross_val_score(lr2, X, y, cv=10))
# Validating lasso regression model
print('Lasso Regression')
print(cross_val_score(lr1, X, y, cv=10))

Vanilla Logistic Regression
[0.90540541 0.84459459 0.89189189 0.89115646 0.87755102 0.84353741
 0.89795918 0.88356164 0.88356164 0.89726027]
Ridge Regression
[0.91216216 0.85135135 0.88513514 0.88435374 0.88435374 0.85034014
 0.89795918 0.8630137  0.88356164 0.89726027]
Lasso Regression
[0.91216216 0.85810811 0.89189189 0.89115646 0.8707483  0.85714286
 0.89795918 0.89726027 0.90410959 0.89041096]


# Conclusions

The Lasso logistic regression was the most accurate.  It showed 89.5% accuracy, compared to 89.2% accuracy for the Ridge regression model and 89.3% accuracy for the logistic regression model without any regularization.  The lasso logistic regression model also performs best when it comes to the cross validation.  I ran the models with fewer features, and the difference in accuracy between the models was much greater, with the lasso logistic regression model far outperforming the other two.

I used most of the available variables to form features for the model.  Some were redundant and were left out, as hourly rate and monthly rate are almost perfectly correlated.  I created dummy variables to use the categorical data such as department.

One strength of logistic regression modeling is that it is easy to implement.  Regression can be limited as a model in situations where data is noisy.  In this case, a regression model was more accurate than the random forest model I previously used on the same data set.