In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [5]:
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df['PromotionRatio']=df['YearsSinceLastPromotion']/df['YearsAtCompany']
df['PromotionRatio'] = df['PromotionRatio'].fillna(0)
df['PromotionRatio'].replace(np.inf,0)

df['ManagerRatio']=df['YearsWithCurrManager']/df['YearsAtCompany']
df['ManagerRatio'] = df['ManagerRatio'].fillna(0)
df['ManagerRatio'].replace(np.inf,0)

df['CompanyRatio']=df['YearsAtCompany']/df['TotalWorkingYears']
df['CompanyRatio'] = df['CompanyRatio'].fillna(0)
df['CompanyRatio'].replace(np.inf,0)

df['SpecificIncome']=df['MonthlyIncome']/np.mean(df['MonthlyIncome'])
df['SpecificIncome'] = df['SpecificIncome'].fillna(0)


df['Role-Ratio']=df['YearsInCurrentRole']/df['YearsAtCompany']
df['Role-Ratio'] = df['Role-Ratio'].fillna(0)
df['Role-Ratio'] = df['Role-Ratio'].replace(np.inf,0)

df['Role-Manager-Ratio']=df['YearsInCurrentRole']/df['YearsWithCurrManager']
df['Role-Manager-Ratio'] = df['Role-Manager-Ratio'].fillna(0)
df['Role-Manager-Ratio']= df['Role-Manager-Ratio'].replace(np.inf,0)

df['Years-per-company']=df['YearsAtCompany']/(df['TotalWorkingYears']/df['NumCompaniesWorked'])
df['Years-per-company'] = df['Years-per-company'].fillna(0)
df['Years-per-company'] = df['Years-per-company'].replace(np.inf,0)

#df = df.dropna()
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'PromotionRatio', 'ManagerRatio',
       'CompanyRatio', 'SpecificIncome', 'Role-Ratio', 'Role-Manager-Ratio',
       'Years-per-company'],
      dtype='object')

In [3]:
#'Age','Attrition','DistanceFromHome','EnvironmentSatisfaction','Gender','HourlyRate','JobLevel','JobSatisfaction','MaritalStatus',
#'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
#'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager'

# features=['Age','DistanceFromHome','EnvironmentSatisfaction','Gender','HourlyRate','JobLevel','JobSatisfaction','MaritalStatus',
# 'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
# 'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager']

features=['Age','DistanceFromHome','EnvironmentSatisfaction','JobSatisfaction','MaritalStatus',
         'MonthlyIncome','NumCompaniesWorked','TotalWorkingYears','TrainingTimesLastYear',
          'WorkLifeBalance','YearsSinceLastPromotion']


# features=['TrainingTimesLastYear',
# 'DistanceFromHome',
# 'JobSatisfaction',
# 'MaritalStatus',
# 'TotalWorkingYears']

features=['JobSatisfaction',
         'DistanceFromHome',
         'WorkLifeBalance',
         'NumCompaniesWorked',
         'EnvironmentSatisfaction',
         'YearsAtCompany','JobLevel','Department'
         ]

In [4]:
#select features
data=df[features]
data_binary_encoded = pd.get_dummies(data)
X=data_binary_encoded
y=df["Attrition"].replace({'Yes': 1, 'No': 0})


In [5]:
#Splitting and scaling the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [6]:
#Train the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model = model.fit(X_train_scaled, y_train)
print(f"Training score: {model.score(X_train_scaled,y_train)}")
print(f"Test score: {model.score(X_test_scaled,y_test)}")

Training score: 0.8466424682395645
Test score: 0.8152173913043478


In [7]:
model.get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [8]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear']}
]
grid = GridSearchCV(LogisticRegression(), param_grid, verbose=3)
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 3/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 4/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 5/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 1/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 2/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 3/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 4/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 5/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 1/5] END C=0.00026366508987303583, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END C=0.00026366508987303583, p

[CV 5/5] END C=1.623776739188721, penalty=l1, solver=liblinear; total time=   0.0s
[CV 1/5] END C=1.623776739188721, penalty=l2, solver=liblinear; total time=   0.0s
[CV 2/5] END C=1.623776739188721, penalty=l2, solver=liblinear; total time=   0.0s
[CV 3/5] END C=1.623776739188721, penalty=l2, solver=liblinear; total time=   0.0s
[CV 4/5] END C=1.623776739188721, penalty=l2, solver=liblinear; total time=   0.0s
[CV 5/5] END C=1.623776739188721, penalty=l2, solver=liblinear; total time=   0.0s
[CV 1/5] END C=4.281332398719396, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END C=4.281332398719396, penalty=l1, solver=liblinear; total time=   0.0s
[CV 3/5] END C=4.281332398719396, penalty=l1, solver=liblinear; total time=   0.0s
[CV 4/5] END C=4.281332398719396, penalty=l1, solver=liblinear; total time=   0.0s
[CV 5/5] END C=4.281332398719396, penalty=l1, solver=liblinear; total time=   0.0s
[CV 1/5] END C=4.281332398719396, penalty=l2, solver=liblinear; total time=   0.0s
[CV 

GridSearchCV(estimator=LogisticRegression(),
             param_grid=[{'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                          'penalty': ['l1', 'l2'], 'solver': ['liblinear']}],
             verbose=3)

In [9]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 78.47599703514607, 'penalty': 'l1', 'solver': 'liblinear'}
0.8475483340189223


In [10]:
model2=grid.best_estimator_
weights=model2.coef_
weights

array([[-0.60011054,  1.05871799, -1.21352151,  0.66496233, -0.81775341,
        -1.22685024, -2.12542275,  0.        , -0.36936753,  0.52395626]])

In [11]:
for i, name in enumerate(X_train.columns):
    print(f"{name}: \t {np.absolute(weights[0][i])}")

JobSatisfaction: 	 0.6001105426822148
DistanceFromHome: 	 1.0587179851380955
WorkLifeBalance: 	 1.2135215128080348
NumCompaniesWorked: 	 0.664962331745765
EnvironmentSatisfaction: 	 0.8177534141211573
YearsAtCompany: 	 1.2268502410750697
JobLevel: 	 2.1254227548456655
Department_Human Resources: 	 0.0
Department_Research & Development: 	 0.3693675329350766
Department_Sales: 	 0.5239562636886895


In [12]:
from joblib import dump, load
dump(model2, 'attrition_pred.lrm') 
dump(X_scaler, 'lrm.scaler')


['lrm.scaler']

In [13]:
model3=load('attrition_pred.lrm')
scaler=load('lrm.scaler')
test=model.predict(X)
print(classification_report(y,test))

              precision    recall  f1-score   support

           0       0.87      0.80      0.84      1233
           1       0.28      0.39      0.32       237

    accuracy                           0.74      1470
   macro avg       0.57      0.60      0.58      1470
weighted avg       0.78      0.74      0.75      1470



In [14]:
# test={'TrainingTimesLastYear':[2],
# 'DistanceFromHome':[50],
# 'JobSatisfaction':[1],
# 'TotalWorkingYears':[3],
# 'MaritalStatus_Divorced':[0],
# 'MaritalStatus_Married':[0],
# 'MaritalStatus_Single':[1],
# 'MonthlyIncome':[5000]
#      }
# test=pd.DataFrame(test)
# test=scaler.transform(test)
# model3.predict(test)[0]