In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df['PromotionRatio']=df['YearsSinceLastPromotion']/df['YearsAtCompany']
df['PromotionRatio'] = df['PromotionRatio'].fillna(0)
df['PromotionRatio'].replace(np.inf,0)

df['ManagerRatio']=df['YearsWithCurrManager']/df['YearsAtCompany']
df['ManagerRatio'] = df['ManagerRatio'].fillna(0)
df['ManagerRatio'].replace(np.inf,0)

df['CompanyRatio']=df['YearsAtCompany']/df['TotalWorkingYears']
df['CompanyRatio'] = df['CompanyRatio'].fillna(0)
df['CompanyRatio'].replace(np.inf,0)

df['SpecificIncome']=df['MonthlyIncome']/np.mean(df['MonthlyIncome'])
df['SpecificIncome'] = df['SpecificIncome'].fillna(0)


df['Role-Ratio']=df['YearsInCurrentRole']/df['YearsAtCompany']
df['Role-Ratio'] = df['Role-Ratio'].fillna(0)
df['Role-Ratio'] = df['Role-Ratio'].replace(np.inf,0)

df['Role-Manager-Ratio']=df['YearsInCurrentRole']/df['YearsWithCurrManager']
df['Role-Manager-Ratio'] = df['Role-Manager-Ratio'].fillna(0)
df['Role-Manager-Ratio']= df['Role-Manager-Ratio'].replace(np.inf,0)

df['Yearspercompany']=(df['TotalWorkingYears']/df['NumCompaniesWorked']+1)
df['Yearspercompany'] = df['Yearspercompany'].fillna(0)
df['Yearspercompany'] = df['Yearspercompany'].replace(np.inf,0)

#df = df.dropna()
# Adding new variables
df['Time_in_each_comp'] = (df['Age']-20) / ((df)['NumCompaniesWorked']+1)
df['TotalSatisfaction_mean'] = (df['RelationshipSatisfaction']  + df['EnvironmentSatisfaction'] + df['JobSatisfaction'] + df['JobInvolvement'] + df['WorkLifeBalance'])/5
df['Income_YearsComp'] = df['MonthlyIncome'] / df['YearsAtCompany']
df['Income_YearsComp'] = df['Income_YearsComp'].replace(np.Inf, 0)
df['Income_Distance'] = df['MonthlyIncome'] / df['DistanceFromHome']
df['Fidelity'] = (df['NumCompaniesWorked']) / df['TotalWorkingYears']
df['Fidelity'] = df['Fidelity'].replace(np.Inf, 0)
df['Stability'] = df['YearsInCurrentRole'] / df['YearsAtCompany']
df['Stability'].fillna((df['Stability'].mean()), inplace=True)
df['Hrate_Mrate'] = df['HourlyRate'] / df['MonthlyRate']
def SalesDpt(df) :
    if df['Department'] == 'Sales':
        return 1
    else:
        return 0
df['SalesDpt'] = df.apply(lambda df:SalesDpt(df) ,axis = 1)

df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'PromotionRatio', 'ManagerRatio',
       'CompanyRatio', 'SpecificIncome', 'Role-Ratio', 'Role-Manager-Ratio',
       'Yearspercompany', 'Time_in_each_comp', 'TotalSatisfaction_mean',
       'Income_YearsComp', 'Income_Distance', 'Fidelity', 'Stability',
       'Hrate_Mrate', 'SalesDpt'],
      dtype='object')

In [3]:
#'Age','Attrition','DistanceFromHome','EnvironmentSatisfaction','Gender','HourlyRate','JobLevel','JobSatisfaction','MaritalStatus',
#'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
#'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager'

# features=['Age','DistanceFromHome','EnvironmentSatisfaction','Gender','HourlyRate','JobLevel','JobSatisfaction','MaritalStatus',
# 'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
# 'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager']

features=['Age','DistanceFromHome','EnvironmentSatisfaction','JobSatisfaction','MaritalStatus',
         'MonthlyIncome','NumCompaniesWorked','TotalWorkingYears','TrainingTimesLastYear',
          'WorkLifeBalance','YearsSinceLastPromotion']


# features=['TrainingTimesLastYear',
# 'DistanceFromHome',
# 'JobSatisfaction',
# 'MaritalStatus',
# 'TotalWorkingYears']

features=['JobSatisfaction',
         'DistanceFromHome',
         'WorkLifeBalance',
         'NumCompaniesWorked',
         'EnvironmentSatisfaction',
         'YearsAtCompany','JobLevel','Department'
         ]
#'Hrate_Mrate' removed due to increased performance without it
#'Stability' Removed due to no increased effect from it
features=['OverTime', 'Fidelity',  'SalesDpt', 'MaritalStatus', 'Gender']

In [4]:
#select features
data=df[features]
data_binary_encoded = pd.get_dummies(data)
X=data_binary_encoded
y=df["Attrition"].replace({'Yes': 1, 'No': 0})


In [5]:
#Splitting and scaling the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [6]:
#Train the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model = model.fit(X_train_scaled, y_train)
print(f"Training score: {model.score(X_train_scaled,y_train)}")
print(f"Test score: {model.score(X_test_scaled,y_test)}")

Training score: 0.8629764065335753
Test score: 0.8206521739130435


In [7]:
model.get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [8]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear']}
]
grid = GridSearchCV(LogisticRegression(), param_grid, verbose=3)
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 3/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 4/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 5/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 1/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 2/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 3/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 4/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 5/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 1/5] END C=0.00026366508987303583, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END C=0.00026366508987303583, p

[CV 4/5] END C=29.763514416313132, penalty=l1, solver=liblinear; total time=   0.0s
[CV 5/5] END C=29.763514416313132, penalty=l1, solver=liblinear; total time=   0.0s
[CV 1/5] END C=29.763514416313132, penalty=l2, solver=liblinear; total time=   0.0s
[CV 2/5] END C=29.763514416313132, penalty=l2, solver=liblinear; total time=   0.0s
[CV 3/5] END C=29.763514416313132, penalty=l2, solver=liblinear; total time=   0.0s
[CV 4/5] END C=29.763514416313132, penalty=l2, solver=liblinear; total time=   0.0s
[CV 5/5] END C=29.763514416313132, penalty=l2, solver=liblinear; total time=   0.0s
[CV 1/5] END C=78.47599703514607, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END C=78.47599703514607, penalty=l1, solver=liblinear; total time=   0.0s
[CV 3/5] END C=78.47599703514607, penalty=l1, solver=liblinear; total time=   0.0s
[CV 4/5] END C=78.47599703514607, penalty=l1, solver=liblinear; total time=   0.0s
[CV 5/5] END C=78.47599703514607, penalty=l1, solver=liblinear; total time=   0.

GridSearchCV(estimator=LogisticRegression(),
             param_grid=[{'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                          'penalty': ['l1', 'l2'], 'solver': ['liblinear']}],
             verbose=3)

In [9]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 29.763514416313132, 'penalty': 'l2', 'solver': 'liblinear'}
0.8620526532291238


In [10]:
model2=grid.best_estimator_
weights=model2.coef_
weights

array([[ 3.91724024,  0.66253845, -1.25838087,  0.22612764, -0.85173551,
        -0.55692495,  0.37640723, -0.74500192, -0.2872513 ]])

In [11]:
for i, name in enumerate(X_train.columns):
    print(f"{name}: \t {np.absolute(weights[0][i])}")

Fidelity: 	 3.9172402421674195
SalesDpt: 	 0.66253845316328
OverTime_No: 	 1.2583808673029049
OverTime_Yes: 	 0.22612764210838449
MaritalStatus_Divorced: 	 0.8517355063937327
MaritalStatus_Married: 	 0.5569249516586932
MaritalStatus_Single: 	 0.3764072328579391
Gender_Female: 	 0.74500192240567
Gender_Male: 	 0.2872513027888623


In [12]:
from joblib import dump, load
dump(model2, 'attrition_pred.lrm') 
dump(X_scaler, 'lrm.scaler')


['lrm.scaler']

In [13]:
model3=load('attrition_pred.lrm')
scaler=load('lrm.scaler')
test=model.predict(X)
print(classification_report(y,test))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      1233
           1       0.47      0.43      0.45       237

    accuracy                           0.83      1470
   macro avg       0.68      0.67      0.67      1470
weighted avg       0.82      0.83      0.83      1470



In [14]:
# test={'TrainingTimesLastYear':[2],
# 'DistanceFromHome':[50],
# 'JobSatisfaction':[1],
# 'TotalWorkingYears':[3],
# 'MaritalStatus_Divorced':[0],
# 'MaritalStatus_Married':[0],
# 'MaritalStatus_Single':[1],
# 'MonthlyIncome':[5000]
#      }
# test=pd.DataFrame(test)
# test=scaler.transform(test)
# model3.predict(test)[0]

In [15]:
X.head()

Unnamed: 0,Fidelity,SalesDpt,OverTime_No,OverTime_Yes,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Gender_Female,Gender_Male
0,1.0,1,0,1,0,0,1,1,0
1,0.1,0,1,0,0,1,0,0,1
2,0.857143,0,0,1,0,0,1,0,1
3,0.125,0,0,1,0,1,0,1,0
4,1.5,0,1,0,0,1,0,0,1


In [16]:
testdf=pd.DataFrame({"Gender":["Male","Male"]})
pd.get_dummies(testdf)

Unnamed: 0,Gender_Male
0,1
1,1
