In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
#'Age','Attrition','DistanceFromHome','EnvironmentSatisfaction','Gender','HourlyRate','JobLevel','JobSatisfaction','MaritalStatus',
#'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
#'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager'

features=['Age','DistanceFromHome','EnvironmentSatisfaction','Gender','HourlyRate','JobLevel','JobSatisfaction','MaritalStatus',
'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager']

features=['Age','DistanceFromHome','EnvironmentSatisfaction','JobSatisfaction','MaritalStatus',
         'MonthlyIncome','NumCompaniesWorked','TotalWorkingYears','TrainingTimesLastYear',
          'WorkLifeBalance','YearsSinceLastPromotion']


# features=['TrainingTimesLastYear',
# 'PerformanceRating',
# 'Age',
# 'DistanceFromHome',
# 'StockOptionLevel',
# 'JobSatisfaction',
# 'MaritalStatus']

In [4]:
#select features
data=df[features]
data_binary_encoded = pd.get_dummies(data)
X=data_binary_encoded
y=df["Attrition"].replace({'Yes': 1, 'No': 0})


In [5]:
#Splitting and scaling the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [6]:
#Train the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model = model.fit(X_train_scaled, y_train)
print(f"Training score: {model.score(X_train_scaled,y_train)}")
print(f"Test score: {model.score(X_test_scaled,y_test)}")

Training score: 0.852994555353902
Test score: 0.8179347826086957


In [7]:
model.get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [8]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear']}
]
grid = GridSearchCV(LogisticRegression(), param_grid, verbose=3)
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 3/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 4/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 5/5] END .........C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV 1/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 2/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 3/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 4/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 5/5] END .........C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV 1/5] END C=0.00026366508987303583, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END C=0.00026366508987303583, p

[CV 1/5] END C=11.288378916846883, penalty=l2, solver=liblinear; total time=   0.0s
[CV 2/5] END C=11.288378916846883, penalty=l2, solver=liblinear; total time=   0.0s
[CV 3/5] END C=11.288378916846883, penalty=l2, solver=liblinear; total time=   0.0s
[CV 4/5] END C=11.288378916846883, penalty=l2, solver=liblinear; total time=   0.0s
[CV 5/5] END C=11.288378916846883, penalty=l2, solver=liblinear; total time=   0.0s
[CV 1/5] END C=29.763514416313132, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END C=29.763514416313132, penalty=l1, solver=liblinear; total time=   0.0s
[CV 3/5] END C=29.763514416313132, penalty=l1, solver=liblinear; total time=   0.0s
[CV 4/5] END C=29.763514416313132, penalty=l1, solver=liblinear; total time=   0.0s
[CV 5/5] END C=29.763514416313132, penalty=l1, solver=liblinear; total time=   0.0s
[CV 1/5] END C=29.763514416313132, penalty=l2, solver=liblinear; total time=   0.0s
[CV 2/5] END C=29.763514416313132, penalty=l2, solver=liblinear; total time=

GridSearchCV(estimator=LogisticRegression(),
             param_grid=[{'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                          'penalty': ['l1', 'l2'], 'solver': ['liblinear']}],
             verbose=3)

In [9]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 4.281332398719396, 'penalty': 'l2', 'solver': 'liblinear'}
0.8566227889757302


In [10]:
model2=grid.best_estimator_
weights=model2.coef_
weights

array([[-0.99961307,  1.05316455, -0.88876895, -0.71000118, -0.95879132,
         1.07191102, -2.35407398, -0.88032073, -1.03270723,  1.15524035,
        -0.28996789, -0.14465304,  0.79022635]])

In [11]:
for i, name in enumerate(X_train.columns):
    print(f"{name}: \t {np.absolute(weights[0][i])}")

Age: 	 0.9996130723746832
DistanceFromHome: 	 1.0531645548894515
EnvironmentSatisfaction: 	 0.8887689450242287
JobSatisfaction: 	 0.710001178209924
MonthlyIncome: 	 0.9587913198756391
NumCompaniesWorked: 	 1.0719110165105452
TotalWorkingYears: 	 2.354073980097105
TrainingTimesLastYear: 	 0.8803207286394817
WorkLifeBalance: 	 1.0327072324569309
YearsSinceLastPromotion: 	 1.155240353674543
MaritalStatus_Divorced: 	 0.2899678905909735
MaritalStatus_Married: 	 0.1446530435596339
MaritalStatus_Single: 	 0.7902263497932621


In [12]:

features=['TrainingTimesLastYear',
'PerformanceRating',
'Age',
'DistanceFromHome',
'StockOptionLevel',
'JobSatisfaction',
'MaritalStatus']


Age
DistanceFromHome
EnvironmentSatisfaction
JobSatisfaction
MonthlyIncome
NumCompaniesWorked
TotalWorkingYears
TrainingTimesLastYear
WorkLifeBalance
YearsSinceLastPromotion

Age
DistanceFromHome
TrainingTimesLastYear
YearsSinceLastPromotion
JobSatisfaction
MaritalStatus




NameError: name 'Age' is not defined