In [8]:
import joblib
import pandas as pd

In [9]:
df = pd.read_csv("clean-employee-data.csv")
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,37,1,Travel_Rarely,1141,Research & Development,11,2,Medical,1,Female,...,3,1,0,15,2,1,1,0,0,0
1,51,1,Travel_Rarely,1323,Research & Development,4,4,Life Sciences,1,Male,...,3,3,3,18,2,4,10,0,2,7
2,42,0,Travel_Frequently,555,Sales,26,3,Marketing,3,Female,...,3,4,1,23,2,4,20,4,4,8
3,40,0,Travel_Rarely,1124,Sales,1,2,Medical,2,Male,...,4,3,3,6,2,2,4,3,0,2
4,55,1,Travel_Rarely,725,Research & Development,2,3,Medical,4,Male,...,3,4,1,24,2,3,5,2,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,28,1,Non-Travel,1366,Research & Development,24,2,Technical Degree,2,Male,...,3,1,0,10,2,2,10,7,1,9
1054,38,0,Travel_Rarely,168,Research & Development,1,3,Life Sciences,3,Female,...,3,4,0,10,4,4,1,0,0,0
1055,28,1,Travel_Rarely,1485,Research & Development,12,1,Life Sciences,3,Female,...,3,4,0,1,4,2,1,1,0,0
1056,40,0,Non-Travel,458,Research & Development,16,2,Life Sciences,3,Male,...,3,2,1,6,0,3,4,2,0,0


In [10]:
model = joblib.load("model/logistic_mode_gridsearch.pkl")
scaler = joblib.load("model/scaler.pkl")

In [11]:
current_employees = df[df['Attrition'] == 0].copy()
current_employees.reset_index(inplace=True, drop=True)

selected_columns = [
    "Attrition", "Age", "BusinessTravel", "DistanceFromHome", "EnvironmentSatisfaction",
    "JobInvolvement", "JobSatisfaction", "MonthlyIncome", "OverTime", "RelationshipSatisfaction", "WorkLifeBalance",
    "PerformanceRating", "EducationField", "NumCompaniesWorked", "JobRole", "TotalWorkingYears", "YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager"
]
predict_df = current_employees[selected_columns].copy()

target = "Attrition"

nominal_feature = ["BusinessTravel", "OverTime", "EducationField", "JobRole"]

numerical_feature = predict_df.select_dtypes(
    include=['int64']).columns.drop(target)

predict_df = pd.get_dummies(predict_df, columns=nominal_feature, drop_first=True)

predict_df[numerical_feature] = scaler.fit_transform(predict_df[numerical_feature])
X = predict_df.drop(columns=target)

In [12]:
current_employees.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,42,0,Travel_Frequently,555,Sales,26,3,Marketing,3,Female,...,3,4,1,23,2,4,20,4,4,8
1,40,0,Travel_Rarely,1124,Sales,1,2,Medical,2,Male,...,4,3,3,6,2,2,4,3,0,2
2,36,0,Travel_Frequently,635,Research & Development,18,1,Medical,2,Female,...,3,1,0,8,2,3,8,1,1,7
3,32,0,Travel_Rarely,1018,Research & Development,3,2,Life Sciences,3,Female,...,3,4,0,10,6,3,7,7,7,7
4,25,0,Travel_Rarely,583,Sales,4,1,Marketing,3,Male,...,3,1,0,5,1,4,5,2,0,3


In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 30 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Age                               879 non-null    float64
 1   DistanceFromHome                  879 non-null    float64
 2   EnvironmentSatisfaction           879 non-null    float64
 3   JobInvolvement                    879 non-null    float64
 4   JobSatisfaction                   879 non-null    float64
 5   MonthlyIncome                     879 non-null    float64
 6   RelationshipSatisfaction          879 non-null    float64
 7   WorkLifeBalance                   879 non-null    float64
 8   PerformanceRating                 879 non-null    float64
 9   NumCompaniesWorked                879 non-null    float64
 10  TotalWorkingYears                 879 non-null    float64
 11  YearsInCurrentRole                879 non-null    float64
 12  YearsSin

## Predict the likely hood of current employee leaving the company

In [14]:
predicted_attrition_risk = model.predict(X)
proba = model.predict_proba(X)[:, 1]
current_employees['PredictedAttrition'] = predicted_attrition_risk
current_employees['AttritionProbability'] = proba
current_employees.sort_values(by='AttritionProbability', ascending=False).head(10)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,PredictedAttrition,AttritionProbability
15,39,0,Travel_Rarely,412,Research & Development,13,4,Medical,3,Female,...,2,21,4,3,19,9,15,2,1,0.927783
511,35,0,Travel_Rarely,1142,Research & Development,23,4,Medical,3,Female,...,1,4,3,3,2,2,2,2,1,0.909592
708,26,0,Travel_Frequently,575,Research & Development,1,2,Life Sciences,1,Female,...,1,5,2,3,2,2,2,0,1,0.889251
418,22,0,Travel_Rarely,217,Research & Development,8,1,Life Sciences,2,Male,...,1,4,3,2,4,3,1,1,1,0.883872
347,36,0,Travel_Rarely,852,Research & Development,5,4,Life Sciences,2,Female,...,1,6,3,4,1,1,0,0,1,0.848109
208,26,0,Travel_Rarely,1443,Sales,23,3,Marketing,3,Female,...,1,5,2,2,2,2,0,0,1,0.831893
674,28,0,Travel_Rarely,760,Sales,2,4,Marketing,2,Female,...,0,8,2,3,8,7,7,5,1,0.831026
372,26,0,Travel_Rarely,192,Research & Development,1,2,Medical,1,Male,...,2,6,2,3,5,3,1,3,1,0.809511
556,27,0,Travel_Rarely,1103,Research & Development,14,3,Life Sciences,1,Male,...,2,9,3,2,9,7,6,8,1,0.806531
534,28,0,Travel_Frequently,193,Research & Development,2,3,Life Sciences,4,Male,...,1,2,2,3,2,2,2,2,1,0.80608
