In [1]:
import pandas as pd
import numpy as np
import joblib as jb
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('HR_Attrition_dataset.csv')

In [3]:
sample_df = df.sample(n=2000)

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
sample_df.head()

Unnamed: 0,Employee_ID,Age,Attrition,Business_Travel,Department,Distance_From_Home,Education,Environment_Satisfaction,Gender,Salary,Job_Involvement,Job_Level,Job_Role,Job_Satisfaction,Marital_Status,Number_of_Companies_Worked_previously,Overtime,Salary_Hike_in_percent,Total_working_years_experience,Work_life_balance,No_of_years_worked_at_current_company,No_of_years_in_current_role,Years_since_last_promotion
5289,5290,48,No,Travel Rarely,IT Services,2,Master's,1,Female,188979,3,6,Technician,2,Married,2,No,86,8,4,10,5,0
2429,2430,32,Yes,Travel Rarely,Data Science,25,Graduation,2,Female,126452,1,2,Consultant,2,Divorced,7,No,53,11,1,3,3,2
5336,5337,32,No,No Travel,Cyber Security,13,Master's,5,Male,69337,4,4,Support,3,Divorced,9,Yes,77,19,4,2,8,6
4246,4247,31,No,Travel Rarely,Data Science,32,Degree,4,Male,199067,1,5,QA Analyst,3,Married,5,No,0,11,3,3,6,1
4532,4533,29,No,Travel Frequently,Data Science,42,Master's,5,Male,145633,1,1,QA Analyst,2,Divorced,7,Yes,79,3,5,10,6,0


In [6]:
import numpy as np
import pandas as pd

def generate_synthetic_data(df, n_rows=1000, random_state=42):
    np.random.seed(random_state)
    synthetic = pd.DataFrame()

    for col in df.columns:
        if df[col].dtype in ["int64", "float64"]:
            mean = df[col].mean()
            std = df[col].std()
            synthetic[col] = np.random.normal(mean, std, n_rows)

        else:
            probs = df[col].value_counts(normalize=True)
            synthetic[col] = np.random.choice(
                probs.index, size=n_rows, p=probs.values
            )

    return synthetic

In [7]:
sample_df = generate_synthetic_data(df, 1000, 42)

In [8]:
sample_df.head()

Unnamed: 0,Employee_ID,Age,Attrition,Business_Travel,Department,Distance_From_Home,Education,Environment_Satisfaction,Gender,Salary,Job_Involvement,Job_Level,Job_Role,Job_Satisfaction,Marital_Status,Number_of_Companies_Worked_previously,Overtime,Salary_Hike_in_percent,Total_working_years_experience,Work_life_balance,No_of_years_worked_at_current_company,No_of_years_in_current_role,Years_since_last_promotion
0,6434.461942,58.913634,No,Travel Rarely,Data Science,16.703312,Below College,2.448016,Male,82402.354621,2.048512,6.46319,HR,1.242843,Divorced,11.677723,Yes,-16.888089,0.200546,1.424602,4.587851,6.812741,4.334917
1,4601.345386,52.665204,No,Travel Rarely,Data Science,19.626157,Graduation,1.119383,Female,89467.676245,4.876957,5.911227,Technician,3.184286,Divorced,7.429409,Yes,55.099817,14.296127,5.190207,7.845455,2.891336,6.416419
2,6870.309243,41.279773,No,No Travel,Network Administration,15.569248,Degree,3.465316,Male,118409.153587,4.149552,7.994671,Software Engineer,2.965728,Divorced,7.512338,Yes,-20.729591,8.932827,0.478615,6.197198,9.461051,4.874023
3,9397.328313,31.979725,No,Travel Rarely,Cyber Security,4.828343,Below College,2.615236,Male,130221.251182,1.686453,3.292902,QA Analyst,2.824335,Divorced,5.546809,Yes,81.866985,11.021928,1.578682,2.187415,8.155039,5.940605
4,4324.523634,49.685124,Yes,Travel Rarely,Cyber Security,27.54375,Below College,2.76562,Female,66077.365123,3.423215,8.35798,Help Desk,2.836885,Divorced,3.348504,No,65.364093,4.474222,2.654823,0.78587,5.937967,0.971618


In [9]:
num_cols = sample_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = sample_df.select_dtypes(include=['object', 'category']).columns.tolist()

In [10]:
for col in num_cols:
    sample_df[col] = sample_df[col].astype(int)

In [11]:
sample_df.head(1)

Unnamed: 0,Employee_ID,Age,Attrition,Business_Travel,Department,Distance_From_Home,Education,Environment_Satisfaction,Gender,Salary,Job_Involvement,Job_Level,Job_Role,Job_Satisfaction,Marital_Status,Number_of_Companies_Worked_previously,Overtime,Salary_Hike_in_percent,Total_working_years_experience,Work_life_balance,No_of_years_worked_at_current_company,No_of_years_in_current_role,Years_since_last_promotion
0,6434,58,No,Travel Rarely,Data Science,16,Below College,2,Male,82402,2,6,HR,1,Divorced,11,Yes,-16,0,1,4,6,4


In [12]:
df.head(1)

Unnamed: 0,Employee_ID,Age,Attrition,Business_Travel,Department,Distance_From_Home,Education,Environment_Satisfaction,Gender,Salary,Job_Involvement,Job_Level,Job_Role,Job_Satisfaction,Marital_Status,Number_of_Companies_Worked_previously,Overtime,Salary_Hike_in_percent,Total_working_years_experience,Work_life_balance,No_of_years_worked_at_current_company,No_of_years_in_current_role,Years_since_last_promotion
0,1,56,Yes,Travel Rarely,Network Administration,24,Graduation,5,Male,186630,2,6,Manager,4,Married,7,Yes,85,9,1,7,2,4


In [13]:
df['Attrition'] = df['Attrition'].map({'Yes':1, 'No':0})

In [14]:
from sklearn.model_selection import train_test_split, StratifiedKFold

skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=42)

In [15]:
X = df.drop(columns=['Employee_ID', 'Attrition'])
y = df[['Attrition']]

In [16]:
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categoric_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [17]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

numerical_transformer = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [18]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numeric_cols),
    ('cat', categorical_transformer, categoric_cols)
], remainder='passthrough')

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report, precision_recall_curve

lg_bal = LogisticRegression(max_iter=2000, 
                            class_weight='balanced', 
                            random_state=42,
                            solver='saga')

dt_bal = DecisionTreeClassifier(class_weight='balanced', random_state=42, criterion="log_loss",
                                max_depth=8, max_leaf_nodes=40, min_samples_leaf=10, min_samples_split=30)

rf_bal = RandomForestClassifier(n_estimators=1000, class_weight='balanced', random_state=42, 
                                criterion='log_loss', max_depth=8, min_samples_split=30, 
                                max_leaf_nodes = 30, max_samples = 0.8,
                                min_samples_leaf = 10)

cat_bal = CatBoostClassifier(iterations=1500, learning_rate=0.03, class_weights=[1, 10], depth=6, random_seed=42,
                             eval_metric='AUC', loss_function='Logloss',
                             cat_features=categoric_cols, verbose=False)

In [20]:
lg_bal_pipe = Pipeline(steps=[
    ('preprocessing', preprocessor), 
    ('lg_bal', lg_bal)
])

dt_bal_pipe = Pipeline(steps=[
    ('preprocessing', preprocessor), 
    ('dt_bal', dt_bal)
])

rf_bal_pipe = Pipeline(steps=[
    ('preprocessing', preprocessor), 
    ('rf_bal', rf_bal)
])

In [21]:
lg_roc_auc_scores = []
lg_pr_auc_scores = []
lg_fpr = []
lg_fnr = []
pr_list = []
re_list =[]

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f'fold {i}')
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    lg_bal_pipe.fit(X_train, y_train.values.ravel())
    lg_prob = lg_bal_pipe.predict_proba(X_test)[:, 1]
    y_pred = (lg_prob >= 0.45).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    lg_fnr.append(fn / (fn + tp))
    lg_fpr.append(fp / (fp + tn))
    lg_roc_auc_scores.append(roc_auc_score(y_test, lg_prob))
    lg_pr_auc_scores.append(average_precision_score(y_test, lg_prob))
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
    pr_list.append(precision)
    re_list.append(recall)

fold 0
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 1
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 2
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 3
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 4
(8000, 21) (2000, 21) (8000, 1) (2000, 1)


In [22]:
model_kpis_lg = {
    "Mean false negative rate": np.mean(lg_fnr),
    "Mean false positive rate": np.mean(lg_fpr),
    "Mean ROC-AUC": np.mean(lg_roc_auc_scores),
    "Mean PR-AUC": np.mean(lg_pr_auc_scores),
    "Mean Precision": np.mean(pr_list),
    "Mean Recall": np.mean(re_list),
    "Number of folds": 5,
    "Threshold": 0.45
}

In [23]:
for key, value in model_kpis_lg.items():
    model_kpis_lg[key] = round(value, 2)
print(model_kpis_lg)

{'Mean false negative rate': 0.25, 'Mean false positive rate': 0.39, 'Mean ROC-AUC': 0.75, 'Mean PR-AUC': 0.52, 'Mean Precision': 0.53, 'Mean Recall': 0.58, 'Number of folds': 5, 'Threshold': 0.45}


In [24]:
rf_roc_auc_scores = []
rf_pr_auc_scores = []
rf_fpr = []
rf_fnr = []
pr_list_rf = []
re_list_rf = []

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f'fold {i}')
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    rf_bal_pipe.fit(X_train, y_train.values.ravel())
    rf_prob = rf_bal_pipe.predict_proba(X_test)[:, 1]
    y_pred = (rf_prob >= 0.45).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    rf_fnr.append(fn / (fn + tp))
    rf_fpr.append(fp / (fp + tn))
    rf_roc_auc_scores.append(roc_auc_score(y_test, rf_prob))
    rf_pr_auc_scores.append(average_precision_score(y_test, rf_prob))
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
    pr_list_rf.append(precision)
    re_list_rf.append(recall)

fold 0
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 1
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 2
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 3
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 4
(8000, 21) (2000, 21) (8000, 1) (2000, 1)


In [25]:
model_kpis_rf = {
    "Mean false negative rate": np.mean(rf_fnr),
    "Mean false positive rate": np.mean(rf_fpr),
    "Mean ROC-AUC": np.mean(rf_roc_auc_scores),
    "Mean PR-AUC": np.mean(rf_pr_auc_scores),
    "Mean Precision": np.mean(pr_list_rf),
    "Mean Recall": np.mean(re_list_rf),
    "Number of folds": 5,
    "Threshold": 0.45
}

In [26]:
for key, value in model_kpis_rf.items():
    model_kpis_rf[key] = round(value, 2)
print(model_kpis_rf)

{'Mean false negative rate': 0.17, 'Mean false positive rate': 0.38, 'Mean ROC-AUC': 0.83, 'Mean PR-AUC': 0.7, 'Mean Precision': 0.54, 'Mean Recall': 0.61, 'Number of folds': 5, 'Threshold': 0.45}


In [27]:
dt_roc_auc_scores = []
dt_pr_auc_scores = []
dt_fpr = []
dt_fnr = []
dt_pr_list = []
dt_re_list = []

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f'fold {i}')
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    dt_bal_pipe.fit(X_train, y_train.values.ravel())
    dt_prob = dt_bal_pipe.predict_proba(X_test)[:, 1]
    y_pred = (dt_prob >= 0.45).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    dt_fnr.append(fn / (fn + tp))
    dt_fpr.append(fp / (fp + tn))
    dt_roc_auc_scores.append(roc_auc_score(y_test, dt_prob))
    dt_pr_auc_scores.append(average_precision_score(y_test, dt_prob))
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
    dt_pr_list.append(precision)
    dt_re_list.append(recall)

fold 0
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 1
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 2
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 3
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 4
(8000, 21) (2000, 21) (8000, 1) (2000, 1)


In [28]:
model_kpis_dt = {
    "Mean false negative rate": np.mean(dt_fnr),
    "Mean false positive rate": np.mean(dt_fpr),
    "Mean ROC-AUC": np.mean(dt_roc_auc_scores),
    "Mean PR-AUC": np.mean(dt_pr_auc_scores),
    "Mean Precision": np.mean(dt_pr_list),
    "Mean Recall": np.mean(dt_re_list),
    "Number of folds": 5,
    "Threshold": 0.45
}

In [29]:
for key, value in model_kpis_dt.items():
    model_kpis_dt[key] = round(value, 2)
print(model_kpis_dt)

{'Mean false negative rate': 0.22, 'Mean false positive rate': 0.35, 'Mean ROC-AUC': 0.82, 'Mean PR-AUC': 0.69, 'Mean Precision': 0.54, 'Mean Recall': 0.59, 'Number of folds': 5, 'Threshold': 0.45}


In [30]:
cat_roc_auc_scores = []
cat_pr_auc_scores = []
cat_fpr = []
cat_fnr = []
cat_pr_list = []
cat_re_list = []

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f'fold {i}')
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    cat_bal.fit(X_train, y_train.values.ravel(), cat_features=categoric_cols)
    cat_prob = cat_bal.predict_proba(X_test)[:, 1]
    y_pred = (cat_prob >= 0.45).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    cat_fnr.append(fn / (fn + tp))
    cat_fpr.append(fp / (fp + tn))
    cat_roc_auc_scores.append(roc_auc_score(y_test, cat_prob))
    cat_pr_auc_scores.append(average_precision_score(y_test, cat_prob))
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
    cat_pr_list.append(precision)
    cat_re_list.append(recall)

fold 0
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 1
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 2
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 3
(8000, 21) (2000, 21) (8000, 1) (2000, 1)
fold 4
(8000, 21) (2000, 21) (8000, 1) (2000, 1)


In [31]:
model_kpis_cat = {
    "Mean false negative rate": np.mean(cat_fnr),
    "Mean false positive rate": np.mean(cat_fpr),
    "Mean ROC-AUC": np.mean(cat_roc_auc_scores),
    "Mean PR-AUC": np.mean(cat_pr_auc_scores),
    "Mean Precision": np.mean(cat_pr_list),
    "Mean Recall": np.mean(cat_re_list),
    "Number of folds": 5,
    "Threshold": 0.45
}

In [32]:
for key, value in model_kpis_cat.items():
    model_kpis_cat[key] = round(value, 2)
print(model_kpis_cat)

{'Mean false negative rate': 0.19, 'Mean false positive rate': 0.36, 'Mean ROC-AUC': 0.83, 'Mean PR-AUC': 0.72, 'Mean Precision': 0.54, 'Mean Recall': 0.6, 'Number of folds': 5, 'Threshold': 0.45}


In [33]:
model_kpis_lg['model'] = 'Logistic Regression'
model_kpis_rf['model'] = 'Random Forest'
model_kpis_dt['model'] = 'Decision Tree'
model_kpis_cat['model'] = 'Catboost'

In [34]:
models_kpis = pd.DataFrame([model_kpis_lg, model_kpis_rf, model_kpis_dt, model_kpis_cat])

In [35]:
models_kpis = models_kpis[['model', "Mean false negative rate", "Mean false positive rate",
                           "Mean ROC-AUC", "Mean PR-AUC", "Mean Precision", "Mean Recall",
                           "Number of folds", "Threshold"]]

In [36]:
models_kpis = models_kpis.sort_values(by='Mean false negative rate', ascending=True)

In [37]:
models_kpis

Unnamed: 0,model,Mean false negative rate,Mean false positive rate,Mean ROC-AUC,Mean PR-AUC,Mean Precision,Mean Recall,Number of folds,Threshold
1,Random Forest,0.17,0.38,0.83,0.7,0.54,0.61,5,0.45
3,Catboost,0.19,0.36,0.83,0.72,0.54,0.6,5,0.45
2,Decision Tree,0.22,0.35,0.82,0.69,0.54,0.59,5,0.45
0,Logistic Regression,0.25,0.39,0.75,0.52,0.53,0.58,5,0.45


In [38]:
models_kpis.to_csv('C:/Users/aksha/employee_attrition_project/data/model_KPIs.csv', index=False)