In [1]:
# !pip install --upgrade --force-reinstall scikit-learn imbalanced-learn
# !uv pip install -q scikit-learn==1.6.1 imblearn --system


In [2]:
import pandas as pd

df = pd.read_csv("/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
print("df.shape: ",df.shape)
print("df.columns: ", df.dtypes)
print("Missing value percentage: ", df.isna().mean().sort_values(ascending=False))
print("Target dist : ", df['Attrition'].value_counts(normalize=True))

df.shape:  (1470, 35)
df.columns:  Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel    

In [4]:
df['AttritionFlag']=df['Attrition'].map({"Yes": 1, "No": 0})

In [5]:
X = df.drop(columns = ['Attrition', 'AttritionFlag','EmployeeNumber','EmployeeCount','Over18','StandardHours'])
y = df['AttritionFlag']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 56)
numeric_features = X.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
print(numeric_features)
print(categorical_features)

['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features),
    ]
)


In [8]:
#build baseline model
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(max_iter=1000,class_weight='balanced'))
])

In [9]:
model.fit(X_train, y_train)

In [10]:
#evaluate
from sklearn.metrics import classification_report, confusion_matrix
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.73      0.82       247
           1       0.34      0.74      0.47        47

    accuracy                           0.73       294
   macro avg       0.64      0.74      0.64       294
weighted avg       0.84      0.73      0.76       294

[[180  67]
 [ 12  35]]


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        class_weight='balanced',
        random_state=56

    ))
])
rf_model.fit(X_train,y_train)

from sklearn.metrics import classification_report,confusion_matrix
y_pred = rf_model.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.85      0.99      0.91       247
           1       0.50      0.06      0.11        47

    accuracy                           0.84       294
   macro avg       0.67      0.53      0.51       294
weighted avg       0.79      0.84      0.78       294

[[244   3]
 [ 44   3]]


In [12]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

#pos/neg for weight imbalance
pos_weight = (len(y_train[y_train == 0]) / len(y_train[y_train == 1]))

xgb_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.9,
        colsample_bytree=0.9,
        scale_pos_weight=pos_weight,
        random_state=56,
        eval_metric='logloss'
    ))
])
xgb_model.fit(X_train,y_train)
from sklearn.metrics import classification_report,confusion_matrix
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.90      0.92      0.91       247
           1       0.55      0.49      0.52        47

    accuracy                           0.85       294
   macro avg       0.73      0.71      0.72       294
weighted avg       0.85      0.85      0.85       294

[[228  19]
 [ 24  23]]


In [13]:
#let us find all employees that have a possiblity of leaving with a lesser threshold
from sklearn.metrics import f1_score
y_prob = xgb_model.predict_proba(X_test)[:,1]
print(y_prob)

[0.0472689  0.9382067  0.06637208 0.12006996 0.05589559 0.03380692
 0.40876472 0.21096168 0.28522938 0.04171396 0.05351887 0.0690535
 0.16133636 0.20083152 0.04367978 0.7673121  0.03830676 0.29252413
 0.00683618 0.16887642 0.021717   0.12752485 0.02244208 0.4812251
 0.0078185  0.00646103 0.7799678  0.03420097 0.5484652  0.33955103
 0.17288955 0.75705093 0.24050458 0.08183341 0.10380613 0.5422693
 0.01460036 0.01748143 0.01975204 0.64938825 0.19000754 0.0148187
 0.04712299 0.01666379 0.01514397 0.41113767 0.04957248 0.23543856
 0.18360436 0.12602992 0.00847016 0.08501711 0.0466707  0.16701064
 0.9092972  0.0257664  0.9002805  0.09055988 0.06455475 0.00730942
 0.09881783 0.25483733 0.2897125  0.3367728  0.06365237 0.78324026
 0.00677259 0.00455685 0.47189328 0.20102601 0.08619338 0.25758854
 0.12634116 0.02987465 0.00615316 0.28939015 0.28254184 0.04443769
 0.8048002  0.00130951 0.04076014 0.83976865 0.00261815 0.1129321
 0.12051503 0.01727488 0.0156345  0.20577165 0.0929786  0.31450427


In [14]:
import numpy as np
thresholds = np.arange(0.1,0.91,0.01)
f1_scores = []
for t in thresholds:
    y_pred_t = (y_prob >=t ).astype(int)
    f1_scores.append(f1_score(y_test,y_pred_t))
best_t = thresholds[np.argmax(f1_scores)]
best_t,max(f1_scores)

(0.48999999999999977, 0.5168539325842697)

In [15]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred_best = (y_prob >= best_t).astype(int)

print("Best threshold:", best_t)
print(classification_report(y_test, y_pred_best))
print(confusion_matrix(y_test, y_pred_best))


Best threshold: 0.48999999999999977
              precision    recall  f1-score   support

           0       0.90      0.92      0.91       247
           1       0.55      0.49      0.52        47

    accuracy                           0.85       294
   macro avg       0.73      0.71      0.72       294
weighted avg       0.85      0.85      0.85       294

[[228  19]
 [ 24  23]]


In [16]:
!pip install imbalanced-learn==0.10.1

Collecting imbalanced-learn==0.10.1
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl.metadata (8.2 kB)
Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.13.0
    Uninstalling imbalanced-learn-0.13.0:
      Successfully uninstalled imbalanced-learn-0.13.0
Successfully installed imbalanced-learn-0.10.1


In [17]:
#SMOTE - oversampling minority class
from imblearn.over_sampling import SMOTENC

categorical_mask = X_train.dtypes == 'object'
categorical_indices = np.where(categorical_mask)[0]

smote = SMOTENC(categorical_features=categorical_indices, random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("BEFORE SMOTE:", y_train.value_counts())
print("AFTER SMOTE:", y_train_res.value_counts())

numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

xgb_model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('clf', XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        random_state=42,
        eval_metric="logloss"
    ))
])

xgb_model.fit(X_train_res, y_train_res)
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


BEFORE SMOTE: AttritionFlag
0    986
1    190
Name: count, dtype: int64
AFTER SMOTE: AttritionFlag
0    986
1    986
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       247
           1       0.54      0.45      0.49        47

    accuracy                           0.85       294
   macro avg       0.72      0.69      0.70       294
weighted avg       0.84      0.85      0.84       294

[[229  18]
 [ 26  21]]


In [18]:
#Inference : IBM HR Attrition data is low signal for actual attrition.
#Likely features for attrition might be personal reasons, burnout & satisfaction.