# **Employee Leave Prediction Using Machine Learning Classification Models**

# Introduction



In [None]:
# This dataset contains information about employees in a company.
# The goal is to build machine learning classification models to predict whether an employee is likely to take a leave or not.
# This assignment will provide us with hands-on experience in data preprocessing, model selection, and performance evaluation using various metrics.


# Dataset Description

In [None]:
# The dataset contains the following columns:

# Education:
# Joining Year:
# City:
# Payment Tier:
# Age:
# Gender:
# Ever Benched:
# Experience in Current Domain:
# Leave or Not (target column):

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Load the employee records from the Colab working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/Employee.csv')

In [None]:
df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1
...,...,...,...,...,...,...,...,...,...
4648,Bachelors,2013,Bangalore,3,26,Female,No,4,0
4649,Masters,2013,Pune,2,37,Male,No,2,1
4650,Masters,2018,New Delhi,3,27,Male,No,5,1
4651,Bachelors,2012,Bangalore,3,30,Male,Yes,2,0


# Data Preprocessing

In [None]:
# a. Handling missing values:
# Remove, in place, every row that contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1
...,...,...,...,...,...,...,...,...,...
4648,Bachelors,2013,Bangalore,3,26,Female,No,4,0
4649,Masters,2013,Pune,2,37,Male,No,2,1
4650,Masters,2018,New Delhi,3,27,Male,No,5,1
4651,Bachelors,2012,Bangalore,3,30,Male,Yes,2,0


In [None]:
# Start the encoded frame as an independent (deep) copy of the cleaned data.
df_encoded = df.copy(deep=True)

In [None]:
# b. Encode categorical variables
import pandas as pd

# Every feature column is treated as categorical here, including the numeric
# ones (JoiningYear, PaymentTier, Age, ExperienceInCurrentDomain), so each
# distinct value becomes its own indicator column.
categorical_columns = ['Education', 'JoiningYear', 'City', 'PaymentTier', 'Age', 'Gender', 'EverBenched', 'ExperienceInCurrentDomain']

# One-hot encode the listed columns. drop_first=True omits the first level of
# each column so the indicators are not redundant. dtype=int pins the
# indicators to 0/1 integers: pandas >= 2.0 would otherwise emit booleans,
# which is not what the models trained below are shown receiving.
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True, dtype=int)

In [None]:
df_encoded

Unnamed: 0,LeaveOrNot,Education_Masters,Education_PHD,JoiningYear_2013,JoiningYear_2014,JoiningYear_2015,JoiningYear_2016,JoiningYear_2017,JoiningYear_2018,City_New Delhi,...,Age_41,Gender_Male,EverBenched_Yes,ExperienceInCurrentDomain_1,ExperienceInCurrentDomain_2,ExperienceInCurrentDomain_3,ExperienceInCurrentDomain_4,ExperienceInCurrentDomain_5,ExperienceInCurrentDomain_6,ExperienceInCurrentDomain_7
0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,1,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,1,1,0,0,0,0,0,1,0,0,...,0,1,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4648,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4649,1,1,0,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
4650,1,1,0,0,0,0,0,0,1,1,...,0,1,0,0,0,0,0,1,0,0
4651,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,0,0,0,0


In [None]:
# c. Separate the encoded dataset into the feature matrix (X) and the target (y).
# The target is the 'LeaveOrNot' column; everything else is a feature.
y_encoded = df_encoded['LeaveOrNot']
X_encoded = df_encoded.drop(columns=['LeaveOrNot'])



# Model Selection

In [None]:
# Choose three different classification models:
# Logistic Regression
# K-Nearest Neighbors (K-NN)
# Artificial Neural Network (ANN)

# Model Training, Evaluation, and Accuracy

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


SPLITTING THE DATASET INTO TRAIN AND TEST SETS

In [None]:
# Hold out 30% of the encoded rows for testing; the fixed random_state keeps
# the split reproducible between runs.
(
    X_train_encoded,
    X_test_encoded,
    y_train_encoded,
    y_test_encoded,
) = train_test_split(X_encoded, y_encoded, random_state=42, test_size=0.3)

3 MODELS: LOGISTIC REGRESSION, K-NN, ANN

In [None]:
# Model 1: Logistic Regression
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train_encoded, y_train_encoded)


# Model 2: K-Nearest Neighbors (K-NN), classifying from the 3 nearest neighbours
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train_encoded, y_train_encoded)

# Model 3: Artificial Neural Network (ANN)
# Two hidden ReLU layers (64 and 32 units) feeding a single sigmoid unit that
# outputs the probability of class 1. The input width follows the number of
# encoded feature columns.
# Note: You might need to adjust the architecture based on your data
ann_model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_encoded.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # Assuming binary classification, adjust for multiclass
])

ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the 0/1 target as-is. The earlier `(y > y.median())` thresholding
# only reproduced the labels while class 0 was the majority (median 0); with a
# majority of class 1 the median would be 1 and every label would become 0.
# validation_split=0.2 reserves the last 20% of the training rows for validation.
ann_model.fit(X_train_encoded, y_train_encoded, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x796f2a6950f0>

EVALUATION OF 3 MODELS

In [None]:

# Class predictions of each fitted model on the held-out test features.

# Logistic Regression: predicted class labels
logistic_predictions = logistic_model.predict(X_test_encoded)

# K-NN: predicted class labels
knn_predictions = knn_model.predict(X_test_encoded)

# ANN: the network outputs sigmoid probabilities, so values above 0.5 are
# mapped to class 1 and everything else to class 0.
ann_predictions = np.where(ann_model.predict(X_test_encoded) > 0.5, 1, 0)




ACCURACY, CONFUSION MATRIX, CLASSIFICATION REPORT OF 3 MODELS

In [None]:
# Print accuracy, confusion matrix and classification report for each model,
# all measured against the same held-out test labels.
for report_position, (report_name, report_predictions) in enumerate([
    ("Logistic Regression", logistic_predictions),
    ("K-NN", knn_predictions),
    ("ANN", ann_predictions),
]):
    # Every model after the first is separated from the previous one by a blank line.
    report_lead = "\n" if report_position else ""
    print(f"{report_lead}{report_name} Accuracy:", accuracy_score(y_test_encoded, report_predictions))
    print(f"{report_name} Confusion Matrix:\n", confusion_matrix(y_test_encoded, report_predictions))
    print(f"{report_name} Classification Report:\n", classification_report(y_test_encoded, report_predictions))

Logistic Regression Accuracy: 0.8223495702005731
Logistic Regression Confusion Matrix:
 [[858  62]
 [186 290]]
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.93      0.87       920
           1       0.82      0.61      0.70       476

    accuracy                           0.82      1396
   macro avg       0.82      0.77      0.79      1396
weighted avg       0.82      0.82      0.81      1396


K-NN Accuracy: 0.8094555873925502
K-NN Confusion Matrix:
 [[813 107]
 [159 317]]
K-NN Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86       920
           1       0.75      0.67      0.70       476

    accuracy                           0.81      1396
   macro avg       0.79      0.77      0.78      1396
weighted avg       0.81      0.81      0.81      1396


ANN Accuracy: 0.8631805157593123
ANN Confusion Matrix:
 [[862  58]
 [133 343]]


# **Model Comparison**


In [None]:

# Based on the provided results, let's analyze the performance of the three models:

# Logistic Regression:
# Accuracy: 82.2%
# Strengths:
# Achieved a good balance between precision and recall for class 0 (Not Leave); recall for class 1 (Leave) is noticeably lower (0.61) than its precision (0.82).
# Provides interpretable coefficients, allowing for feature importance analysis.
# Weaknesses:
# F1-score for class 1 (Leave) is relatively lower compared to the other models, indicating a potential challenge in correctly identifying instances of class 1.

# K-Nearest Neighbors (K-NN):
# Accuracy: 80.9%
# Strengths:
# High recall for class 0 (Not Leave), indicating good performance in identifying instances of class 0.
# Simple and easy to understand.
# Weaknesses:
# Precision and F1-score for class 1 (Leave) are lower compared to the other models, indicating challenges in correctly identifying instances of class 1.

# Artificial Neural Network (ANN):
# Accuracy: 86.3%
# Strengths:
# Highest accuracy among the three models.
# Achieved a good balance between precision and recall for both classes.
# Capable of capturing complex patterns in the data.
# Weaknesses:
# The model may be seen as a "black box," making it challenging to interpret the importance of individual features.

# Overall Comparison:
# The ANN model outperformed both Logistic Regression and K-NN in terms of accuracy.
# Logistic Regression showed competitive performance but struggled with F1-score for class 1.
# K-NN had challenges in precision and F1-score for class 1, indicating potential misclassifications.

# Considerations:
# The choice of the best model depends on the specific goals and requirements of the problem.
# Logistic Regression might be preferred when interpretability and feature importance analysis are crucial.
# K-NN could be a simple and interpretable model for certain scenarios.
# ANN, despite its "black box" nature, can be a powerful tool for capturing complex relationships in the data.

# Model Selection Techniques:

LOGISTIC REGRESSION CROSS-VALIDATION

In [None]:
# Assess the models with K-fold cross-validation (10 folds) on the training data.

from sklearn.model_selection import cross_val_score

# Logistic Regression: one score per fold.
logistic_cross_val_scores = cross_val_score(
    estimator=logistic_model,
    X=X_train_encoded,
    y=y_train_encoded,
    cv=10,
)

# Show the ten per-fold scores.
print("Logistic Regression Cross-Validation Scores:", logistic_cross_val_scores)

Logistic Regression Cross-Validation Scores: [0.79141104 0.8006135  0.8006135  0.79754601 0.8006135  0.7791411
 0.8190184  0.82769231 0.8        0.8       ]


KNN CROSS-VALIDATION

In [None]:
from sklearn.model_selection import cross_val_score

# K-NN: 10-fold cross-validation on the training data, one score per fold.
knn_cross_val_scores = cross_val_score(
    estimator=knn_model,
    X=X_train_encoded,
    y=y_train_encoded,
    cv=10,
)
print("K-NN Cross-Validation Scores:", knn_cross_val_scores)

K-NN Cross-Validation Scores: [0.7607362  0.7607362  0.7607362  0.80368098 0.7791411  0.75766871
 0.8190184  0.79384615 0.79384615 0.77846154]


ANN CROSS-VALIDATION

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Uses X_train_encoded / y_train_encoded as the training features and labels.

# Neural network for cross-validation: a scikit-learn MLP with a single hidden
# layer of 100 units.
# NOTE(review): this rebinds `ann_model`, which previously held the Keras
# network; re-running the earlier prediction cell after this one would use
# this MLPClassifier instead.
ann_model = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
)

# Standardize the features before they reach the MLP; inside the pipeline the
# scaler is refit on each training fold.
ann_pipeline = make_pipeline(StandardScaler(), ann_model)

# 10-fold cross-validation of the scaler + MLP pipeline.
ann_cross_val_scores = cross_val_score(
    estimator=ann_pipeline,
    X=X_train_encoded,
    y=y_train_encoded,
    cv=10,
)

# Report the per-fold scores and their mean.
print("ANN Cross-Validation Scores:", ann_cross_val_scores)
print("Mean Accuracy:", np.mean(ann_cross_val_scores))


ANN Cross-Validation Scores: [0.77607362 0.80368098 0.78220859 0.78834356 0.79141104 0.79447853
 0.81288344 0.81538462 0.77846154 0.80923077]
Mean Accuracy: 0.7952156677678149


# Conclusion

In [None]:

# Model Performance:

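# Among the three models (Logistic Regression, K-NN, and ANN), the Keras Artificial Neural Network (ANN) achieved the highest accuracy on the test set (about 86%). In 10-fold cross-validation, where the neural network was a scikit-learn MLPClassifier rather than the Keras model, Logistic Regression had the highest mean accuracy (about 0.80, versus about 0.795 for the MLP and about 0.78 for K-NN).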
# ANN achieved a good balance between precision and recall for both classes, making it a robust model for binary classification in this context.
# Strengths and Weaknesses:

# 1. Logistic Regression:

# Strengths: Interpretable coefficients, good balance between precision and recall for class 0.
# Weaknesses: F1-score for class 1 is relatively lower.

# 2. K-Nearest Neighbors (K-NN):

# Strengths: Simple and easy to understand, high recall for class 0.
# Weaknesses: Lower precision and F1-score for class 1.

# 3. Artificial Neural Network (ANN):

# Strengths: Highest accuracy, good balance between precision and recall for both classes, captures complex patterns.
# Weaknesses: Interpretability can be a challenge.

# Real-world Applications:

# 1. HR and Workforce Management:
# Employee Attrition Prediction:
# Predicting whether an employee is likely to leave can help HR departments proactively address retention strategies.
# Identify key factors influencing attrition and take preventive measures.

# 2. Talent Acquisition:
# Predictive models can assist in identifying potential candidates likely to succeed in a given role.
# Streamline recruitment processes and focus on candidates with a higher probability of long-term success.

# 3. Workforce Planning:
# Forecasting future workforce needs based on historical data can aid in strategic workforce planning.
# Optimize staffing levels, training programs, and skill development initiatives.

# 4. Employee Performance Prediction:
# Predicting employee performance can inform decisions related to promotions, training, and performance improvement plans.
# Tailor development programs to individual needs, improving overall team productivity.

# Recommendations:

# 1. Further Model Tuning:
# Fine-tune hyperparameters for each model, especially K-NN, to potentially improve performance.
# 2. Feature Importance Analysis:
# Conduct a detailed analysis of feature importance, especially using Logistic Regression, to identify key factors influencing predictions.
# 3. Interpretability vs. Complexity:
# Consider the trade-off between model interpretability and complexity when choosing the final model for deployment.