# **Predicting Employee Attrition Using Machine Learning**

## **1. Import Necessary Libraries**

In [1]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


## **2. Load Dataset**

In [2]:

# Load Dataset
df = pd.read_csv('/content/WA_Fn-UseC_-HR-Employee-Attrition.csv.xls')
df.head()


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


## **3. Data Preprocessing**
### **3.1 Encode Categorical Variables**

In [3]:

# Encode categorical variables
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


## **4. Feature Engineering**

In [4]:

# Feature Engineering: Create two new features
df['TotalWorkingYearsPerJob'] = df['TotalWorkingYears'] / (df['NumCompaniesWorked'] + 1)
df['SatisfactionLevel'] = (df['JobSatisfaction'] + df['EnvironmentSatisfaction']) / 2


## **5. Define Features and Target Variable**

In [5]:

# Define features and target
X = df.drop(columns=['Attrition'])
y = df['Attrition']


## **6. Train-Test Split**

In [6]:

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


## **7. Standardization**

In [7]:

# Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


## **8. Model Selection and Hyperparameter Tuning**

In [10]:
models = {
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression()
}

param_grid = {
    "RandomForest": {"n_estimators": [50, 100, 150], "max_depth": [5, 10, 15], "min_samples_split": [2, 5, 10]},
    "LogisticRegression": {"C": [0.1, 1, 10], "solver": ['liblinear', 'lbfgs', 'saga'], "max_iter": [100, 200, 300]}
}

best_models = {}
model_performance = {}

for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[model_name], cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_

    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

    # Evaluate Model
    y_pred = best_models[model_name].predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    model_performance[model_name] = acc
    print(f"Classification Report for {model_name}:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {acc}\n")

Best parameters for RandomForest: {'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 50}
Classification Report for RandomForest:
              precision    recall  f1-score   support

           0       0.85      0.98      0.91       247
           1       0.50      0.09      0.15        47

    accuracy                           0.84       294
   macro avg       0.67      0.53      0.53       294
weighted avg       0.79      0.84      0.79       294

Accuracy: 0.8401360544217688





Best parameters for LogisticRegression: {'C': 0.1, 'max_iter': 100, 'solver': 'lbfgs'}
Classification Report for LogisticRegression:
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       247
           1       0.75      0.32      0.45        47

    accuracy                           0.87       294
   macro avg       0.82      0.65      0.69       294
weighted avg       0.86      0.87      0.85       294

Accuracy: 0.8741496598639455



## **9. Conclusion and Recommendation**

In [11]:

# Conclusion: Model Comparison and Recommendation
best_model_name = max(model_performance, key=model_performance.get)
print(f"Best performing model: {best_model_name} with accuracy {model_performance[best_model_name]:.4f}")

# Recommendation
if best_model_name == "RandomForest":
    print("Recommendation: Use Random Forest for better predictive power and feature importance analysis.")
else:
    print("Recommendation: Use Logistic Regression for better interpretability.")


Best performing model: LogisticRegression with accuracy 0.8741
Recommendation: Use Logistic Regression for better interpretability.
