# 👨‍💼 Employee Attrition Prediction

This notebook trains a model that predicts whether an employee is likely to leave the company (Attrition = Yes). It fits a scikit-learn pipeline (standard scaling for numeric features, one-hot encoding for categorical features, and a Random Forest Classifier) on employee data with features like job satisfaction, income, and department.

In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import warnings
# Silence every Python warning for the rest of the session so that cell output
# stays uncluttered.
# NOTE(review): with no category or module given, this filter also hides
# deprecation and other library warnings — consider narrowing it.
warnings.filterwarnings("ignore")

## 📂 Load Dataset

In [None]:
# Read the employee records from the CSV file in the working directory into
# the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer="employee_data.csv")

# Preview the first five rows (displayed as the cell's output).
df.head(n=5)

## 🧼 Data Cleaning

In [None]:

# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how="any", inplace=True)

# Per-column count of remaining missing values (displayed as the cell's output).
df.isna().sum()


## 🎯 Encode Target Variable

In [None]:
# Show the class distribution before encoding. Only the last expression of a
# cell is displayed automatically, so this intermediate result is printed.
print(df["Attrition"].value_counts())

# Encode the target variable: 1 = Attrition ("Yes"), 0 = Retained ("No").
attrition_codes = {"Yes": 1, "No": 0}
encoded_attrition = df["Attrition"].map(attrition_codes)

# `Series.map` turns every label missing from the mapping into NaN. Fail here
# with a clear message instead of passing NaN targets on to later cells. This
# also catches re-running the cell on a column that is already encoded.
unmapped = encoded_attrition.isna()
if unmapped.any():
    unexpected_labels = df.loc[unmapped, "Attrition"].unique().tolist()
    raise ValueError(
        f"Unexpected Attrition labels (expected 'Yes'/'No'): {unexpected_labels}"
    )

df["Attrition"] = encoded_attrition.astype("int64")

## 🧠 Feature Selection

In [None]:

# Split the frame into the target column and the candidate feature columns.
y = df["Attrition"]
X = df.drop(columns=["Attrition"])

# Columns handled as numeric features (scaled later in preprocessing).
numeric_features = [
    "Age",
    "Education",
    "JobSatisfaction",
    "MonthlyIncome",
    "TotalWorkingYears",
    "YearsAtCompany",
]

# Every remaining column, in its original order, is treated as categorical,
# whatever its dtype happens to be.
categorical_features = X.columns[~X.columns.isin(numeric_features)].tolist()


## 🔀 Train/Test Split

In [None]:

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


## ⚙️ Preprocessing Setup

In [None]:

# Column-wise preprocessing:
#   "num" - standardize the numeric feature columns.
#   "cat" - one-hot encode the categorical feature columns; categories not seen
#           during fitting are ignored at transform time instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)


## 🤖 Model Training

In [None]:

# Chain preprocessing and the classifier so both are fitted together and the
# same transformations are applied again at prediction time.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "classifier",
            RandomForestClassifier(n_estimators=100, random_state=42),
        ),
    ]
)

# Fit on the training split only; the test split stays unseen until evaluation.
pipeline.fit(X_train, y_train)


## 📈 Model Evaluation

In [None]:

# `confusion_matrix` is used below but is not among the names imported at the
# top of the notebook, so it is imported here to avoid a NameError.
from sklearn.metrics import confusion_matrix

# Predict attrition for the held-out test split.
y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Rows are the actual classes and columns the predicted classes. Pinning the
# label order to [0, 1] keeps the matrix 2x2 and aligned with the tick labels
# even if one class is missing from the predictions.
class_names = ["No", "Yes"]  # 0 = retained, 1 = attrition
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


## 💾 Save Model

In [None]:

# Persist the fitted pipeline (preprocessing + classifier) to disk, together
# with the feature column names in their original order, for future use.
joblib.dump(value=pipeline, filename="attrition_model.pkl")
joblib.dump(value=list(X.columns), filename="attrition_features.pkl")
