# 🎓 Capstone Project: Employee Attrition Prediction
Using Python, scikit-learn, and Streamlit

In [None]:
# 📌 Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pickle

In [None]:
# 📌 Load Dataset
# Read the CSV into the working DataFrame `df`, then preview its first rows
# as the cell output.
DATA_FILE = 'small_hr_dataset.csv'
df = pd.read_csv(DATA_FILE)
df.head()

In [None]:
# 📌 Encode Categorical Variables
# Turns the textual columns listed below into integers, in place on `df`:
#   * the target 'Attrition' becomes 1 ('Yes') / 0 ('No');
#   * every other listed column is label-encoded with its own LabelEncoder,
#     kept in `label_encoders` so the mapping can be reused or inverted later.
categorical_cols = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'MaritalStatus', 'OverTime', 'Attrition']
target_col = 'Attrition'

# Map the target only while it is still non-numeric. Re-running this cell on
# an already-encoded column would otherwise map 0/1 to NaN.
if not pd.api.types.is_numeric_dtype(df[target_col]):
    attrition_codes = df[target_col].map({'Yes': 1, 'No': 0})
    if attrition_codes.isna().any():
        # Fail here with a clear message instead of passing NaN targets downstream.
        unexpected = df.loc[attrition_codes.isna(), target_col].unique().tolist()
        raise ValueError(f"Unexpected values in '{target_col}': {unexpected}")
    df[target_col] = attrition_codes.astype(int)

# One encoder per feature column. A single shared encoder would be re-fitted
# on each column and only remember the last column's classes.
# Select by name rather than by list position so reordering the list is safe.
label_encoders = {}
for col in (c for c in categorical_cols if c != target_col):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
df.head()

In [None]:
# 📌 EDA
# Three quick looks at the data: target class balance, pairwise feature
# correlations, and income distribution split by the target.

# Class balance of the target.
sns.countplot(x='Attrition', data=df)
plt.title('Attrition Count')
plt.show()

# Correlation heatmap. Restrict to numeric columns explicitly: DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0, and any text column that was
# not encoded above would otherwise break this cell.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(10,8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Monthly income distribution for each target class.
sns.boxplot(x='Attrition', y='MonthlyIncome', data=df)
plt.title('Monthly Income vs Attrition')
plt.show()

In [None]:
# 📌 Scale Numerical Features
num_cols = ['Age', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction',
            'JobSatisfaction', 'MonthlyIncome', 'NumCompaniesWorked',
            'PercentSalaryHike', 'PerformanceRating', 'TotalWorkingYears',
            'YearsAtCompany']
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [None]:
# 📌 Train-Test Split
X = df.drop('Attrition', axis=1)
y = df['Attrition']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# 📌 Model Training
# Build both classifiers, fit each on the training split, then predict
# labels for the test split.

# Logistic Regression (iteration cap raised to 1000)
lr = LogisticRegression(max_iter=1000)
# Random Forest (seeded so results are repeatable)
rf = RandomForestClassifier(random_state=42)

for estimator in (lr, rf):
    estimator.fit(X_train, y_train)

y_pred_lr, y_pred_rf = lr.predict(X_test), rf.predict(X_test)

In [None]:
# 📌 Evaluation
# Score both fitted models on the held-out test split: the per-class
# classification report, overall accuracy, and ROC-AUC. ROC-AUC uses column 1
# of predict_proba, i.e. the probability of class 1 ('Yes' in the target
# mapping applied earlier).
# Line breaks in the printed headings are written as '\n' escapes; a literal
# newline inside a single-quoted string is a SyntaxError.
print('Logistic Regression:\n', classification_report(y_test, y_pred_lr))
print('Accuracy:', accuracy_score(y_test, y_pred_lr))
print('ROC-AUC:', roc_auc_score(y_test, lr.predict_proba(X_test)[:,1]))
print('\nRandom Forest:\n', classification_report(y_test, y_pred_rf))
print('Accuracy:', accuracy_score(y_test, y_pred_rf))
print('ROC-AUC:', roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]))
# Confusion Matrix
# Computed with actual labels first and predictions second, so rows are the
# actual classes and columns the predicted ones, matching the axis labels.
cm = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Random Forest Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# 📌 Feature Importance
# Label the random forest's importance scores with the feature column names,
# then draw the ten largest as a horizontal bar chart.
feat_imp = pd.Series(data=rf.feature_importances_, index=X.columns)
top_features = feat_imp.nlargest(10)
top_features.plot.barh()
plt.title('Top 10 Feature Importances - Random Forest')
plt.show()

In [None]:
# 📌 Save Model and Scaler
# Pickle the fitted random forest and the fitted scaler to the working
# directory. `pickle` is already imported in the notebook's import cell, so
# it is not re-imported here.
# NOTE(review): only load these .pkl files from a trusted location; unpickling
# untrusted data can execute arbitrary code.
with open('attrition_rf_model.pkl', 'wb') as f:
    pickle.dump(rf, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print('Model and scaler saved for Streamlit deployment.')