# IBM HR Analytics – Employee Attrition & Performance
*Generated on 2025-10-07 10:32 (EDA + ML + Advanced)*


In [None]:
import os, glob, warnings
warnings.filterwarnings('ignore')
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, RocCurveDisplay)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import sys, sklearn
# Report interpreter and key library versions alongside the results.
versions = (
    ('Python:', sys.version.split()[0]),
    ('scikit-learn:', sklearn.__version__),
    ('pandas:', pd.__version__),
)
for label, version in versions:
    print(label, version)

## Load dataset

In [None]:
# Filename patterns for the attrition CSV, tried in this order.
search_patterns = [
    '/mnt/data/WA_Fn-UseC_-HR-Employee-Attrition*.csv',
    '/mnt/data/HR-Employee-Attrition*.csv'
]
# Lazily walk the patterns and take the first match of the first pattern
# that matches anything; None when no pattern matches.
first_hits = (hit for pat in search_patterns for hit in glob.glob(pat)[:1])
csv_path = next(first_hits, None)
if not csv_path:
    raise FileNotFoundError('CSV not found in /mnt/data. Make sure the dataset is uploaded.')
df = pd.read_csv(csv_path)
print('Loaded:', os.path.basename(csv_path), 'shape=', df.shape)

## Quick peek & data quality checks

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Print pandas' structural summary of the DataFrame.
df.info()

In [None]:
# Descriptive statistics for every column type, transposed to one row per
# column; only the first 20 columns are shown.
df.describe(include='all').T.head(20)

In [None]:
# Number of rows that are exact duplicates of an earlier row.
print('Duplicates:', df.duplicated().sum())
# Fraction of missing values per column, largest first.
missing = df.isna().mean().sort_values(ascending=False)
# Evaluates to the columns that have missing values (left as the cell's final
# expression), or prints a notice when there are none.
missing[missing>0] if missing.sum()>0 else print('No missing values')

## Ordinal label mappings

In [None]:
# Integer code -> human-readable label for each ordinal field.
education_map = dict(zip(range(1, 6), ['Below College', 'College', 'Bachelor', 'Master', 'Doctor']))
satisfaction_map = dict(enumerate(['Low', 'Medium', 'High', 'Very High'], start=1))
performance_map = dict(enumerate(['Low', 'Good', 'Excellent', 'Outstanding'], start=1))
worklife_map = dict(enumerate(['Bad', 'Good', 'Better', 'Best'], start=1))

# Add a *_Lbl companion column next to each coded column.
df['Education_Lbl'] = df['Education'].map(education_map)
satisfaction_cols = [
    'EnvironmentSatisfaction',
    'JobInvolvement',
    'JobSatisfaction',
    'RelationshipSatisfaction',
]
for col in satisfaction_cols:
    df[f'{col}_Lbl'] = df[col].map(satisfaction_map)
df['PerformanceRating_Lbl'] = df['PerformanceRating'].map(performance_map)
df['WorkLifeBalance_Lbl'] = df['WorkLifeBalance'].map(worklife_map)

# Binary target: 1 when Attrition reads "yes" (case and surrounding
# whitespace ignored), otherwise 0.
attrition_text = df['Attrition'].astype(str).str.strip().str.lower()
df['AttritionFlag'] = attrition_text.eq('yes').astype(int)

# Show the label columns together with the raw and encoded target.
df.filter(regex='(_Lbl|Attrition|AttritionFlag)$').head()

## Helper: Attrition rate by any feature

In [None]:
def attrition_rate_by(df, col):
    """Summarise attrition for each distinct value of ``col``.

    Args:
        df: DataFrame holding ``col`` and an ``AttritionFlag`` column.
        col: Grouping key passed straight to ``DataFrame.groupby``.

    Returns:
        DataFrame indexed by group with two columns: ``AttritionRate`` (mean
        of ``AttritionFlag`` as a percentage, rounded to two decimals) and
        ``Count`` (non-null flag values in the group), sorted by
        ``AttritionRate`` in descending order.
    """
    summary = df.groupby(col)['AttritionFlag'].agg(['mean', 'count'])
    summary.columns = ['AttritionRate', 'Count']
    summary['AttritionRate'] = summary['AttritionRate'].mul(100).round(2)
    return summary.sort_values(by='AttritionRate', ascending=False)

## Overall attrition

In [None]:
# Mean of AttritionFlag scaled to a percentage.
overall_attrition = df['AttritionFlag'].mean()*100
print(f'Attrition rate: {overall_attrition:.2f}%')

In [None]:
# Bar chart of how often each raw Attrition label occurs.
vals = df['Attrition'].value_counts()
plt.figure(figsize=(5,4))
plt.bar(vals.index, vals.values)
plt.title('Attrition Distribution')
plt.xlabel('Attrition')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

## Tenure & demographics

In [None]:
# Mean of YearsAtCompany, rounded to two decimals.
print('Average tenure (YearsAtCompany):', round(df['YearsAtCompany'].mean(),2))

In [None]:
# Histogram of tenure across all employees.
plt.figure(figsize=(6,4))
plt.hist(df['YearsAtCompany'], bins=20)
plt.title('Distribution: YearsAtCompany')
plt.xlabel('Years')
plt.ylabel('Employees')
plt.tight_layout()
plt.show()

In [None]:
# Histogram of employee age.
plt.figure(figsize=(6,4))
plt.hist(df['Age'], bins=20)
plt.title('Distribution: Age')
plt.xlabel('Age')
plt.ylabel('Employees')
plt.tight_layout()
plt.show()

In [None]:
# Head count per Gender value.
plt.figure(figsize=(6,4))
g = df['Gender'].value_counts()
plt.bar(g.index, g.values)
plt.title('Distribution: Gender')
plt.xlabel('Gender')
plt.ylabel('Employees')
plt.tight_layout()
plt.show()

In [None]:
# Head count per Department; tick labels are tilted slightly.
plt.figure(figsize=(7,4))
d = df['Department'].value_counts()
plt.bar(d.index, d.values)
plt.title('Distribution: Department')
plt.xlabel('Department')
plt.ylabel('Employees')
plt.xticks(rotation=15)
plt.tight_layout()
plt.show()

## Attrition by Age

In [None]:
# Two-year-wide bin edges shared by both groups so the overlaid histograms
# line up.
bins = np.arange(df['Age'].min(), df['Age'].max()+2, 2)
plt.figure(figsize=(7,4))
for flag, label in ((0, 'No'), (1, 'Yes')):
    ages = df.loc[df['AttritionFlag']==flag, 'Age']
    plt.hist(ages, bins=bins, alpha=0.6, label=label)
plt.title('Age distribution by Attrition')
plt.xlabel('Age')
plt.ylabel('Employees')
plt.legend()
plt.tight_layout()
plt.show()

## Attrition rate by key features

In [None]:
# Features to break attrition down by: a table plus a bar chart for each.
key_features = [
    'Gender',
    'Department',
    'BusinessTravel',
    'JobRole',
    'Education_Lbl',
    'JobSatisfaction_Lbl',
    'WorkLifeBalance_Lbl',
    'StockOptionLevel',
]
for col in key_features:
    rates = attrition_rate_by(df, col)
    print(f'\n=== {col} ===')
    display(rates.head(10))
    # rates is already sorted by rate descending, so the first ten entries
    # are the ten highest-attrition groups.
    plt.figure(figsize=(7,4))
    x = rates.index.astype(str)[:10]
    y = rates['AttritionRate'].values[:10]
    plt.bar(x,y)
    plt.title(f'Attrition Rate by {col} (Top 10)')
    plt.ylabel('Attrition Rate (%)')
    plt.xticks(rotation=20, ha='right')
    plt.tight_layout()
    plt.show()

## MonthlyIncome vs Attrition

In [None]:
# MonthlyIncome split into stayers (flag 0) and leavers (flag 1), in that order.
vals = [df.loc[df['AttritionFlag']==flag, 'MonthlyIncome'] for flag in (0, 1)]
plt.figure(figsize=(7,4))
# NOTE(review): newer Matplotlib releases appear to rename `labels` to
# `tick_labels` for boxplot; confirm against the installed version.
plt.boxplot(vals, labels=['No','Yes'])
plt.title('MonthlyIncome by Attrition')
plt.ylabel('MonthlyIncome')
plt.tight_layout()
plt.show()

## Correlation (numeric features)

In [None]:
# Pairwise correlations between all numeric columns, drawn as a heatmap.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
corr = df[num_cols].corr()
plt.figure(figsize=(10,8))
im = plt.imshow(corr, aspect='auto')
plt.colorbar(im, fraction=0.046, pad=0.04)
# One tick per numeric column on both axes.
tick_positions = range(len(num_cols))
plt.xticks(tick_positions, num_cols, rotation=90)
plt.yticks(tick_positions, num_cols)
plt.title('Correlation matrix (numeric)')
plt.tight_layout()
plt.show()

# Baseline ML Models
## Train/Test split & preprocessing

In [None]:
target = 'AttritionFlag'
# AttritionFlag is derived directly from the raw 'Attrition' Yes/No column, so
# that column must not be a feature: one-hot encoding it would hand the model
# the answer (target leakage) and would also make the saved model require an
# 'Attrition' column at inference time.
leak_cols = [c for c in ['Attrition'] if c in df.columns]
X = df.drop(columns=[target] + leak_cols)
y = df[target]

# Categorical features: object-dtype columns plus any *_Lbl label columns.
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
cat_cols = sorted(set(cat_cols + [c for c in X.columns if c.endswith('_Lbl')]))
# Every remaining non-object column is treated as numeric.
num_cols = [c for c in X.columns if c not in cat_cols and X[c].dtype != 'O']
# A *_Lbl column whose coded source column is already among the numeric
# features adds no information, so drop it from the categorical set.
redundant = [c for c in cat_cols if c.replace('_Lbl','') in num_cols]
cat_cols = [c for c in cat_cols if c not in redundant]
X = X[cat_cols + num_cols]

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print('Train/Test:', X_train.shape, X_test.shape)

## Pipelines: Logistic Regression & Random Forest

In [None]:
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# One-hot encode categoricals (categories unseen at fit time are ignored at
# transform time) and standardise numeric columns.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
numeric_transformer = StandardScaler()

# Unfitted preprocessing template; any column not listed is dropped.
preprocess = ColumnTransformer([('cat', categorical_transformer, cat_cols),
                                ('num', numeric_transformer, num_cols)], remainder='drop')

# Pipeline.fit fits its steps in place, so give each pipeline its own copy of
# the preprocessor instead of sharing one mutable instance between models.
log_reg = Pipeline([('prep', clone(preprocess)),
                    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))])
rf = Pipeline([('prep', clone(preprocess)),
               ('clf', RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced_subsample'))])

# Fit each baseline on the training split and report hold-out metrics.
for name, model in [('LogisticRegression', log_reg), ('RandomForest', rf)]:
    print('\nFitting:', name)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # Positive-class probability, when the estimator exposes one.
    proba = model.predict_proba(X_test)[:,1] if hasattr(model, 'predict_proba') else None
    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred, zero_division=0)
    rec = recall_score(y_test, pred, zero_division=0)
    f1 = f1_score(y_test, pred, zero_division=0)
    roc = roc_auc_score(y_test, proba) if proba is not None else np.nan
    print(f'[{name}] Acc={acc:.3f}  Prec={prec:.3f}  Rec={rec:.3f}  F1={f1:.3f}  ROC-AUC={roc:.3f}')
    print('Confusion Matrix:\n', confusion_matrix(y_test, pred))

## ROC Curve (Random Forest)

In [None]:
# ROC curve of the fitted random-forest pipeline on the hold-out split.
RocCurveDisplay.from_estimator(rf, X_test, y_test)
plt.title('ROC Curve – Random Forest')
plt.tight_layout()
plt.show()

## Feature Importances (Random Forest)

In [None]:
# Rebuild the post-encoding feature names in the order the ColumnTransformer
# lists its transformers: one-hot categorical columns first, then numeric ones.
ohe = rf.named_steps['prep'].named_transformers_['cat']
feature_names = [*ohe.get_feature_names_out(cat_cols), *num_cols]
importances = rf.named_steps['clf'].feature_importances_
# Pair each feature with its importance and rank from most to least important.
fi = pd.DataFrame({'feature': feature_names, 'importance': importances})
fi = fi.sort_values('importance', ascending=False).reset_index(drop=True)
fi.head(20)

In [None]:
# Horizontal bar chart of the highest-ranked features.
topn = 20
plt.figure(figsize=(8,6))
# Both series are reversed so the highest-ranked feature is passed last.
plt.barh(fi['feature'][:topn][::-1], fi['importance'][:topn][::-1])
plt.title('Top Feature Importances (RF)')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

## Cross‑validation (StratifiedKFold, ROC‑AUC)

In [None]:
# Shuffled, seeded 5-fold stratified CV of the random-forest pipeline on the
# full feature matrix, scored by ROC-AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(rf, X, y, scoring='roc_auc', cv=cv)
mean_score = scores.mean().round(3)
print('RandomForest ROC-AUC (5-fold):', scores.round(3), '| mean=', mean_score)

## Hyperparameter Tuning – Random Forest (RandomizedSearchCV)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

# Search space for the forest; keys carry the pipeline's `clf__` step prefix.
param_dist = {
    'clf__n_estimators': randint(200, 600),
    'clf__max_depth': randint(3, 20),
    'clf__min_samples_split': randint(2, 20),
    'clf__min_samples_leaf': randint(1, 10),
    'clf__max_features': ['sqrt', 'log2', None],
    'clf__bootstrap': [True, False],
}

# Pipeline to tune: shared preprocessing definition plus a fresh forest.
forest = RandomForestClassifier(random_state=42, class_weight='balanced_subsample')
rf_search = Pipeline([('prep', preprocess), ('clf', forest)])

# 25 random draws from the space, each scored by 5-fold CV ROC-AUC on the
# training split only.
search = RandomizedSearchCV(
    rf_search,
    param_distributions=param_dist,
    n_iter=25,
    cv=5,
    scoring='roc_auc',
    random_state=42,
    verbose=1,
)
search.fit(X_train, y_train)
best_rf = search.best_estimator_
print('Best ROC-AUC (CV):', search.best_score_)
print('Best Params:', search.best_params_)

# Evaluate the selected pipeline on the hold-out split.
pred = best_rf.predict(X_test)
proba = best_rf.predict_proba(X_test)[:,1]
print(classification_report(y_test, pred))
ConfusionMatrixDisplay.from_predictions(y_test, pred)
plt.title('Confusion Matrix – Tuned RF')
plt.tight_layout()
plt.show()

## Threshold Tuning (maximize F1)

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_curve, confusion_matrix, classification_report
# Positive-class probabilities from the tuned pipeline.
proba = best_rf.predict_proba(X_test)[:,1]
prec, rec, th = precision_recall_curve(y_test, proba)
# precision/recall carry one extra trailing point that has no matching entry
# in `th`. Score only the points that do, so the argmax below is always a
# valid index into `th` (otherwise th[ix] can raise IndexError).
# The 1e-9 term avoids division by zero when precision and recall are both 0.
f1s = 2*prec[:-1]*rec[:-1]/(prec[:-1]+rec[:-1] + 1e-9)
ix = np.nanargmax(f1s)
best_thr = th[ix]
print(f'Best threshold≈ {best_thr:.3f}  F1={f1s[ix]:.3f}  P={prec[ix]:.3f}  R={rec[ix]:.3f}')
# NOTE(review): the threshold is chosen on the same split it is then evaluated
# on, so the metrics below are optimistic; consider tuning on held-out
# training folds instead.
pred_thr = (proba >= best_thr).astype(int)
print('Confusion matrix@best-threshold:\n', confusion_matrix(y_test, pred_thr))
print(classification_report(y_test, pred_thr))

## Probability Calibration (Isotonic)

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
# Wrap the tuned pipeline in isotonic calibration (cv=5), fitted on the
# training split only.
cal_rf = CalibratedClassifierCV(best_rf, method='isotonic', cv=5)
cal_rf.fit(X_train, y_train)
# Calibrated positive-class probabilities on the hold-out split.
proba_cal = cal_rf.predict_proba(X_test)[:,1]
auc_calibrated = roc_auc_score(y_test, proba_cal)
print('ROC-AUC (calibrated):', auc_calibrated)

## Save Trained Pipeline + Inference Helper

In [None]:
import joblib
# Persist the tuned pipeline (preprocessing + classifier) to disk so it can be
# reloaded later for inference.
artifact_path = '/mnt/data/hr_attrition_best_rf.joblib'
joblib.dump(best_rf, artifact_path)
print('Saved:', artifact_path)
def load_model(path=artifact_path):
    """Load a persisted model from ``path`` using joblib.

    Args:
        path: Location of the joblib artifact. Defaults to the value
            ``artifact_path`` had when this function was defined.

    Returns:
        The object joblib deserialises from the file.
    """
    # NOTE(review): joblib artifacts are pickle-based; only load files that
    # come from a trusted source.
    model = joblib.load(path)
    return model
def predict_attrition(df_like, model_path=artifact_path, threshold=0.5):
    """Score rows with the saved model and apply a decision threshold.

    Args:
        df_like: Input passed unchanged to the model's ``predict_proba``.
        model_path: Path of the joblib artifact to load for this call.
        threshold: Probability at or above which a row is labelled 1.

    Returns:
        Tuple ``(preds, proba)``: integer 0/1 labels and the second
        ``predict_proba`` column they were derived from.
    """
    positive_proba = load_model(model_path).predict_proba(df_like)[:, 1]
    labels = (positive_proba >= threshold).astype(int)
    return labels, positive_proba

## Streamlit App Stub

In [None]:
# Source text of a minimal Streamlit app, written to disk verbatim below.
# The app loads "hr_attrition_best_rf.joblib" by relative path, scores an
# uploaded CSV with the model's predict_proba, applies the slider threshold,
# and offers the scored rows for download.
# NOTE(review): the relative model path presumably requires launching the app
# from the directory that holds the artifact; confirm how it is run.
app_code = '''import streamlit as st
import pandas as pd
import joblib

st.title("HR Attrition Predictor (Demo)")

@st.cache_resource
def load_model():
    return joblib.load("hr_attrition_best_rf.joblib")

model = load_model()

file = st.file_uploader("Upload CSV", type=["csv"])
thr = st.slider("Decision threshold", 0.0, 1.0, 0.5, 0.01)

if file is not None:
    df = pd.read_csv(file)
    proba = model.predict_proba(df)[:,1]
    preds = (proba >= thr).astype(int)
    out = df.copy()
    out["Attrition_Proba"] = proba
    out["Attrition_Pred"] = preds
    st.write(out.head())
    st.download_button("Download predictions", out.to_csv(index=False).encode("utf-8"), "predictions.csv", "text/csv")
'''
from pathlib import Path
# Write the app source next to the saved model artifact.
Path('/mnt/data/app.py').write_text(app_code, encoding='utf-8')
print('Wrote /mnt/data/app.py')