In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_score, recall_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
from sklearn.cluster import KMeans

from imblearn.over_sampling import SMOTE


In [27]:
def ensure_dir(path):
    """Create directory ``path`` (including missing parents) if needed.

    Args:
        path: directory path to create.

    Raises:
        FileExistsError: if ``path`` already exists but is not a directory.
    """
    # A single call with exist_ok=True is idempotent and avoids the
    # check-then-create race of testing os.path.exists() first.
    os.makedirs(path, exist_ok=True)

# Directory that receives every figure and CSV written by this notebook.
# Set the HR_OUTDIR environment variable to redirect the outputs on another
# machine; when it is unset the original location is used.
OUTDIR = os.environ.get(
    "HR_OUTDIR",
    "D:/Microsoft AI Engineer Program/Machine Learning using Python/Assessment/outputs",
)
# One idempotent call is sufficient to guarantee the directory exists.
os.makedirs(OUTDIR, exist_ok=True)

def save_fig(fig, name):
    """Write ``fig`` to ``OUTDIR/name`` and print where it was saved.

    Args:
        fig: object exposing ``savefig`` (a matplotlib figure in this notebook).
        name: file name, joined onto the module-level ``OUTDIR``.
    """
    destination = os.path.join(OUTDIR, name)
    fig.savefig(destination, dpi=120, bbox_inches='tight')
    print(f"Saved: {destination}")

In [32]:
# ---------- 1. Load data & data quality checks ----------
def load_and_qc(path="D:/Microsoft AI Engineer Program/Machine Learning using Python/HR_comma_sep.csv"):
    """Read the HR CSV at ``path`` and print a short data-quality report.

    The report lists the frame's shape, per-column missing-value counts,
    column dtypes and the value counts of the ``left`` target column.

    Args:
        path: location of the CSV file passed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, unmodified.
    """
    frame = pd.read_csv(path)
    print("Loaded dataset shape:", frame.shape)

    # (heading, report builder) pairs; builders are evaluated lazily so each
    # report is computed only when its turn to be printed comes.
    report_sections = (
        ("\n--- Missing values ---", lambda: frame.isnull().sum()),
        ("\n--- Data types ---", lambda: frame.dtypes),
        ("\n--- Value counts for target 'left' ---", lambda: frame['left'].value_counts()),
    )
    for heading, build_report in report_sections:
        print(heading)
        print(build_report())

    return frame

# ---------- 2. Exploratory Data Analysis ----------
def eda_plots(df):
    """Save the exploratory figures for the HR frame via ``save_fig``.

    Figures written:
      * ``corr_matrix.png`` - correlation heatmap of all numeric columns.
      * ``hist_<col>.png`` - 30-bin histogram for ``satisfaction_level``,
        ``last_evaluation`` and ``average_montly_hours``.
      * ``proj_count_left_vs_stay.png`` - grouped bars of stayed/left counts
        per ``number_project`` value.

    Args:
        df: DataFrame containing the columns named above plus ``left``.
    """
    # Correlation heatmap for numeric features
    numeric = df.select_dtypes(include=np.number)
    corr = numeric.corr()
    fig, ax = plt.subplots(figsize=(8,6))
    cax = ax.imshow(corr, interpolation='nearest', aspect='auto')
    ax.set_xticks(range(len(corr.columns)))
    ax.set_yticks(range(len(corr.index)))
    ax.set_xticklabels(corr.columns, rotation=45, ha='right')
    ax.set_yticklabels(corr.index)
    fig.colorbar(cax, ax=ax)
    ax.set_title("Correlation matrix (numeric)")
    save_fig(fig, "corr_matrix.png")
    plt.close(fig)

    # Histograms
    for col in ['satisfaction_level','last_evaluation','average_montly_hours']:
        fig, ax = plt.subplots(figsize=(5,3))
        ax.hist(df[col], bins=30)
        ax.set_title(col)
        save_fig(fig, f"hist_{col}.png")
        plt.close(fig)

    # Bar: number_project vs left
    proj_counts = df.groupby(['number_project','left']).size().unstack(fill_value=0)
    # Guarantee both class columns exist: if the frame holds only stayers or
    # only leavers, unstack() produces a single column and indexing the other
    # one would raise KeyError. The missing class is shown as zero counts.
    proj_counts = proj_counts.reindex(columns=[0, 1], fill_value=0)
    fig, ax = plt.subplots(figsize=(8,4))
    x = np.arange(len(proj_counts.index))
    width = 0.35
    # Offset the two series by half a bar width so they sit side by side.
    ax.bar(x - width/2, proj_counts[0].values, width=width, label='Stayed (0)')
    ax.bar(x + width/2, proj_counts[1].values, width=width, label='Left (1)')
    ax.set_xticks(x)
    ax.set_xticklabels(proj_counts.index)
    ax.set_xlabel("Number of projects")
    ax.set_ylabel("Count")
    ax.legend()
    save_fig(fig, "proj_count_left_vs_stay.png")
    plt.close(fig)


In [18]:
# ---------- 3. Which factors contribute most to turnover ----------
def feature_importance(df, random_state=42):
    """Rank turnover drivers and return the preprocessed train/test data.

    Pipeline: one-hot encode ``sales`` and ``salary`` -> stratified 80/20
    split -> standardize (fitted on the real training rows) -> SMOTE on the
    standardized training rows -> RandomForest permutation importance and
    LogisticRegression coefficients, both fitted on the resampled training
    data. Importances, coefficients and a top-12 bar chart are written to
    ``OUTDIR``.

    Args:
        df: HR DataFrame with the target ``left`` and the categorical columns
            ``sales`` and ``salary``.
        random_state: seed used for the split, SMOTE and both models.

    Returns:
        ``(imp_df, coef_df, preprocess_info)`` where ``preprocess_info`` is
        ``(X_train_res_s, y_train_res, X_test_s, y_test, scaler, columns)``:
        the scaled + oversampled training data, the scaled test data, the
        fitted ``StandardScaler`` and the encoded feature names.
    """
    # One-hot encode categorical columns: 'sales' (department) and 'salary'
    df_enc = pd.get_dummies(df, columns=['sales','salary'], drop_first=True)
    X = df_enc.drop('left', axis=1)
    y = df_enc['left']

    # Train/test split (stratified)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2, stratify=y,
                                                        random_state=random_state)

    # Standardize BEFORE oversampling. SMOTE builds synthetic rows from
    # nearest neighbours, so on raw data the neighbour search is dominated by
    # wide-range columns (e.g. monthly hours) and nearly ignores 0-1 columns.
    # Fitting on the real training rows also keeps synthetic samples out of
    # the scaling statistics. Note that every column is scaled, including the
    # one-hot dummies.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    # Balance the (scaled) training set only; the test set is never resampled.
    sm = SMOTE(random_state=random_state)
    X_train_res_s, y_train_res = sm.fit_resample(X_train_s, y_train)

    # RandomForest for feature importance
    rf = RandomForestClassifier(n_estimators=200, random_state=random_state, n_jobs=-1)
    rf.fit(X_train_res_s, y_train_res)

    # Permutation importance, measured on the resampled training data.
    # NOTE(review): scoring on X_test_s / y_test instead would measure
    # generalization rather than fit - confirm which is intended.
    r = permutation_importance(rf, X_train_res_s, y_train_res, n_repeats=25, random_state=random_state, n_jobs=-1)
    features = X.columns
    imp_df = pd.DataFrame({"feature": features, "importance_mean": r.importances_mean, "importance_std": r.importances_std})
    imp_df = imp_df.sort_values("importance_mean", ascending=False).reset_index(drop=True)
    imp_path = os.path.join(OUTDIR, "feature_importances.csv")
    imp_df.to_csv(imp_path, index=False)
    print(f"Saved feature importances to {imp_path}")

    # Plot top 12 (reversed so the largest bar is drawn at the top)
    top = imp_df.head(12)[::-1]
    fig, ax = plt.subplots(figsize=(7,5))
    ax.barh(top['feature'], top['importance_mean'], xerr=top['importance_std'])
    ax.set_title("Top feature importances (permutation - RandomForest on SMOTE train)")
    save_fig(fig, "feature_importances.png")
    plt.close(fig)

    # Also show logistic regression coefficients for sign direction,
    # ordered by absolute magnitude.
    lr = LogisticRegression(max_iter=2000, random_state=random_state)
    lr.fit(X_train_res_s, y_train_res)
    coefs = pd.Series(lr.coef_[0], index=features).sort_values(key=abs, ascending=False)
    coef_df = pd.DataFrame({"feature": coefs.index, "coef": coefs.values})
    coef_df.to_csv(os.path.join(OUTDIR, "logistic_coefs.csv"), index=False)
    print(f"Saved logistic coefficients to {os.path.join(OUTDIR, 'logistic_coefs.csv')}")

    return imp_df, coef_df, (X_train_res_s, y_train_res, X_test_s, y_test, scaler, X.columns)


In [19]:
# ---------- 4. Clustering of employees who left (satisfaction & evaluation) ----------
def cluster_left(df, n_clusters=3, random_state=42):
    """Cluster the employees who left by satisfaction and last evaluation.

    Rows with ``left == 1`` are standardized and grouped with KMeans. A
    scatter plot of the clusters (with centers mapped back to the original
    units) and a CSV of the centers are written to ``OUTDIR``.

    Args:
        df: DataFrame with ``left``, ``satisfaction_level`` and
            ``last_evaluation`` columns.
        n_clusters: number of KMeans clusters.
        random_state: seed forwarded to KMeans.

    Returns:
        ``(labels, center_df)``, or ``None`` when no row has ``left == 1``.
    """
    leavers = df[df['left'] == 1][['satisfaction_level', 'last_evaluation']].dropna()
    if len(leavers) == 0:
        print("No rows where left==1 to cluster.")
        return None

    standardizer = StandardScaler()
    scaled_points = standardizer.fit_transform(leavers)
    clusterer = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    labels = clusterer.fit_predict(scaled_points)
    # Centers live in standardized space; convert back for plotting/export.
    centers = standardizer.inverse_transform(clusterer.cluster_centers_)

    # Save scatter plot: one series per cluster, then the centers on top.
    fig, ax = plt.subplots(figsize=(6,5))
    for cluster_id in range(n_clusters):
        members = leavers[labels == cluster_id]
        ax.scatter(
            members['satisfaction_level'],
            members['last_evaluation'],
            s=18,
            alpha=0.6,
            label=f'Cluster {cluster_id}',
        )
    ax.scatter(centers[:, 0], centers[:, 1], marker='X', s=120, edgecolor='k', label='Centers')
    ax.set_xlabel('satisfaction_level')
    ax.set_ylabel('last_evaluation')
    ax.legend()
    ax.set_title('KMeans clusters (employees who left)')
    save_fig(fig, "kmeans_left_clusters.png")
    plt.close(fig)

    centers_path = os.path.join(OUTDIR, "kmeans_centers.csv")
    center_df = pd.DataFrame(centers, columns=['satisfaction_level', 'last_evaluation'])
    center_df.to_csv(centers_path, index_label='cluster')
    print(f"Saved cluster centers to {centers_path}")
    return labels, center_df

In [20]:
# ---------- 5. Handle class imbalance using SMOTE ----------
def apply_smote(X_train, y_train, random_state=42):
    """Oversample the training data with SMOTE and print the class counts.

    Args:
        X_train: training features.
        y_train: training labels; passed to ``np.bincount`` after resampling,
            so they must be non-negative integers.
        random_state: seed forwarded to SMOTE.

    Returns:
        ``(X_res, y_res)`` as produced by ``SMOTE.fit_resample``.
    """
    oversampler = SMOTE(random_state=random_state)
    X_res, y_res = oversampler.fit_resample(X_train, y_train)
    class_counts = np.bincount(y_res)
    print("After SMOTE, class distribution:", class_counts)
    return X_res, y_res

In [21]:
# ---------- 6. k-fold cross-validation training and evaluation ----------
def train_and_evaluate_models(X_train_res_s, y_train_res, X_test_s, y_test, feature_names, random_state=42):
    """Cross-validate, fit and test three classifiers; save ROC plots and a summary.

    For LogisticRegression, RandomForest and GradientBoosting this runs 5-fold
    stratified cross-validated prediction on the training data, refits on all
    of it, scores the test set, saves a ROC figure per model and finally
    writes ``model_summary.csv`` to ``OUTDIR``.

    NOTE(review): the training arrays are named as already SMOTE-resampled; if
    so, folds share synthetic neighbours and the CV scores are presumably
    optimistic - confirm against the caller.

    Args:
        X_train_res_s, y_train_res: training features / labels.
        X_test_s, y_test: held-out features / labels.
        feature_names: accepted for interface compatibility; not used here.
        random_state: seed for the CV splitter and every model.

    Returns:
        ``(summary, models)``: a list of per-model metric dicts (each also
        carrying the fitted estimator under ``"model_obj"``) and the dict of
        fitted estimators keyed by model name.
    """
    models = dict(
        LogisticRegression=LogisticRegression(max_iter=2000, random_state=random_state),
        RandomForest=RandomForestClassifier(n_estimators=200, random_state=random_state, n_jobs=-1),
        GradientBoosting=GradientBoostingClassifier(n_estimators=200, random_state=random_state),
    )
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    def _precision_recall_accuracy(y_true, y_pred):
        # Same three metrics are needed for both the CV and the test predictions.
        return (
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            accuracy_score(y_true, y_pred),
        )

    results = []
    for name in models:
        estimator = models[name]

        # Out-of-fold predictions on the (upsampled) training data
        print(f"\n--- CV (train, upsampled) for {name} ---")
        cv_pred = cross_val_predict(estimator, X_train_res_s, y_train_res, cv=folds, n_jobs=-1)
        print(classification_report(y_train_res, cv_pred, digits=4))
        prec_cv, rec_cv, acc_cv = _precision_recall_accuracy(y_train_res, cv_pred)

        # Refit on the full training data, then score the held-out test set
        estimator.fit(X_train_res_s, y_train_res)
        test_pred = estimator.predict(X_test_s)
        if hasattr(estimator, "predict_proba"):
            test_score = estimator.predict_proba(X_test_s)[:, 1]
        else:
            test_score = estimator.decision_function(X_test_s)
        roc_auc = roc_auc_score(y_test, test_score)
        prec_test, rec_test, acc_test = _precision_recall_accuracy(y_test, test_pred)
        print(f"Test set: accuracy={acc_test:.4f} precision={prec_test:.4f} recall={rec_test:.4f} ROC_AUC={roc_auc:.4f}")
        print("Confusion matrix (test):\n", confusion_matrix(y_test, test_pred))

        results.append({
            "model": name,
            "cv_precision": prec_cv,
            "cv_recall": rec_cv,
            "cv_accuracy": acc_cv,
            "test_precision": prec_test,
            "test_recall": rec_test,
            "test_accuracy": acc_test,
            "test_roc_auc": roc_auc,
            "model_obj": estimator,
        })

        # ROC curve figure with the chance diagonal for reference
        fpr, tpr, _ = roc_curve(y_test, test_score)
        fig, ax = plt.subplots(figsize=(6,5))
        ax.plot(fpr, tpr, label=f"{name} (AUC={roc_auc:.3f})")
        ax.plot([0, 1], [0, 1], '--', linewidth=0.7)
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate (Recall)")
        ax.set_title(f"ROC curve - {name}")
        ax.legend()
        save_fig(fig, f"roc_{name}.png")
        plt.close(fig)

    # Persist the metrics table (estimator objects are not written to CSV)
    summary_path = os.path.join(OUTDIR, "model_summary.csv")
    summary_table = pd.DataFrame(results).drop(columns=['model_obj'])
    summary_table.to_csv(summary_path, index=False)
    print(f"\nSaved model summary to {summary_path}")
    return results, models


In [None]:
# ---------- 7. Pick best model and produce risk zones CSV ----------
def predict_and_save_risk_zones(best_model, scaler, X_test_original, X_test_s,
                                output_csv=None):
    """Score the test set, bucket probabilities into risk zones and save a CSV.

    Args:
        best_model: fitted classifier exposing ``predict_proba``.
        scaler: accepted for interface compatibility; not used here.
        X_test_original: unscaled test features (DataFrame); copied into the
            output with a fresh 0..n-1 index.
        X_test_s: scaled test features, row-aligned with ``X_test_original``.
        output_csv: destination file. ``None`` (default) resolves to
            ``OUTDIR/hr_test_probs_with_zones.csv`` at call time, so the
            function follows the current ``OUTDIR`` instead of the value that
            existed when the function was defined.

    Returns:
        DataFrame of the original features plus ``prob_leave`` and
        ``risk_zone`` columns.

    Raises:
        ValueError: if ``best_model`` has no ``predict_proba``.
    """
    # best_model should be fitted already
    if not hasattr(best_model, "predict_proba"):
        raise ValueError("Best model must support predict_proba for probability-based risk zones.")

    if output_csv is None:
        output_csv = os.path.join(OUTDIR, "hr_test_probs_with_zones.csv")

    zone_labels = ["Safe (Green)", "Low-Risk (Yellow)", "Medium-Risk (Orange)", "High-Risk (Red)"]

    # Probability of the positive class (column 1)
    probs = best_model.predict_proba(X_test_s)[:, 1]

    # Right-inclusive bins: <=0.20 safe, <=0.60 low, <=0.90 medium, else high.
    # The outer edges are padded slightly so exact 0.0 and 1.0 are included.
    zones = pd.cut(
        probs,
        bins=[-0.001, 0.20, 0.60, 0.90, 1.001],
        labels=zone_labels
    )

    out_df = X_test_original.copy().reset_index(drop=True)
    out_df["prob_leave"] = probs
    out_df["risk_zone"] = zones

    out_df.to_csv(output_csv, index=False)
    print(f"Saved test set probabilities & zones to {output_csv}")

    # Print counts per risk zone, in severity order
    print(out_df["risk_zone"].value_counts().reindex(zone_labels))

    return out_df



In [24]:
# ---------- 8. Retention strategy suggestions (as function that returns actions) ----------
def retention_strategies_for_zone(zone_label):
    """Return the list of suggested retention actions for a risk zone.

    Args:
        zone_label: one of the four zone names, e.g. ``"High-Risk (Red)"``.

    Returns:
        A list of action strings; ``["No strategy found."]`` when the label
        is not one of the known zones.
    """
    playbook = dict([
        ("Safe (Green)", [
            "Maintain engagement and recognition programs.",
            "Monitor periodically; no immediate action required.",
        ]),
        ("Low-Risk (Yellow)", [
            "Manager 1:1 check-ins to discuss satisfaction and workload.",
            "Mentorship or small incentives; review career path opportunities.",
        ]),
        ("Medium-Risk (Orange)", [
            "Proactive interventions: tailored development plan, compensation review.",
            "Consider role re-assignment or flexible arrangements.",
        ]),
        ("High-Risk (Red)", [
            "Immediate outreach: manager + HR retention interview.",
            "Consider targeted counter-offers, rapid actions to resolve grievances, or job redesign.",
        ]),
    ])
    try:
        return playbook[zone_label]
    except KeyError:
        # Unknown zone: fall back to a single placeholder entry.
        return ["No strategy found."]


In [35]:
# ---------- Main pipeline ----------
def main():
    """Run the whole turnover analysis end to end.

    Loads the data, saves EDA figures, computes feature importances, clusters
    the leavers, trains and evaluates the models, picks the model with the
    highest test recall (ROC AUC breaks ties), writes the risk-zone CSV for
    the test set and prints the retention strategies for each zone.
    """
    # 1-2. Load + QC, then EDA plots
    df = load_and_qc()
    eda_plots(df)

    # 3. Feature importance (also yields the preprocessed data for modeling)
    imp_df, coef_df, preprocess_info = feature_importance(df)
    (X_train_res_s, y_train_res,
     X_test_s, y_test,
     scaler, feature_names) = preprocess_info

    # 4. Clustering employees who left
    cluster_left(df, n_clusters=3)

    # 5. (SMOTE already applied above in feature_importance) If you want to run again:
    # Example: X_res, y_res = apply_smote(X_train, y_train)

    # 6. Train & evaluate models (CV + test)
    summary, models = train_and_evaluate_models(
        X_train_res_s, y_train_res, X_test_s, y_test, feature_names
    )

    # 7. Choose best model: highest test recall, ties resolved by test ROC AUC.
    summary_df = pd.DataFrame(summary)
    best_idx = summary_df['test_recall'].idxmax()
    top_recall = summary_df.loc[best_idx, 'test_recall']
    best_candidates = summary_df[summary_df['test_recall'] == top_recall]
    best_row = (
        best_candidates.loc[best_candidates['test_roc_auc'].idxmax()]
        if len(best_candidates) > 1
        else summary_df.loc[best_idx]
    )
    best_model_name = best_row['model']
    best_model = models[best_model_name]
    print(f"\nBest model chosen (recall-first): {best_model_name}")
    print(best_row[['test_precision','test_recall','test_accuracy','test_roc_auc']])

    # 8. The risk-zone CSV needs the unscaled test rows. Rebuild the encoded
    # frame and repeat the split with the same parameters that
    # feature_importance uses by default (test_size=0.2, stratified,
    # random_state=42) so the same rows land in the test part.
    df_enc = pd.get_dummies(df, columns=['sales','salary'], drop_first=True)
    X_all = df_enc.drop('left', axis=1)
    y_all = df_enc['left']
    X_train, X_test_original, y_train, y_test_original = train_test_split(
        X_all, y_all, test_size=0.2, stratify=y_all, random_state=42
    )
    # Scale with the scaler returned by feature_importance
    X_test_original_s = scaler.transform(X_test_original)

    # Fit the chosen model only if it has not been fitted yet
    already_fitted = hasattr(best_model, "classes_")
    if not already_fitted:
        best_model.fit(X_train_res_s, y_train_res)

    # Save predictions & zones CSV
    zones_csv = os.path.join(OUTDIR, "hr_test_probs_with_zones.csv")
    predict_and_save_risk_zones(best_model, scaler, X_test_original, X_test_original_s, output_csv=zones_csv)

    # 9. Print retention strategies sample
    print("\nRetention strategy examples by zone:")
    zone_order = ("Safe (Green)", "Low-Risk (Yellow)", "Medium-Risk (Orange)", "High-Risk (Red)")
    for zone in zone_order:
        print("\n", zone)
        for action in retention_strategies_for_zone(zone):
            print(" -", action)

    print("\nAll outputs saved to:", OUTDIR)
    print("Files: feature_importances.png, logistic_coefs.csv, model_summary.csv, hr_test_probs_with_zones.csv, kmeans_centers.csv, etc.")

# Entry point: run the full pipeline when this code is executed directly
# (the guard is false when the code is imported as a module).
if __name__ == "__main__":
    main()

Loaded dataset shape: (14999, 10)

--- Missing values ---
satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
sales                    0
salary                   0
dtype: int64

--- Data types ---
satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
left                       int64
promotion_last_5years      int64
sales                     object
salary                    object
dtype: object

--- Value counts for target 'left' ---
left
0    11428
1     3571
Name: count, dtype: int64
Saved: D:/Microsoft AI Engineer Program/Machine Learning using Python/Assessment/outputs\corr_matrix.png
Saved: D:/Microsoft AI Engineer Program/Machine Learning using Python/Assessment/outputs\hist_sat