In [1]:
"""
04 Modeling
Train and evaluate models for employee attrition prediction.
"""

'\n04 Modeling\nTrain and evaluate models for employee attrition prediction.\n'

In [2]:
import os
import shutil
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from IPython.display import display, Markdown
from pycaret.classification import get_config
from pycaret.classification import predict_model, plot_model, pull
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, precision_recall_curve

from src.modeling import setup_modeling, train_and_tune_model, evaluate_trained_model, plot_feature_importance, save_trained_model

In [3]:
# Notebook title and one-sentence purpose, rendered as Markdown.
display(
    Markdown(
        "\n"
        "# Modeling\n"
        "This notebook trains and evaluates machine learning models to predict employee attrition.\n"
    )
)


# Modeling
This notebook trains and evaluates machine learning models to predict employee attrition.


In [4]:
# Section header: loading the engineered feature table.
display(
    Markdown(
        "\n"
        "## Load Engineered Features\n"
        "We load the dataset with engineered features for modeling.\n"
    )
)


## Load Engineered Features
We load the dataset with engineered features for modeling.


In [5]:
# Relative paths used later in the notebook ('data/...', 'models/...', 'results/...')
# are resolved against the project root, so make that the working directory.
# The root is discovered by walking upwards from the current directory until the
# engineered-features file is found, instead of hard-coding one user's absolute path.
PROJECT_ROOT_MARKER = Path("data") / "employee_data_features.csv"


def find_project_root(start=None, marker=PROJECT_ROOT_MARKER):
    """Return the closest directory at or above ``start`` that contains ``marker``.

    Parameters
    ----------
    start : str or os.PathLike, optional
        Directory to begin the search from. Defaults to the current working directory.
    marker : str or os.PathLike, optional
        Path, relative to a candidate root, whose existence identifies the project root.

    Returns
    -------
    pathlib.Path or None
        The resolved root directory, or ``None`` when neither ``start`` nor any of its
        parents contains ``marker``.
    """
    origin = (Path.cwd() if start is None else Path(start)).resolve()
    for candidate in (origin, *origin.parents):
        if (candidate / marker).exists():
            return candidate
    return None


# Print current working directory
print("Current working directory:", os.getcwd())

# If not in the project root, change to it
project_root = find_project_root()
if project_root is None:
    # Same print-a-warning convention as the rest of this notebook; the load cell
    # will then fail with a precise FileNotFoundError for the missing CSV.
    print(
        f"WARNING: could not find {PROJECT_ROOT_MARKER} in the current directory "
        "or any parent; working directory left unchanged."
    )
elif Path.cwd().resolve() != project_root:
    # Path comparison (rather than raw string comparison) also treats 'c:\\...' and
    # 'C:\\...' as the same location on Windows.
    os.chdir(project_root)
    print("Changed working directory to:", os.getcwd())

Current working directory: c:\Users\USER\Documents\Projects\JJM-attrition-rate\notebooks
Changed working directory to: C:\Users\USER\Documents\Projects\JJM-attrition-rate


In [6]:
# Read the engineered-feature table (path is relative to the project root)
# and preview its first five rows as the cell output.
features_df = pd.read_csv(filepath_or_buffer='data/employee_data_features.csv')
features_df.head(n=5)

Unnamed: 0,EmployeeId,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,...,YearsSinceLastPromotion,YearsWithCurrManager,AgeGroup,TenureRatio,OverallSatisfaction,SalaryToAgeRatio,SalaryToTenureRatio,PromotionRate,RoleStability,TravelImpact
0,2,37,1.0,Travel_Rarely,1141,Research & Development,11,2,Medical,1,...,0,0,36-45,0.066667,1.25,129.108108,4777.0,1.0,0.0,1
1,3,51,1.0,Travel_Rarely,1323,Research & Development,4,4,Life Sciences,1,...,2,7,46-55,0.555556,2.75,48.254902,246.1,5.0,0.0,1
2,4,42,0.0,Travel_Frequently,555,Sales,26,3,Marketing,1,...,4,8,36-45,0.869565,3.25,322.02381,676.25,5.0,0.2,2
3,7,40,0.0,Travel_Rarely,1124,Sales,1,2,Medical,1,...,0,2,36-45,0.666667,2.75,186.425,1864.25,4.0,0.75,1
4,8,55,1.0,Travel_Rarely,725,Research & Development,2,3,Medical,1,...,1,4,46-55,0.208333,3.0,361.072727,3971.8,5.0,0.4,1


In [7]:
# Section header: categorical preprocessing.
display(
    Markdown(
        "\n"
        "## Preprocess Categorical Columns\n"
        "We preprocess categorical variables to ensure they are in a suitable format for modeling.\n"
    )
)


## Preprocess Categorical Columns
We preprocess categorical variables to ensure they are in a suitable format for modeling.


In [8]:
# Normalise the text of every categorical column: cast to str, turn spaces into
# underscores, then spell out '&' as '_and_'. Columns are overwritten in place on
# features_df.
categorical_cols = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'Over18',
    'OverTime',
    'AgeGroup',
]
for column_name in categorical_cols:
    as_text = features_df[column_name].astype(str)
    without_spaces = as_text.str.replace(' ', '_')
    features_df[column_name] = without_spaces.str.replace('&', '_and_')

In [9]:
# Section header: modeling environment setup.
display(
    Markdown(
        "\n"
        "## Setup Modeling Environment\n"
        "We initialize the modeling environment, including data splitting and preprocessing.\n"
    )
)


## Setup Modeling Environment
We initialize the modeling environment, including data splitting and preprocessing.


In [10]:
# Setup modeling environment
# setup_modeling is imported from src.modeling; only the feature frame is passed here,
# so every other setting is decided inside that helper.
# The call is the last expression of the cell, so its return value is echoed as the
# cell output (a PyCaret ClassificationExperiment in the recorded run).
setup_modeling(features_df)

[LightGBM] [Info] Number of positive: 615, number of negative: 615
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001689 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8260
[LightGBM] [Info] Number of data points in the train set: 1230, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


<pycaret.classification.oop.ClassificationExperiment at 0x20711460690>

In [11]:
# Section header: training and tuning.
display(
    Markdown(
        "\n"
        "## Train and Tune Model\n"
        "We train and tune a machine learning model to optimize for recall (catching as many attrition cases as possible).\n"
    )
)


## Train and Tune Model
We train and tune a machine learning model to optimize for recall (catching as many attrition cases as possible).


In [12]:
# Train the 'lda' estimator and tune it with Recall as the optimisation metric.
# n_iter=50 is forwarded to the src.modeling helper as the tuning budget.
model = train_and_tune_model(
    model_name='lda',
    optimize='Recall',
    n_iter=50,
)

In [13]:
# Section header: model evaluation.
display(
    Markdown(
        "\n"
        "## Evaluate Model\n"
        "We evaluate the trained model's performance using relevant metrics.\n"
    )
)


## Evaluate Model
We evaluate the trained model's performance using relevant metrics.


In [14]:
# Evaluate model
evaluate_trained_model(model)

# Save model
model_path = save_trained_model(model, 'models/final_lda_model')
# PyCaret appends .pkl if not present
if not os.path.exists('models/final_lda_model.pkl'):
    print("WARNING: Model file models/final_lda_model.pkl not found after saving.")
else:
    print("Model saved as models/final_lda_model.pkl")


def first_present_column(frame, candidates):
    """Return the first name in ``candidates`` that is a column of ``frame``, else None."""
    return next((name for name in candidates if name in frame.columns), None)


def positive_class_scores(pred_labels, pred_scores, positive_label):
    """Convert "score of the predicted class" into "score of the positive class".

    PyCaret's score column holds the probability of whichever label was predicted,
    so for rows predicted as the negative class the positive-class probability is
    ``1 - score``. ROC and precision-recall curves need the positive-class score.
    Only valid for binary problems.
    """
    pred_scores = np.asarray(pred_scores, dtype=float)
    predicted_positive = np.asarray(pred_labels) == positive_label
    return np.where(predicted_positive, pred_scores, 1.0 - pred_scores)


# Export confusion matrix and classification report
# NOTE(review): predictions are made on the full features_df, i.e. the same frame that
# was handed to setup_modeling, so these figures include rows the model was trained on.
try:
    if hasattr(model, 'predict'):
        preds_df = predict_model(model, data=features_df)
        # PyCaret 3.x names the outputs prediction_label / prediction_score,
        # PyCaret 2.x used Label / Score; accept either.
        label_col = first_present_column(preds_df, ('prediction_label', 'Label'))
        score_col = first_present_column(preds_df, ('prediction_score', 'Score'))
        if 'Attrition' in features_df.columns and label_col is not None:
            # Score only the rows whose true label is known; NaN targets would make
            # the sklearn metric functions raise.
            has_label = features_df['Attrition'].notna().to_numpy()
            y_true = features_df['Attrition'].to_numpy()[has_label]
            y_pred = preds_df[label_col].to_numpy()[has_label]
            classes = np.unique(y_true)

            cm = confusion_matrix(y_true, y_pred, labels=classes)
            cr = classification_report(y_true, y_pred)
            os.makedirs('results', exist_ok=True)
            with open('results/confusion_matrix.md', 'w', encoding='utf-8') as f:
                f.write('# Confusion Matrix\n')
                f.write(str(cm))
            with open('results/classification_report.md', 'w', encoding='utf-8') as f:
                f.write('# Classification Report\n')
                f.write(cr)
            # ROC and PR curve data (if binary)
            if len(classes) == 2 and score_col is not None:
                positive_label = classes[1]
                y_score = positive_class_scores(
                    y_pred, preds_df[score_col].to_numpy()[has_label], positive_label
                )
                fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=positive_label)
                precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=positive_label)
                np.savetxt('results/roc_curve.csv', np.column_stack([fpr, tpr]), delimiter=',', header='fpr,tpr', comments='')
                np.savetxt('results/pr_curve.csv', np.column_stack([precision, recall]), delimiter=',', header='precision,recall', comments='')
            # Save predictions for consistency
            preds_df.to_csv('results/predictions.csv', index=False)
        else:
            print("WARNING: y_true or y_pred is None. Confusion matrix not exported.")
    else:
        print("WARNING: Model does not have a 'predict' attribute. Confusion matrix not exported.")
except Exception as e:
    # Deliberate best-effort: a failed export is reported but does not stop the notebook.
    print(f"ERROR exporting confusion matrix: {e}")

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Transformation Pipeline and Model Successfully Saved
Model saved as models/final_lda_model.pkl


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.6957,0.7725,0.7151,0.3208,0.4429,0.2731,0.3147




In [15]:
# Section header: feature importance.
display(
    Markdown(
        "\n"
        "## Feature Importance\n"
        "We analyze which features are most influential in predicting attrition.\n"
    )
)


## Feature Importance
We analyze which features are most influential in predicting attrition.


In [16]:
# --- SHAP summary plot and CSV export ---
# The plot and CSV below are written into results/. That folder is otherwise only
# created inside the conditional export branch of the evaluation cell, so create it
# here as well to keep this cell runnable on its own.
os.makedirs('results', exist_ok=True)

# Get the transformed training data used by PyCaret
X_train_transformed = get_config('X_train_transformed')

# Create a SHAP explainer for the model, using the training data as background
explainer = shap.Explainer(model, X_train_transformed)
shap_values = explainer(X_train_transformed)

# Work on a plain 2-D (samples x features) array. Some explainers return an extra
# trailing per-class axis; in that case keep the last class (the positive class for a
# binary target). A 2-D result, as produced in the recorded run, is used unchanged.
shap_matrix = np.asarray(shap_values.values)
if shap_matrix.ndim == 3:
    shap_matrix = shap_matrix[:, :, -1]

# Generate and save SHAP summary plot (show=False keeps the figure open for savefig)
shap.summary_plot(shap_matrix, X_train_transformed, show=False)
plt.tight_layout()
plt.savefig('results/shap_summary_plot.png', bbox_inches='tight')
plt.close()

# SHAP feature importance (mean absolute SHAP value per feature), largest first
shap_importance = pd.DataFrame({
    'Feature': X_train_transformed.columns,
    'MeanAbsSHAP': np.abs(shap_matrix).mean(axis=0)
}).sort_values(by='MeanAbsSHAP', ascending=False)
shap_importance.to_csv('results/shap_feature_importance.csv', index=False)

In [17]:
# Section header: persisting the model.
display(
    Markdown(
        "\n"
        "## Save Model\n"
        "The final trained model is saved for future inference and deployment.\n"
    )
)


## Save Model
The final trained model is saved for future inference and deployment.


In [18]:
# Save model
# NOTE(review): the evaluation cell earlier in this notebook already calls
# save_trained_model with this same 'models/final_lda_model' path, so this call
# repeats that save — confirm whether both are needed.
save_trained_model(model, 'models/final_lda_model')

Transformation Pipeline and Model Successfully Saved


In [19]:
# Narrative summary of the results.
# NOTE(review): every figure in this text is typed in by hand and will not update when
# the notebook is re-run. The recorded predict_model table in the evaluation cell shows
# Accuracy 0.6957 / AUC 0.7725 / Recall 0.7151 / Prec. 0.3208, which differs from the
# values quoted below — confirm which evaluation these numbers come from, and re-check
# the SHAP table against results/shap_feature_importance.csv.
# Fix: the "Modeling Recommendations" list previously began at "2." with no item 1.
display(Markdown("""
## Model Performance Insights

### Feature Importance & Model Performance
- **Model Used:** Linear Discriminant Analysis (LDA)
- **Performance:** Accuracy: 70%, AUC: 0.78, Recall: 0.75 (good for catching attrition cases), Precision: 0.34 (many false positives)
- **Key Features (by SHAP importance):**
    1. **OverTime** — Most influential; employees who work overtime are much more likely to leave.
    2. **EnvironmentSatisfaction** — Lower satisfaction increases attrition risk.
    3. **Age** — Younger employees tend to have higher attrition risk.
    4. **MonthlyIncome** — Lower income is associated with higher attrition.
    5. **DailyRate, DistanceFromHome, RoleStability, MonthlyRate** — These also contribute, but to a lesser extent.

#### SHAP Feature Importance Analysis
The table below (from `shap_feature_importance.csv`) shows the mean absolute SHAP value for each feature, which quantifies the average impact of each feature on the model's prediction for employee attrition. A higher value means the feature has a greater influence on the model's output.

| Rank | Feature                 | MeanAbsSHAP |
|------|-------------------------|-------------|
| 1    | OverTime                | 0.75        |
| 2    | EnvironmentSatisfaction | 0.56        |
| 3    | Age                     | 0.37        |
| 4    | MonthlyIncome           | 0.26        |
| 5    | DailyRate               | 0.20        |
| 6    | DistanceFromHome        | 0.19        |
| 7    | RoleStability           | 0.16        |
| 8    | MonthlyRate             | 0.15        |

**Interpretation:**
- **OverTime** is by far the most important feature, with a mean absolute SHAP value of 0.75. This means that whether or not an employee works overtime has the largest average effect on the model's prediction of attrition.
- **EnvironmentSatisfaction** is the second most important, indicating that employees' satisfaction with their work environment is a key driver of attrition risk.
- **Age** is also significant, suggesting that attrition risk varies notably with employee age (often, younger employees are more likely to leave).
- **MonthlyIncome** and **DailyRate** both have moderate influence, showing that compensation factors play a role, but are less critical than overtime or satisfaction.
- **DistanceFromHome**, **RoleStability**, and **MonthlyRate** have smaller but still meaningful impacts.

**Actionable Insights:**
- **Monitor and manage overtime:** Since overtime is the top driver, reducing excessive overtime or compensating for it may help reduce attrition.
- **Improve environment satisfaction:** Initiatives to boost workplace satisfaction could have a strong effect on retention.
- **Targeted retention for younger employees:** Since age is a key factor, consider tailored retention programs for younger staff.
- **Review compensation and stability:** While not the top factors, fair pay and stable roles still contribute to retention and should not be neglected.

## Modeling Recommendations
1. **Modeling Improvements:**
   - While recall is high, precision is low. Consider:
     - Collecting more data to balance class distribution
     - Engineering interaction features between key variables
     - Experimenting with alternative models (XGBoost, Random Forest)
     - Adjusting classification decision thresholds
     - Implementing feature selection to reduce noise
"""))


## Model Performance Insights

### Feature Importance & Model Performance
- **Model Used:** Linear Discriminant Analysis (LDA)
- **Performance:** Accuracy: 70%, AUC: 0.78, Recall: 0.75 (good for catching attrition cases), Precision: 0.34 (many false positives)
- **Key Features (by SHAP importance):**
    1. **OverTime** — Most influential; employees who work overtime are much more likely to leave.
    2. **EnvironmentSatisfaction** — Lower satisfaction increases attrition risk.
    3. **Age** — Younger employees tend to have higher attrition risk.
    4. **MonthlyIncome** — Lower income is associated with higher attrition.
    5. **DailyRate, DistanceFromHome, RoleStability, MonthlyRate** — These also contribute, but to a lesser extent.

#### SHAP Feature Importance Analysis
The table below (from `shap_feature_importance.csv`) shows the mean absolute SHAP value for each feature, which quantifies the average impact of each feature on the model's prediction for employee attrition. A higher value means the feature has a greater influence on the model's output.

| Rank | Feature                 | MeanAbsSHAP |
|------|-------------------------|-------------|
| 1    | OverTime                | 0.75        |
| 2    | EnvironmentSatisfaction | 0.56        |
| 3    | Age                     | 0.37        |
| 4    | MonthlyIncome           | 0.26        |
| 5    | DailyRate               | 0.20        |
| 6    | DistanceFromHome        | 0.19        |
| 7    | RoleStability           | 0.16        |
| 8    | MonthlyRate             | 0.15        |

**Interpretation:**
- **OverTime** is by far the most important feature, with a mean absolute SHAP value of 0.75. This means that whether or not an employee works overtime has the largest average effect on the model's prediction of attrition.
- **EnvironmentSatisfaction** is the second most important, indicating that employees' satisfaction with their work environment is a key driver of attrition risk.
- **Age** is also significant, suggesting that attrition risk varies notably with employee age (often, younger employees are more likely to leave).
- **MonthlyIncome** and **DailyRate** both have moderate influence, showing that compensation factors play a role, but are less critical than overtime or satisfaction.
- **DistanceFromHome**, **RoleStability**, and **MonthlyRate** have smaller but still meaningful impacts.

**Actionable Insights:**
- **Monitor and manage overtime:** Since overtime is the top driver, reducing excessive overtime or compensating for it may help reduce attrition.
- **Improve environment satisfaction:** Initiatives to boost workplace satisfaction could have a strong effect on retention.
- **Targeted retention for younger employees:** Since age is a key factor, consider tailored retention programs for younger staff.
- **Review compensation and stability:** While not the top factors, fair pay and stable roles still contribute to retention and should not be neglected.

## Modeling Recommendations
2. **Modeling Improvements:**
   - While recall is high, precision is low. Consider:
     - Collecting more data to balance class distribution
     - Engineering interaction features between key variables
     - Experimenting with alternative models (XGBoost, Random Forest)
     - Adjusting classification decision thresholds
     - Implementing feature selection to reduce noise
