In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from random import randint
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Evaluate the saved models on the production hold-out data

from sklearn.metrics import classification_report
from preprocessing import preprocess_data

def production(X_path, y_path, seed=42):
    """Evaluate the saved semi-supervised models on a labeled hold-out set.

    Loads the three persisted classifiers plus the fitted scaler and imputer,
    preprocesses the features with ``preprocess_data(..., fit=False)``, and
    prints a classification report and ROC AUC for each model. A coin-flip
    baseline is reported last for comparison.

    Parameters
    ----------
    X_path : str
        Path or URL of the CSV holding the features.
    y_path : str
        Path or URL of the CSV holding the labels; its ``'Left'`` column is
        used as the ground truth.
    seed : int or None, default 42
        Seed for the random-guesser baseline so its report is reproducible
        across runs. Pass ``None`` for a different draw on every call.

    Returns
    -------
    None
        All results are printed to stdout.

    Raises
    ------
    ValueError
        If the preprocessed features and the labels differ in row count.
    """
    # Local import: the file only imports `randint`, and a private generator
    # keeps the baseline from touching the global `random` state.
    from random import Random

    class TestModel:
        """Baseline that guesses 0 or 1 uniformly at random for every row."""

        def __init__(self, seed=None):
            self._rng = Random(seed)

        def predict(self, X):
            return [self._rng.randint(0, 1) for _ in range(len(X))]

    # Load models and preprocessing artifacts.
    # NOTE(review): joblib.load unpickles arbitrary objects; only load
    # artifacts produced by a trusted training run.
    models = {
        "XGBoost Model": joblib.load('semisupervised_xgb_model.joblib'),
        "Logistic Regression Model": joblib.load('semisupervised_lr_model.joblib'),
        "Random Forest Model": joblib.load('semisupervised_rf_model.joblib'),
    }
    scaler = joblib.load('scaler.joblib')
    imputer = joblib.load('imputer.joblib')

    # Load and preprocess data
    df_X = pd.read_csv(X_path)
    df_y = pd.read_csv(y_path)['Left']

    # fit=False: reuse the already-fitted imputer/scaler instead of refitting
    # them on the evaluation data.
    X_scaled, _, _ = preprocess_data(df_X, imputer=imputer, scaler=scaler, fit=False)

    # Fail fast with a clear message instead of erroring inside sklearn after
    # every model has already produced predictions.
    if len(X_scaled) != len(df_y):
        raise ValueError(
            f"Feature/label row count mismatch: {len(X_scaled)} preprocessed "
            f"rows vs {len(df_y)} labels"
        )

    # Evaluation: one pass per model instead of three copy-pasted sections.
    for position, (title, model) in enumerate(models.items()):
        predictions = model.predict(X_scaled)
        # Probability of the positive class (column 1) drives ROC AUC.
        positive_proba = model.predict_proba(X_scaled)[:, 1]

        # The first banner has no leading newline; later ones are separated
        # from the previous report by a blank line.
        prefix = "" if position == 0 else "\n"
        print(f"{prefix}=== {title} ===")
        print(classification_report(df_y, predictions))
        print("ROC AUC:", roc_auc_score(df_y, positive_proba))

    # Chance-level reference: any useful model should clearly beat this.
    pred_random = TestModel(seed).predict(X_scaled)
    print("\n=== Random Guesser ===")
    print(classification_report(df_y, pred_random))

# Score the saved models against the hosted production feature/label CSVs.
production(
    X_path='https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_X_prod.csv',
    y_path='https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_y_prod.csv',
)

=== XGBoost Model ===
              precision    recall  f1-score   support

           0       0.66      0.56      0.61     64044
           1       0.38      0.47      0.42     35956

    accuracy                           0.53    100000
   macro avg       0.52      0.52      0.51    100000
weighted avg       0.56      0.53      0.54    100000

ROC AUC: 0.526856087757597

=== Logistic Regression Model ===
              precision    recall  f1-score   support

           0       0.70      0.56      0.62     64044
           1       0.42      0.56      0.48     35956

    accuracy                           0.56    100000
   macro avg       0.56      0.56      0.55    100000
weighted avg       0.60      0.56      0.57    100000

ROC AUC: 0.5865356807690041

=== Random Forest Model ===
              precision    recall  f1-score   support

           0       0.66      0.52      0.58     64044
           1       0.38      0.52      0.44     35956

    accuracy                           0.

### Metrics Selection for Predicting Employee Departure
Employee departure prediction often involves **imbalanced data**, where the majority of employees stay. Therefore, standard accuracy is not sufficient. The following metrics are recommended:

1. **ROC AUC**
Evaluates the model’s ability to distinguish between “stay” and “leave” across all thresholds.
2. **Precision (for 'Left')**
Of all employees predicted to leave, how many actually left? Helps avoid false positives.
3. **Recall (for 'Left')**
Of all employees who actually left, how many did we correctly identify? Helps avoid false negatives.
4. **F1-Score**
The harmonic mean of precision and recall; a single number that balances the two when both false positives and false negatives are costly.

### Conclusion: why it didn't work, and next steps
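The models show some ability to identify potential leavers, but their performance is barely better than chance: on the production set the ROC AUC is roughly 0.53 for XGBoost and 0.59 for logistic regression, against 0.50 for random guessing.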
### Likely cause: **limited labeled data and weak label propagation**
Only 500 truly labeled samples were available from the subject-matter expert (SME), which is extremely limited compared to the total dataset of 500,000 records. Although semi-supervised techniques such as label propagation were used to expand the labeled set, these methods are highly sensitive to the quality and distribution of the initial labels. **Poor propagation** can lead to **incorrect pseudo-labels**, introducing significant noise into training. Given the massive imbalance between labeled and unlabeled data, and the potential inaccuracies introduced during propagation, the models' ability to generalize suffers considerably. This limitation is likely a key reason why even the optimized models performed only marginally better than random guessing.
### Next Steps
1. Improve feature engineering further:

   Add domain-informed features (e.g., trends in reviews, salary changes, project volatility).

2. Improve label quality and coverage:

   Label more representative samples, or obtain SME-labeled data beyond the 500 seeds.

