# Uploading a dataset


```
# This is formatted as code
```



In [None]:
import pandas as pd

# Location of the raw CSV. Kept in one named constant so the notebook can be
# pointed at another file without hunting for the literal.
# NOTE(review): "/content/..." looks like a Google Colab upload path - adjust
# when running elsewhere.
DATA_PATH = "/content/test 2.csv"

# Load the dataset and preview the first rows to confirm it parsed correctly.
df = pd.read_csv(DATA_PATH)
df.head()


## Basic Data Understanding


In [None]:
from IPython.display import display

# A notebook cell only renders the value of its LAST expression, so the
# summary statistics have to be displayed explicitly; otherwise the result of
# df.describe() is computed and silently discarded.
df.info()                # prints column dtypes and non-null counts
display(df.describe())   # summary statistics of the numeric columns
df.isnull().sum()        # missing values per column (rendered as the cell output)


## Data Cleaning

In [None]:
# Impute missing values in place on `df`.
# NOTE(review): the medians/modes are computed on the full dataset, before the
# train/test split performed in a later cell, so test rows influence the
# imputation values. Consider fitting the imputation on the training split only.

# Numerical -> median of each column
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical -> most frequent value (mode) of each column
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    modes = df[col].mode()
    # An all-NaN column has no mode; leave it untouched instead of failing
    # on modes[0].
    if modes.empty:
        continue
    # Assign the result back rather than calling fillna(inplace=True) on
    # df[col]: the chained in-place form acts on a temporary object, which
    # triggers a FutureWarning in pandas 2.x and has no effect under
    # copy-on-write.
    df[col] = df[col].fillna(modes.iloc[0])


## Encoding Categorical Features

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical (object) column in place.
# One fitted encoder is kept per column in `label_encoders`, so the integer
# codes can later be mapped back to the original labels with
# label_encoders[col].inverse_transform(...). Re-fitting a single shared
# encoder would retain only the mapping of the last column.
label_encoders = {}
le = LabelEncoder()  # stays bound even when there are no categorical columns
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


## Feature–Target Split

In [None]:
# Target vector: the "Attrition" column; feature matrix: every other column.
y = df["Attrition"]
X = df.drop(columns=["Attrition"])


## Train–Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Row counts of the full data and of each partition.
for label, frame in (
    ("Total samples:", X),
    ("Training samples:", X_train),
    ("Testing samples:", X_test),
):
    print(label, frame.shape[0])

# Shapes of the matching target vectors.
for label, target in (
    ("\nTraining target shape:", y_train),
    ("Testing target shape:", y_test),
):
    print(label, target.shape)


In [None]:
# Relative class frequencies in each partition; they should be close to each
# other because the split was stratified on the target.
for heading, target in (
    ("Train attrition ratio:", y_train),
    ("\nTest attrition ratio:", y_test),
):
    print(heading)
    print(target.value_counts(normalize=True))


## Feature Selection (Using Random Forest Importance)

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Fit a random forest on the training split only and rank the features by the
# impurity-based importances it exposes.
rf_fs = RandomForestClassifier(random_state=42, n_estimators=100)
rf_fs.fit(X_train, y_train)

# One row per feature, most important first.
feature_importance = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": rf_fs.feature_importances_}
)
feature_importance = feature_importance.sort_values(by="Importance", ascending=False)

feature_importance


Select Important Features (Above Mean Importance)

In [None]:
# Keep only the features whose importance is strictly above the mean
# importance; the order (most important first) is preserved.
threshold = feature_importance["Importance"].mean()
above_mean = feature_importance["Importance"] > threshold
selected_features = feature_importance.loc[above_mean, "Feature"].tolist()

print("Important Features Used for Training:\n")
for f in selected_features:
    print(f)

# Before/after feature counts.
print("\nTotal original features:", X_train.shape[1])
print("Selected important features:", len(selected_features))


Reduce Dataset to Important Features

In [None]:
# Restrict both partitions to the selected columns (same columns, same order).
X_train_sel, X_test_sel = (
    X_train[selected_features],
    X_test[selected_features],
)


# Train Models Using Selected Features

**Decision Tree**

In [None]:
# ===============================
# Decision Tree Training & Evaluation
# ===============================

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    roc_auc_score,
    roc_curve
)
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Train the decision tree on the selected features
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_sel, y_train)

# 2. Predictions: hard labels, plus the second column of predict_proba
#    (the probability of the positive class, assuming labels are encoded 0/1)
# NOTE(review): the tree is grown without depth/leaf limits, so these
# probabilities are likely to be almost all 0 or 1 - confirm before relying
# on the ROC curve or the 0.7 risk threshold below.
y_pred_dt = dt.predict(X_test_sel)
y_prob_dt = dt.predict_proba(X_test_sel)[:, 1]

# 3. Accuracy
accuracy = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", accuracy)

# 4. Confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred_dt)
print("\nConfusion Matrix:\n", cm)

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Decision Tree Confusion Matrix")
plt.show()

# 5. Precision, recall and F1 score for the positive class
precision = precision_score(y_test, y_pred_dt)
recall = recall_score(y_test, y_pred_dt)
f1 = f1_score(y_test, y_pred_dt)

print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# 6. Per-class classification report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_dt))

# 7. ROC-AUC score (computed from the probabilities, not the hard labels)
auc = roc_auc_score(y_test, y_prob_dt)
print("ROC-AUC Score:", auc)

# 8. ROC curve; the dashed diagonal is the chance-level reference
fpr, tpr, _ = roc_curve(y_test, y_prob_dt)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"Decision Tree (AUC = {auc:.2f})")
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# Title previously contained a mis-encoded dash ("‚Äì"); restored to an en dash.
plt.title("ROC Curve – Decision Tree")
plt.legend()
plt.show()

# ===============================
# High-Risk Employee Detection
# ===============================

# Attach the predicted probability to each test row and flag rows whose
# probability is at least 0.7 as "High Risk".
risk_df = X_test_sel.copy()
risk_df["Attrition_Probability"] = y_prob_dt
risk_df["Risk_Level"] = risk_df["Attrition_Probability"].apply(
    lambda x: "High Risk" if x >= 0.7 else "Low Risk"
)

print("\nHigh-Risk Employees (Top 5):")
print(risk_df.sort_values("Attrition_Probability", ascending=False).head())

**SHAP ANALYSIS (Decision Tree)**

In [None]:
# ===============================
# SHAP Analysis for Decision Tree (FINAL & ERROR-FREE)
# Compatible with SHAP v0.20+
# ===============================

import shap
import numpy as np

# `dt` is the decision tree fitted in the training cell above
model = dt

# Create the SHAP explainer for the tree model
explainer = shap.TreeExplainer(model)

# Compute SHAP values for the test rows
shap_values = explainer.shap_values(X_test_sel)

# ===============================
# Handle SHAP output safely
# ===============================
# Depending on the SHAP release, shap_values() yields either a list with one
# (samples, features) array per class, or a single array. Reduce every case
# to the (samples, features) matrix of class 1 (attrition).
if isinstance(shap_values, list):
    shap_vals = shap_values[1]
elif np.ndim(shap_values) == 3:
    # (samples, features, classes)
    shap_vals = shap_values[:, :, 1]
else:
    # Already a 2-D (samples, features) matrix
    shap_vals = shap_values

# expected_value holds one baseline per class (or a single scalar);
# pick the class-1 baseline when there is more than one.
expected = np.ravel(explainer.expected_value)
base_value = expected[1] if expected.size > 1 else expected[0]

# ===============================
# 1. Global Feature Importance
# ===============================

shap.summary_plot(shap_vals, X_test_sel)

# ===============================
# 2. Feature Importance (Bar Plot)
# ===============================

shap.summary_plot(shap_vals, X_test_sel, plot_type="bar")

# ===============================
# 3. Individual Employee Explanation
# ===============================

employee_index = 4  # positional row in X_test_sel; change if required

# The interactive force plot needs SHAP's JavaScript loaded in the notebook;
# without initjs() the output reads "Visualization omitted".
shap.initjs()

# Must remain the last expression of the cell so the plot is rendered.
shap.plots.force(
    base_value,
    shap_vals[employee_index],
    X_test_sel.iloc[employee_index]
)

**Random forest**

In [None]:
# ===============================
# Random Forest Training, Risk Detection & Evaluation
# ===============================

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    roc_auc_score,
    roc_curve
)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# 1. Train the random forest on the selected features
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)
rf.fit(X_train_sel, y_train)

# 2. Predictions: hard labels, plus the second column of predict_proba
#    (the probability of the positive class, assuming labels are encoded 0/1)
y_pred_rf = rf.predict(X_test_sel)
y_prob_rf = rf.predict_proba(X_test_sel)[:, 1]

# 3. Accuracy
accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy)

# 4. Confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred_rf)
print("\nConfusion Matrix:\n", cm)

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Random Forest Confusion Matrix")
plt.show()

# 5. Precision, recall and F1 score for the positive class
precision = precision_score(y_test, y_pred_rf)
recall = recall_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf)

print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# 6. Per-class classification report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_rf))

# 7. ROC-AUC score (computed from the probabilities, not the hard labels)
auc = roc_auc_score(y_test, y_prob_rf)
print("ROC-AUC Score:", auc)

# 8. ROC curve; the dashed diagonal is the chance-level reference
fpr, tpr, _ = roc_curve(y_test, y_prob_rf)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"Random Forest (AUC = {auc:.2f})")
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# Title previously contained a mis-encoded dash ("‚Äì"); restored to an en dash.
plt.title("ROC Curve – Random Forest")
plt.legend()
plt.show()

# ===============================
# High-Risk Employee Detection
# ===============================

# Attach the predicted probability to each test row and flag rows whose
# probability is at least 0.7 as "High Risk".
# NOTE: this rebinds `risk_df`, replacing the table built in the decision
# tree cell.
risk_df = X_test_sel.copy()
risk_df["Attrition_Probability"] = y_prob_rf
risk_df["Risk_Level"] = risk_df["Attrition_Probability"].apply(
    lambda x: "High Risk" if x >= 0.7 else "Low Risk"
)

print("\nHigh-Risk Employees (Top 5):")
# Last expression of the cell: rendered as a rich table.
risk_df.sort_values("Attrition_Probability", ascending=False).head()


In [None]:
# ===============================
# SHAP Analysis for RANDOM FOREST (v0.20+ SAFE)
# ===============================

import shap

# Explain the random forest fitted in the previous cell
model = rf
explainer = shap.TreeExplainer(model)

# Calling the explainer returns an Explanation object; its `.values` array is
# indexed here as [sample, feature, class].
shap_values = explainer(X_test_sel)

# SHAP matrix of class 1, shared by both summary plots and the waterfall plot
attrition_shap = shap_values.values[:, :, 1]

# ===============================
# 1. Global Feature Importance
# ===============================
shap.summary_plot(attrition_shap, X_test_sel)

# ===============================
# 2. Feature Importance (Bar Plot)
# ===============================
shap.summary_plot(attrition_shap, X_test_sel, plot_type="bar")

# ===============================
# 3. Individual Employee Explanation
# ===============================
employee_index = 4  # positional row in X_test_sel; change if required

# Single-row, single-class Explanation for the waterfall plot
employee_explanation = shap.Explanation(
    values=attrition_shap[employee_index],
    base_values=explainer.expected_value[1],
    data=X_test_sel.iloc[employee_index],
    feature_names=X_test_sel.columns,
)
shap.plots.waterfall(employee_explanation)

**ANN**

In [None]:
# ===============================
# ANN Training, Risk Detection & Evaluation
# ===============================

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    roc_auc_score,
    roc_curve
)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# ===============================
# 1. Feature Scaling (MANDATORY for ANN)
# ===============================
# The scaler is fitted on the training split only and then applied to the
# test split, so no test statistics leak into the scaling.

scaler = StandardScaler()
X_train_ann = scaler.fit_transform(X_train_sel)
X_test_ann = scaler.transform(X_test_sel)

# ===============================
# 2. Train ANN Model
# ===============================
# Multi-layer perceptron with two hidden layers (64 and 32 units).

ann = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42
)

ann.fit(X_train_ann, y_train)

# ===============================
# 3. Predictions
# ===============================
# Hard labels, plus the second column of predict_proba (the probability of
# the positive class, assuming labels are encoded 0/1).

y_pred_ann = ann.predict(X_test_ann)
y_prob_ann = ann.predict_proba(X_test_ann)[:, 1]

# ===============================
# 4. Evaluation Metrics
# ===============================

# Computed once and reused for both the printout and the ROC legend.
auc_ann = roc_auc_score(y_test, y_prob_ann)

print("ANN Accuracy:", accuracy_score(y_test, y_pred_ann))
print("Precision:", precision_score(y_test, y_pred_ann))
print("Recall:", recall_score(y_test, y_pred_ann))
print("F1 Score:", f1_score(y_test, y_pred_ann))
print("ROC-AUC Score:", auc_ann)

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_ann))

# ===============================
# 5. Confusion Matrix
# ===============================
# Rows = actual class, columns = predicted class.

cm = confusion_matrix(y_test, y_pred_ann)

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("ANN Confusion Matrix")
plt.show()

# ===============================
# 6. ROC Curve
# ===============================
# The dashed diagonal is the chance-level reference.

fpr, tpr, _ = roc_curve(y_test, y_prob_ann)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"ANN (AUC = {auc_ann:.2f})")
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# Title previously contained a mis-encoded dash ("‚Äì"); restored to an en dash.
plt.title("ROC Curve – ANN")
plt.legend()
plt.show()

# ===============================
# 7. High-Risk Employee Detection
# ===============================
# The table keeps the UNSCALED feature values for readability and flags rows
# whose predicted probability is at least 0.7 as "High Risk".

risk_df_ann = X_test_sel.copy()
risk_df_ann["Attrition_Probability"] = y_prob_ann
risk_df_ann["Risk_Level"] = risk_df_ann["Attrition_Probability"].apply(
    lambda x: "High Risk" if x >= 0.7 else "Low Risk"
)

print("\nHigh-Risk Employees (Top 5):")
# Last expression of the cell: rendered as a rich table.
risk_df_ann.sort_values("Attrition_Probability", ascending=False).head()


In [None]:
# ===============================
# ANN + SHAP (KernelExplainer)
# ===============================

import shap
import numpy as np
import pandas as pd

# -------------------------------
# 1. Background data (scaled)
# -------------------------------
# KernelExplainer needs a small reference sample. A seeded generator makes the
# sample (and therefore the SHAP values) reproducible across runs, and the
# size is capped by the number of available training rows.
background_rng = np.random.default_rng(42)
n_background = min(50, X_train_ann.shape[0])
background = X_train_ann[
    background_rng.choice(X_train_ann.shape[0], size=n_background, replace=False)
]

# -------------------------------
# 2. KernelExplainer
# -------------------------------
# Model-agnostic explainer driven by the network's predict_proba.
explainer = shap.KernelExplainer(
    ann.predict_proba,
    background
)

# -------------------------------
# 3. Explain a small test subset (scaled)
# -------------------------------
# Only the first 50 test rows are explained to keep the run time manageable.
X_test_sample_ann = X_test_ann[:50]

shap_values = explainer.shap_values(X_test_sample_ann)

# Depending on the SHAP release, the result is either a list with one
# (samples, features) array per class or a single
# (samples, features, classes) array; keep the class-1 matrix in both cases.
if isinstance(shap_values, list):
    shap_vals_class1 = shap_values[1]
else:
    shap_vals_class1 = shap_values[:, :, 1]

# -------------------------------
# 4. Convert scaled data to DataFrame (IMPORTANT)
# -------------------------------
# Gives the plots feature names; the values shown are in standardized units.
X_test_sample_df = pd.DataFrame(
    X_test_sample_ann,
    columns=X_test_sel.columns
)

# -------------------------------
# 5. Global Feature Importance
# -------------------------------
shap.summary_plot(
    shap_vals_class1,
    X_test_sample_df
)

# -------------------------------
# 6. Feature Importance (Bar Plot)
# -------------------------------
shap.summary_plot(
    shap_vals_class1,
    X_test_sample_df,
    plot_type="bar"
)

# -------------------------------
# 7. Individual Employee Explanation
# -------------------------------
employee_index = 4  # positional row within the explained sample

# Plain arrays/lists are passed so the plot indexes the values positionally.
shap.plots.waterfall(
    shap.Explanation(
        values=shap_vals_class1[employee_index],
        base_values=explainer.expected_value[1],
        data=X_test_sample_df.iloc[employee_index].to_numpy(),
        feature_names=list(X_test_sample_df.columns)
    )
)

**XG-Boost**

In [None]:
# ===============================
# XGBoost Training, Risk Detection & Evaluation
# ===============================

import xgboost as xgb
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    roc_auc_score,
    roc_curve
)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# 1. Train XGBoost on the selected features
# `use_label_encoder` is no longer passed: the option was deprecated and then
# removed from XGBoost, where it only produces an "unused parameter" warning.
# The target is already integer-encoded by the label-encoding cell.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42
)
xgb_model.fit(X_train_sel, y_train)

# 2. Predictions: hard labels, plus the second column of predict_proba
#    (the probability of the positive class, assuming labels are encoded 0/1)
y_pred_xgb = xgb_model.predict(X_test_sel)
y_prob_xgb = xgb_model.predict_proba(X_test_sel)[:, 1]

# 3. Accuracy
accuracy = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Accuracy:", accuracy)

# 4. Confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred_xgb)
print("\nConfusion Matrix:\n", cm)

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("XGBoost Confusion Matrix")
plt.show()

# 5. Precision, recall and F1 score for the positive class
precision = precision_score(y_test, y_pred_xgb)
recall = recall_score(y_test, y_pred_xgb)
f1 = f1_score(y_test, y_pred_xgb)

print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# 6. Per-class classification report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_xgb))

# 7. ROC-AUC score (computed from the probabilities, not the hard labels)
auc = roc_auc_score(y_test, y_prob_xgb)
print("ROC-AUC Score:", auc)

# 8. ROC curve; the dashed diagonal is the chance-level reference
fpr, tpr, _ = roc_curve(y_test, y_prob_xgb)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"XGBoost (AUC = {auc:.2f})")
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# Title previously contained a mis-encoded dash ("‚Äì"); restored to an en dash.
plt.title("ROC Curve – XGBoost")
plt.legend()
plt.show()

# ===============================
# High-Risk Employee Detection
# ===============================

# Attach the predicted probability to each test row and flag rows whose
# probability is at least 0.7 as "High Risk".
risk_df_xgb = X_test_sel.copy()
risk_df_xgb["Attrition_Probability"] = y_prob_xgb
risk_df_xgb["Risk_Level"] = risk_df_xgb["Attrition_Probability"].apply(
    lambda x: "High Risk" if x >= 0.7 else "Low Risk"
)

print("\nHigh-Risk Employees (Top 5):")
print(risk_df_xgb.sort_values("Attrition_Probability", ascending=False).head())

In [None]:
# ===============================
# SHAP Analysis for XGBoost (v0.20+ SAFE)
# ===============================

import shap

# Explain the XGBoost classifier fitted in the previous cell
model = xgb_model
explainer = shap.TreeExplainer(model)

# Calling the explainer returns an Explanation object for the test rows
shap_values = explainer(X_test_sel)

# Raw SHAP array, shared by both summary plots
shap_matrix = shap_values.values

# ===============================
# 1. Global Feature Importance
# ===============================
shap.summary_plot(shap_matrix, X_test_sel)

# ===============================
# 2. Feature Importance (Bar Plot)
# ===============================
shap.summary_plot(shap_matrix, X_test_sel, plot_type="bar")

# ===============================
# 3. Individual Employee Explanation
# ===============================
employee_index = 4  # positional row in X_test_sel; change if required

# Indexing the Explanation yields the explanation of that single row
employee_explanation = shap_values[employee_index]
shap.plots.waterfall(employee_explanation)

# **Naive Bayes**

In [None]:
# ===============================
# Naive Bayes Training, Risk Detection & Evaluation
# ===============================

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    roc_auc_score,
    roc_curve
)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# 1. Train Gaussian Naive Bayes on the selected features
gnb = GaussianNB()
gnb.fit(X_train_sel, y_train)

# 2. Predictions: hard labels, plus the second column of predict_proba
#    (the probability of the positive class, assuming labels are encoded 0/1)
y_pred_gnb = gnb.predict(X_test_sel)
y_prob_gnb = gnb.predict_proba(X_test_sel)[:, 1]

# 3. Accuracy
accuracy = accuracy_score(y_test, y_pred_gnb)
print("Naive Bayes Accuracy:", accuracy)

# 4. Confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred_gnb)
print("\nConfusion Matrix:\n", cm)

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Naive Bayes Confusion Matrix")
plt.show()

# 5. Precision, recall and F1 score for the positive class
precision = precision_score(y_test, y_pred_gnb)
recall = recall_score(y_test, y_pred_gnb)
f1 = f1_score(y_test, y_pred_gnb)

print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# 6. Per-class classification report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_gnb))

# 7. ROC-AUC score (computed from the probabilities, not the hard labels)
auc = roc_auc_score(y_test, y_prob_gnb)
print("ROC-AUC Score:", auc)

# 8. ROC curve; the dashed diagonal is the chance-level reference
fpr, tpr, _ = roc_curve(y_test, y_prob_gnb)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"Naive Bayes (AUC = {auc:.2f})")
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# Title previously contained a mis-encoded dash ("‚Äì"); restored to an en dash.
plt.title("ROC Curve – Naive Bayes")
plt.legend()
plt.show()

# ===============================
# High-Risk Employee Detection
# ===============================

# Attach the predicted probability to each test row and flag rows whose
# probability is at least 0.7 as "High Risk".
risk_df_gnb = X_test_sel.copy()
risk_df_gnb["Attrition_Probability"] = y_prob_gnb
risk_df_gnb["Risk_Level"] = risk_df_gnb["Attrition_Probability"].apply(
    lambda x: "High Risk" if x >= 0.7 else "Low Risk"
)

print("\nHigh-Risk Employees (Top 5):")
print(risk_df_gnb.sort_values("Attrition_Probability", ascending=False).head())

**SHAP ANALYSIS (Naive Bayes)**

In [None]:
# ===============================
# Naive Bayes + SHAP (KernelExplainer)
# ===============================

import shap
import numpy as np
import pandas as pd

# -------------------------------
# 1. Background data
# -------------------------------
# KernelExplainer needs a small reference sample of the training data. A
# seeded generator makes the sample (and therefore the SHAP values)
# reproducible across runs, and the size is capped by the number of
# available training rows.
background_rng = np.random.default_rng(42)
n_background = min(50, X_train_sel.shape[0])
background = X_train_sel.iloc[
    background_rng.choice(X_train_sel.shape[0], size=n_background, replace=False)
]

# -------------------------------
# 2. KernelExplainer
# -------------------------------
# Model-agnostic explainer driven by the classifier's predict_proba.
explainer = shap.KernelExplainer(
    gnb.predict_proba,
    background
)

# -------------------------------
# 3. Explain a small test subset
# -------------------------------
# Only the first 50 test rows are explained to keep the run time manageable.
X_test_sample_gnb = X_test_sel.iloc[:50]

shap_values = explainer.shap_values(X_test_sample_gnb)

# Depending on the SHAP release, the result is either a list with one
# (samples, features) array per class or a single
# (samples, features, classes) array; keep the class-1 matrix in both cases.
if isinstance(shap_values, list):
    shap_vals_class1 = shap_values[1]
else:
    shap_vals_class1 = shap_values[:, :, 1]

# -------------------------------
# 4. Global Feature Importance
# -------------------------------
shap.summary_plot(
    shap_vals_class1,
    X_test_sample_gnb
)

# -------------------------------
# 5. Feature Importance (Bar Plot)
# -------------------------------
shap.summary_plot(
    shap_vals_class1,
    X_test_sample_gnb,
    plot_type="bar"
)

# -------------------------------
# 6. Individual Employee Explanation
# -------------------------------
employee_index = 4  # positional row within the explained sample

# Plain arrays/lists are passed so the plot indexes the values positionally.
shap.plots.waterfall(
    shap.Explanation(
        values=shap_vals_class1[employee_index],
        base_values=explainer.expected_value[1],
        data=X_test_sample_gnb.iloc[employee_index].to_numpy(),
        feature_names=list(X_test_sample_gnb.columns)
    )
)

# Logistic Regression

**Lasso (L1-regularized) Logistic Regression**

In [None]:
# ===============================
# Logistic Regression (Lasso L1 Regularization)
# Training, Risk Detection & Evaluation
# ===============================

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    roc_auc_score,
    roc_curve
)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# ===============================
# 1. Feature Scaling (Important for L1 regularization)
# ===============================
# The scaler is fitted on the training split only and then applied to the
# test split, so no test statistics leak into the scaling.

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sel)
X_test_scaled = scaler.transform(X_test_sel)

# ===============================
# 2. Train Logistic Regression with L1 Regularization (Lasso)
# ===============================

# C is the inverse of regularization strength; smaller values specify stronger regularization.
# solver='liblinear' supports L1 regularization for binary classification.
lasso_lr = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=0.1, # You can tune this parameter
    random_state=42
)
lasso_lr.fit(X_train_scaled, y_train)

# ===============================
# 3. Predictions
# ===============================
# Hard labels, plus the second column of predict_proba (the probability of
# the positive class, assuming labels are encoded 0/1).

y_pred_lasso = lasso_lr.predict(X_test_scaled)
y_prob_lasso = lasso_lr.predict_proba(X_test_scaled)[:, 1]

# ===============================
# 4. Evaluation Metrics
# ===============================

# Computed once and reused for both the printout and the ROC legend.
auc_lasso = roc_auc_score(y_test, y_prob_lasso)

print("Lasso Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lasso))
print("Precision:", precision_score(y_test, y_pred_lasso))
print("Recall:", recall_score(y_test, y_pred_lasso))
print("F1 Score:", f1_score(y_test, y_pred_lasso))
print("ROC-AUC Score:", auc_lasso)

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_lasso))

# ===============================
# 5. Confusion Matrix
# ===============================
# Rows = actual class, columns = predicted class.

cm = confusion_matrix(y_test, y_pred_lasso)

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Lasso Logistic Regression Confusion Matrix")
plt.show()

# ===============================
# 6. ROC Curve
# ===============================
# The dashed diagonal is the chance-level reference.

fpr, tpr, _ = roc_curve(y_test, y_prob_lasso)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"Lasso Logistic Regression (AUC = {auc_lasso:.2f})")
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# Title previously contained a mis-encoded dash ("‚Äì"); restored to an en dash.
plt.title("ROC Curve – Lasso Logistic Regression")
plt.legend()
plt.show()

# ===============================
# 7. High-Risk Employee Detection
# ===============================
# The table keeps the UNSCALED feature values for readability and flags rows
# whose predicted probability is at least 0.7 as "High Risk".

risk_df_lasso = X_test_sel.copy()
risk_df_lasso["Attrition_Probability"] = y_prob_lasso
risk_df_lasso["Risk_Level"] = risk_df_lasso["Attrition_Probability"].apply(
    lambda x: "High Risk" if x >= 0.7 else "Low Risk"
)

print("\nHigh-Risk Employees (Top 5):")
print(risk_df_lasso.sort_values("Attrition_Probability", ascending=False).head())

**SHAP ANALYSIS (Lasso Logistic Regression)**

In [None]:
# ===============================
# SHAP Analysis for Lasso Logistic Regression (LinearExplainer)
# ===============================

import shap
import numpy as np
import pandas as pd

# -------------------------------
# 1. Create SHAP explainer for linear models
# -------------------------------
# LinearExplainer takes the fitted model and background data; the scaled
# training matrix is wrapped in an Independent masker. Feature names are
# passed so the plots are labelled.
explainer = shap.LinearExplainer(
    lasso_lr,
    shap.maskers.Independent(X_train_scaled),
    feature_names=X_train_sel.columns
)

# -------------------------------
# 2. Compute SHAP values for the test set
# -------------------------------
# NOTE(review): the rest of this cell treats the result as a single
# (samples, features) matrix for the positive class - confirm for the
# installed SHAP version.
shap_values = explainer.shap_values(X_test_scaled)

# -------------------------------
# 3. Convert scaled data back to DataFrame for plotting with feature names
# -------------------------------
# The values shown in the plots are therefore in standardized units.
X_test_scaled_df = pd.DataFrame(
    X_test_scaled,
    columns=X_test_sel.columns
)

# ===============================
# 4. Global Feature Importance (Summary Plot)
# ===============================
shap.summary_plot(
    shap_values,
    X_test_scaled_df
)

# ===============================
# 5. Feature Importance (Bar Plot)
# ===============================
shap.summary_plot(
    shap_values,
    X_test_scaled_df,
    plot_type="bar"
)

# ===============================
# 6. Individual Employee Explanation (Force Plot)
# ===============================

employee_index = 4  # positional row in the test set; change if required

# The interactive force plot needs SHAP's JavaScript loaded in the notebook;
# without initjs() the output reads "Visualization omitted".
shap.initjs()

# Must remain the last expression of the cell so the plot is rendered.
shap.plots.force(
    explainer.expected_value,
    shap_values[employee_index],
    X_test_scaled_df.iloc[employee_index]
)

**Ridge (L2-regularized) Logistic Regression**

In [None]:
# ===============================
# Logistic Regression (Ridge L2 Regularization)
# Training, Risk Detection & Evaluation
# ===============================

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    roc_auc_score,
    roc_curve
)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# ===============================
# 1. Feature Scaling (Important for L2 regularization)
# ===============================
# The scaler is fitted on the training split only and then applied to the
# test split, so no test statistics leak into the scaling.

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sel)
X_test_scaled = scaler.transform(X_test_sel)

# ===============================
# 2. Train Logistic Regression with L2 Regularization (Ridge)
# ===============================

# C is the inverse of regularization strength; smaller values specify stronger regularization.
# solver='liblinear' supports L2 regularization for binary classification.
ridge_lr = LogisticRegression(
    penalty='l2',
    solver='liblinear',
    C=1.0, # You can tune this parameter
    random_state=42
)
ridge_lr.fit(X_train_scaled, y_train)

# ===============================
# 3. Predictions
# ===============================
# Hard labels, plus the second column of predict_proba (the probability of
# the positive class, assuming labels are encoded 0/1).

y_pred_ridge = ridge_lr.predict(X_test_scaled)
y_prob_ridge = ridge_lr.predict_proba(X_test_scaled)[:, 1]

# ===============================
# 4. Evaluation Metrics
# ===============================

# Computed once and reused for both the printout and the ROC legend.
auc_ridge = roc_auc_score(y_test, y_prob_ridge)

print("Ridge Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_ridge))
print("Precision:", precision_score(y_test, y_pred_ridge))
print("Recall:", recall_score(y_test, y_pred_ridge))
print("F1 Score:", f1_score(y_test, y_pred_ridge))
print("ROC-AUC Score:", auc_ridge)

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_ridge))

# ===============================
# 5. Confusion Matrix
# ===============================
# Rows = actual class, columns = predicted class.

cm = confusion_matrix(y_test, y_pred_ridge)

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Ridge Logistic Regression Confusion Matrix")
plt.show()

# ===============================
# 6. ROC Curve
# ===============================
# The dashed diagonal is the chance-level reference.

fpr, tpr, _ = roc_curve(y_test, y_prob_ridge)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"Ridge Logistic Regression (AUC = {auc_ridge:.2f})")
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# Title previously contained a mis-encoded dash ("‚Äì"); restored to an en dash.
plt.title("ROC Curve – Ridge Logistic Regression")
plt.legend()
plt.show()

# ===============================
# 7. High-Risk Employee Detection
# ===============================
# The table keeps the UNSCALED feature values for readability and flags rows
# whose predicted probability is at least 0.7 as "High Risk".

risk_df_ridge = X_test_sel.copy()
risk_df_ridge["Attrition_Probability"] = y_prob_ridge
risk_df_ridge["Risk_Level"] = risk_df_ridge["Attrition_Probability"].apply(
    lambda x: "High Risk" if x >= 0.7 else "Low Risk"
)

print("\nHigh-Risk Employees (Top 5):")
print(risk_df_ridge.sort_values("Attrition_Probability", ascending=False).head())

**SHAP ANALYSIS (Ridge Logistic Regression)**

In [None]:
# ===============================
# SHAP Analysis for Ridge Logistic Regression (LinearExplainer)
# ===============================

import shap
import numpy as np
import pandas as pd

# -------------------------------
# 1. Create SHAP explainer for linear models
# -------------------------------
# The scaled training matrix, wrapped in an Independent masker, is the background
# data; the selected column names are passed so plots can label the features.
ridge_background = shap.maskers.Independent(X_train_scaled)
explainer = shap.LinearExplainer(
    ridge_lr,
    ridge_background,
    feature_names=X_train_sel.columns,
)

# -------------------------------
# 2. Compute SHAP values for the test set
# -------------------------------
# NOTE(review): expected to be one 2-D array (test rows x features) for this binary
# model — confirm against the installed shap version.
shap_values = explainer.shap_values(X_test_scaled)

# -------------------------------
# 3. Wrap the scaled test matrix in a DataFrame so plots show feature names
# -------------------------------
# No index is passed, so rows get a fresh 0..n-1 index rather than X_test_sel's labels.
X_test_scaled_df = pd.DataFrame(data=X_test_scaled, columns=X_test_sel.columns)

# ===============================
# 4. Global Feature Importance (Summary Plot)
# ===============================
# Default summary plot of the ridge model's SHAP values against the scaled test features.
shap.summary_plot(shap_values, X_test_scaled_df)

# ===============================
# 5. Feature Importance (Bar Plot)
# ===============================
# Same inputs, drawn with plot_type="bar".
shap.summary_plot(shap_values, X_test_scaled_df, plot_type="bar")

# ===============================
# 6. Individual Employee Explanation (Force Plot)
# ===============================

# Positional (0-based) row of the test set to explain, not an original DataFrame label.
employee_index = 4  # change index if required

# The interactive force plot is rendered with JavaScript. Load SHAP's JS bundle in this
# same cell; otherwise the output is replaced by a "Javascript library not loaded"
# notice (hosted notebooks such as Colab need this in every plotting cell).
shap.initjs()

# Base value + this employee's per-feature SHAP values, labelled with the scaled features.
shap.plots.force(
    explainer.expected_value,
    shap_values[employee_index],
    X_test_scaled_df.iloc[employee_index]
)

**ELASTIC NET LOGISTIC REGRESSION (L1 + L2 MIX)**

In [None]:
# ===============================
# Logistic Regression (Elastic Net L1 + L2 Regularization)
# Training, Risk Detection & Evaluation
# ===============================

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    roc_auc_score,
    roc_curve
)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# ===============================
# 1. Feature Scaling (Important for Regularization)
# ===============================

# Learn the scaling statistics from the training features only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train_sel)
X_train_scaled = scaler.transform(X_train_sel)
X_test_scaled = scaler.transform(X_test_sel)

# ===============================
# 2. Train Logistic Regression with Elastic Net Regularization
# ===============================

# Hyper-parameters, gathered in one place so they are easy to tune:
#   penalty  - 'elasticnet' blends the L1 and L2 penalties.
#   solver   - 'saga' is required for the elasticnet penalty.
#   l1_ratio - mixing parameter in [0, 1]; 0 is equivalent to L2 (Ridge),
#              1 is equivalent to L1 (Lasso).
#   C        - inverse of regularization strength; smaller means stronger regularization.
#   max_iter - iteration cap; increase it if the solver does not converge.
elastic_net_params = {
    "penalty": "elasticnet",
    "solver": "saga",
    "l1_ratio": 0.5,
    "C": 0.1,
    "random_state": 42,
    "max_iter": 1000,
}

# fit() returns the fitted estimator, so construction and training can be chained.
elastic_lr = LogisticRegression(**elastic_net_params).fit(X_train_scaled, y_train)

# ===============================
# 3. Predictions
# ===============================

# Hard class labels, plus the second predict_proba column (probability of the class
# listed second in elastic_lr.classes_ — presumably "attrition"; confirm).
y_pred_elastic = elastic_lr.predict(X_test_scaled)
y_prob_elastic = elastic_lr.predict_proba(X_test_scaled)[:, 1]

# ===============================
# 4. Evaluation Metrics
# ===============================

# (label, metric function, model output it is scored on), in print order.
# Label-based metrics consume the hard predictions; ROC-AUC consumes the probabilities.
elastic_metric_specs = [
    ("Elastic Net Logistic Regression Accuracy", accuracy_score, y_pred_elastic),
    ("Precision", precision_score, y_pred_elastic),
    ("Recall", recall_score, y_pred_elastic),
    ("F1 Score", f1_score, y_pred_elastic),
    ("ROC-AUC Score", roc_auc_score, y_prob_elastic),
]
for metric_label, metric_fn, metric_input in elastic_metric_specs:
    print(f"{metric_label}:", metric_fn(y_test, metric_input))

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_elastic))

# ===============================
# 5. Confusion Matrix
# ===============================

# Counts of actual vs. predicted labels on the test set.
cm = confusion_matrix(y_test, y_pred_elastic)

# Draw the annotated heatmap on an explicitly created 5x4 axes.
cm_fig, cm_ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=cm_ax)
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
cm_ax.set_title("Elastic Net Logistic Regression Confusion Matrix")
plt.show()

# ===============================
# 6. ROC Curve
# ===============================

# ROC points and the area under the curve, both from the positive-class probabilities.
fpr, tpr, _ = roc_curve(y_test, y_prob_elastic)
elastic_auc = roc_auc_score(y_test, y_prob_elastic)  # computed once, reused in the legend

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"Elastic Net Logistic Regression (AUC = {elastic_auc:.2f})")
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0,1], [0,1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# \u2013 is an en dash; written as an escape so the title survives re-encoding of the file.
plt.title("ROC Curve \u2013 Elastic Net Logistic Regression")
plt.legend()
plt.show()

# ===============================
# 7. High-Risk Employee Detection
# ===============================

# Copy of the selected test features with the predicted attrition probability attached.
risk_df_elastic = X_test_sel.assign(Attrition_Probability=y_prob_elastic)

# Probabilities of 0.7 or more are labelled "High Risk"; everything else "Low Risk".
is_high_risk_elastic = risk_df_elastic["Attrition_Probability"].ge(0.7)
risk_df_elastic["Risk_Level"] = is_high_risk_elastic.map(
    {True: "High Risk", False: "Low Risk"}
)

print("\nHigh-Risk Employees (Top 5):")
# Five rows with the highest predicted probability (head() defaults to 5 rows).
top_risk_elastic = risk_df_elastic.sort_values("Attrition_Probability", ascending=False).head()
print(top_risk_elastic)

**SHAP ANALYSIS (Elastic Net Logistic Regression)**

In [None]:
# ===============================
# SHAP Analysis for Elastic Net Logistic Regression (LinearExplainer)
# ===============================

import shap
import numpy as np
import pandas as pd

# -------------------------------
# 1. Create SHAP explainer for linear models
# -------------------------------
# The scaled training matrix, wrapped in an Independent masker, is the background
# data; the selected column names are passed so plots can label the features.
elastic_background = shap.maskers.Independent(X_train_scaled)
explainer = shap.LinearExplainer(
    elastic_lr,
    elastic_background,
    feature_names=X_train_sel.columns,
)

# -------------------------------
# 2. Compute SHAP values for the test set
# -------------------------------
# NOTE(review): expected to be one 2-D array (test rows x features) for this binary
# model — confirm against the installed shap version.
shap_values = explainer.shap_values(X_test_scaled)

# -------------------------------
# 3. Wrap the scaled test matrix in a DataFrame so plots show feature names
# -------------------------------
# No index is passed, so rows get a fresh 0..n-1 index rather than X_test_sel's labels.
X_test_scaled_df = pd.DataFrame(data=X_test_scaled, columns=X_test_sel.columns)

# ===============================
# 4. Global Feature Importance (Summary Plot)
# ===============================
# Default summary plot of the elastic-net model's SHAP values against the scaled test features.
shap.summary_plot(shap_values, X_test_scaled_df)

# ===============================
# 5. Feature Importance (Bar Plot)
# ===============================
# Same inputs, drawn with plot_type="bar".
shap.summary_plot(shap_values, X_test_scaled_df, plot_type="bar")

# ===============================
# 6. Individual Employee Explanation (Force Plot)
# ===============================

# Positional (0-based) row of the test set to explain, not an original DataFrame label.
employee_index = 4  # change index if required

# The interactive force plot is rendered with JavaScript. Load SHAP's JS bundle in this
# same cell; otherwise the output is replaced by a "Javascript library not loaded"
# notice (hosted notebooks such as Colab need this in every plotting cell).
shap.initjs()

# Base value + this employee's per-feature SHAP values, labelled with the scaled features.
shap.plots.force(
    explainer.expected_value,
    shap_values[employee_index],
    X_test_scaled_df.iloc[employee_index]
)