# Employee Attrition ML (IBM HR) — Notebook
Ce notebook reprend exactement le pipeline du script `employee_attrition_ml.py`, mais **sans classes ni méthodes** (code linéaire), découpé en cellules exécutables.

In [None]:
# Imports + paramètres généraux (graphiques, warnings, métriques)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import warnings

warnings.filterwarnings('ignore')

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression

# XGBoost is optional: record whether the import worked so the XGBoost cell
# can skip itself instead of failing.
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("Warning: XGBoost not installed. Install with: pip install xgboost")

# Plot styling (same as in the script)
sns.set_theme(style="darkgrid", palette="crest_r")
plt.rcParams['figure.figsize'] = (10, 6)

# On Windows, try to switch stdout to UTF-8 (if needed).
# This is best effort: the stream may not expose `reconfigure` at all
# (AttributeError) or may refuse the change (ValueError / OSError, which also
# covers io.UnsupportedOperation). Only those errors are ignored, so
# KeyboardInterrupt and SystemExit are no longer swallowed.
if sys.platform == 'win32':
    try:
        sys.stdout.reconfigure(encoding='utf-8')
    except (AttributeError, ValueError, OSError):
        pass


In [None]:
# Load the dataset and print basic facts (shape, Attrition distribution)
DATA_PATH = "WA_Fn-UseC_-HR-Employee-Attrition.csv"  # adjust the path if needed

df_original = pd.read_csv(DATA_PATH)

rule = "=" * 80
print(rule)
print("LOADING EMPLOYEE ATTRITION DATASET")
print(rule)

n_rows, n_cols = df_original.shape
print("\n[OK] Dataset loaded successfully")
print(f"  Shape: {n_rows} rows  x  {n_cols} columns")

attrition = df_original['Attrition']
print("\n  Attrition distribution:")
print(attrition.value_counts())

# Share of rows whose Attrition value is 'Yes'.
yes_share = attrition.value_counts(normalize=True)['Yes']
print(f"\n  Attrition rate: {yes_share:.2%}")


In [None]:
# General structure inspection + imputation rules (robustness)
# Works on a copy so df_original stays untouched.
df = df_original.copy()

print("\n" + "="*80)
print("RIGOROUS DATA STRUCTURING & EDA")
print("="*80)

print("\n1. INSPECTION GÉNÉRALE (Data Overview)")
print("-" * 40)
print(f"Data Shape: {df.shape}")

print("\nNull Values per Column:")
null_counts = df.isnull().sum()
print(null_counts[null_counts > 0] if null_counts.sum() > 0 else "  No missing values found.")

print("\n2. EXPLICIT CLEANING & IMPUTATION")
print("-" * 40)
print("  Even if clean, we enforce imputation rules for robustness:")

# Impute categorical columns with their most frequent value (mode).
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)`
# operates on a temporary Series, which warns on recent pandas and leaves `df`
# unchanged once Copy-on-Write is active.
cat_cols = df.select_dtypes(include=['object']).columns
print(f"  Imputing Categorical Columns (Mode): {list(cat_cols[:3])}...")
for col in cat_cols:
    if df[col].isnull().sum() > 0:
        modes = df[col].mode()
        # An all-NaN column has no mode; leave it as is rather than raising.
        if not modes.empty:
            df[col] = df[col].fillna(modes[0])

# Impute numerical columns with their median.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
print(f"  Imputing Numerical Columns (Median): {list(num_cols[:3])}...")
for col in num_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].median())

print("[OK] Imputation step complete")


In [None]:
# Outlier treatment (IQR method) on the key variables (winsorisation):
# values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are replaced by the nearest bound.
print("\n3. OUTLIER TREATMENT (IQR Method)")
print("-" * 40)

target_outlier_cols = ['MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany', 'Age', 'DistanceFromHome']

for col in target_outlier_cols:
    if col not in df.columns:
        continue

    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    spread = q3 - q1
    lower_bound = q1 - 1.5 * spread
    upper_bound = q3 + 1.5 * spread

    # Boolean masks of the rows falling outside each fence.
    too_low = df[col] < lower_bound
    too_high = df[col] > upper_bound
    n_low = too_low.sum()
    n_high = too_high.sum()

    if n_low == 0 and n_high == 0:
        print(f"  [CLEAN] {col}: No outliers detected inside 1.5*IQR range")
        continue

    print(f"  [OUTLIERS] {col}: Capping {n_low} low and {n_high} high values")
    df[col] = np.where(
        too_low, lower_bound,
        np.where(too_high, upper_bound, df[col])
    )

print("[OK] Outlier treatment complete")


In [None]:
# EDA plots (1): distributions (histograms) — saved under eda_plots/
import os
os.makedirs('eda_plots', exist_ok=True)

print("\n4. GENERATING DETAILED VISUALIZATIONS")
print("-" * 40)
print("  A. Generating Histograms...")

hist_columns = ('Age', 'MonthlyIncome', 'DistanceFromHome', 'YearsAtCompany')
for name in hist_columns:
    # One figure per variable: histogram with a KDE overlay.
    fig = plt.figure(figsize=(8, 4))
    ax = sns.histplot(df[name], kde=True, bins=30, color='skyblue')
    ax.set_title(f"Distribution: {name}")
    fig.savefig(f"eda_plots/hist_{name}.png")
    plt.show()


In [None]:
# EDA plots (2): boxplots against Attrition — saved under eda_plots/
print("  B. Generating Boxplots (vs Attrition)...")

box_columns = ('Age', 'MonthlyIncome', 'YearsAtCompany')
for name in box_columns:
    # One figure per variable, split by the Attrition value on the x axis.
    fig = plt.figure(figsize=(8, 4))
    ax = sns.boxplot(x="Attrition", y=name, data=df, palette="Set2")
    ax.set_title(f"{name} vs Attrition")
    fig.savefig(f"eda_plots/box_{name}.png")
    plt.show()


In [None]:
# EDA plots (3): correlation between numeric features — saved under eda_plots/
print("  C. Generating Correlation Matrix...")

num_only = df.select_dtypes(include=['number'])
if num_only.empty:
    print("No numerical columns found for correlation matrix.")
else:
    corr_mx = num_only.corr()
    fig = plt.figure(figsize=(12, 10))
    # No per-cell annotations: only the colour scale is shown.
    ax = sns.heatmap(corr_mx, annot=False, cmap='coolwarm', cbar=True)
    ax.set_title("Full Feature Correlation Matrix")
    fig.savefig("eda_plots/correlation_matrix_full.png")
    plt.show()


In [None]:
# EDA plots (4): pairplot of the key variables — saved under eda_plots/
print("  D. Generating Pairplot (Key Features)...")

key_vars = ['Age', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany']

# Keep only the key variables plus the target, which drives the colouring.
df_pair = df[[*key_vars, 'Attrition']].copy()

sns.pairplot(df_pair, hue="Attrition", diag_kind="kde", palette="bright")
plt.savefig("eda_plots/pairplot_key_features.png")
plt.show()

print("\n[OK] All rigorous EDA steps completed. Plots saved in 'eda_plots/' folder.")


In [None]:
# Feature reduction: 35 → 12 (constants + correlation + domain knowledge)
rule = "=" * 80
print("\n" + rule)
print("FEATURE REDUCTION: 35 → 12")
print(rule)

df_original_clean = df.copy()
df_reduction = df_original_clean.copy()

# 1) Constant columns
print("\n1. Analyzing Constant/Single-Value Columns:")

# Columns holding at most one distinct value...
constant_cols = [c for c in df_reduction.columns if df_reduction[c].nunique() <= 1]

# ...plus the columns listed here, which are dropped under the same label
# whenever they are present, even if the check above did not flag them.
potential_constants = ['EmployeeCount', 'Over18', 'StandardHours']
constant_cols += [
    c for c in potential_constants
    if c in df_reduction.columns and c not in constant_cols
]

# One status record per feature: Feature, Status, Reason, Correlation
feature_status = [
    {'Feature': c, 'Status': 'Dropped', 'Reason': 'Single Value (Constant)', 'Correlation': 0.0}
    for c in constant_cols
]

print(f"   Found {len(constant_cols)} constant columns: {constant_cols}")

# 2) Correlations (on a fully encoded temporary copy)
print("\n2. Analyzing Correlations for ALL Features:")

df_analysis = df_original_clean.copy()

# Numeric target for the correlation: Yes -> 1, No -> 0
if df_analysis['Attrition'].dtype == 'object':
    df_analysis['Attrition'] = df_analysis['Attrition'].map({'Yes': 1, 'No': 0})

# Label-encode every remaining object column (the target is never re-encoded)
object_cols = df_analysis.select_dtypes(include=['object']).columns
for name in (c for c in object_cols if c != 'Attrition'):
    df_analysis[name] = LabelEncoder().fit_transform(df_analysis[name].astype(str))

df_analysis = df_analysis.drop(columns=constant_cols, errors='ignore')

# Correlation of each column with the target, highest first, target removed.
all_correlations = (
    df_analysis.corrwith(df_analysis['Attrition'])
    .sort_values(ascending=False)
    .drop('Attrition', errors='ignore')
)

# Hand-picked feature set retained for modelling.
features_to_keep = [
    'Age', 'Gender', 'MonthlyIncome', 'JobSatisfaction',
    'EnvironmentSatisfaction', 'WorkLifeBalance', 'OverTime',
    'YearsAtCompany', 'JobRole', 'Department', 'MaritalStatus',
    'DistanceFromHome'
]

for feature, corr_val in all_correlations.items():
    if feature in constant_cols:
        continue
    if feature in features_to_keep:
        status, reason = 'Kept', 'Domain Importance & Correlation'
    elif abs(corr_val) < 0.05:
        status, reason = 'Dropped', 'Low Correlation (<0.05)'
    else:
        status, reason = 'Dropped', 'Redundant/Multicollinearity'
    feature_status.append(
        dict(Feature=feature, Status=status, Reason=reason, Correlation=corr_val)
    )

# Status table ordered with 'Kept' rows first, then by decreasing |correlation|.
df_status = (
    pd.DataFrame(feature_status)
    .assign(AbsCorr=lambda frame: frame['Correlation'].abs())
    .sort_values(['Status', 'AbsCorr'], ascending=[False, False])
)

rule = "=" * 80
dashes = "-" * 80

print("\n" + rule)
print("FEATURE REDUCTION REPORT (35 Features)")
print(rule)
print(f"{'Feature':<25} {'Status':<10} {'Correlation':<12} {'Reason'}")
print(dashes)
report_rows = zip(
    df_status['Feature'], df_status['Status'],
    df_status['Correlation'], df_status['Reason']
)
for feat, state, corr, why in report_rows:
    print(f"{feat:<25} {state:<10} {corr:<12.4f} {why}")
print(dashes)

n_kept = (df_status['Status'] == 'Kept').sum()
n_dropped = (df_status['Status'] == 'Dropped').sum()
print(f"Total Features: {len(feature_status)}")
print(f"Kept: {n_kept}")
print(f"Dropped: {n_dropped}")

# Visualisation (feature_importance_all.png): one horizontal bar per feature,
# coloured by whether the feature was kept or dropped.
from matplotlib.patches import Patch

plot_data = df_status.sort_values('Correlation', ascending=True)
colors = np.where(plot_data['Status'] == 'Kept', 'green', 'red').tolist()

fig, ax = plt.subplots(figsize=(14, 10))
ax.barh(plot_data['Feature'], plot_data['Correlation'], color=colors)

legend_handles = [
    Patch(facecolor='green', label='Kept Features'),
    Patch(facecolor='red', label='Dropped Features'),
]
ax.legend(handles=legend_handles, loc='lower right')

ax.set_title('Feature Correlation with Attrition (All 34 Features)\nGreen = Kept, Red = Dropped', fontsize=14)
ax.set_xlabel('Correlation Coefficient with Attrition')
ax.grid(axis='x', linestyle='--', alpha=0.5)
ax.axvline(x=0, color='black', linewidth=0.8)
fig.tight_layout()
fig.savefig('feature_importance_all.png', dpi=300)
plt.show()
print("\n[OK] Comprehensive feature graph saved to: feature_importance_all.png")

# Apply the reduction: keep the selected features plus the target.
selected_columns = [*features_to_keep, 'Attrition']
df_processed = df_original_clean[selected_columns].copy()
print("[OK] Reduced dataframe shape:", df_processed.shape)


In [None]:
# Encode the categorical features and the Attrition target
rule = "=" * 80
print("\n" + rule)
print("ENCODING CATEGORICAL FEATURES")
print(rule)

df_encoded = df_processed.copy()

# Object-typed feature columns (the target is handled separately below).
categorical_cols = [
    c for c in df_encoded.select_dtypes(include=['object']).columns.tolist()
    if c != 'Attrition'
]

print(f"\nCategorical columns to encode: {categorical_cols}")

# Target encoding
le_target = LabelEncoder()
df_encoded['Attrition'] = le_target.fit_transform(df_encoded['Attrition'])
print("\n[OK] Target encoding: Yes=1, No=0")

# Explicit 0/1 mappings for the two binary columns
binary_maps = {
    'Gender': {'Male': 0, 'Female': 1},
    'OverTime': {'No': 0, 'Yes': 1},
}
for col, mapping in binary_maps.items():
    if col in df_encoded.columns:
        df_encoded[col] = df_encoded[col].map(mapping)
        described = ", ".join(f"{label}={code}" for label, code in mapping.items())
        print(f"[OK] {col} encoding: {described}")

# Label-encode the other categorical columns, keeping each fitted encoder
remaining_categorical = [c for c in categorical_cols if c not in ('Gender', 'OverTime')]
encoders = {}

for col in remaining_categorical:
    encoder = LabelEncoder()
    df_encoded[col] = encoder.fit_transform(df_encoded[col].astype(str))
    encoders[col] = encoder
    print(f"[OK] {col} encoded: {len(encoder.classes_)} categories")

print("[OK] Encoding complete")
df_encoded.head()


In [None]:
# 80% train / 20% test split + scaling (same as in the script)
rule = "=" * 80
print("\n" + rule)
print("TRAIN-TEST SPLIT")
print(rule)

y = df_encoded['Attrition']
X = df_encoded.drop(columns='Attrition')

test_size = 0.2
random_state = 42

# Stratified on the target so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state, stratify=y
)

n_train = len(y_train)
n_stayed = (y_train == 0).sum()
n_left = (y_train == 1).sum()

print(f"\n[OK] Split ratio: {int((1-test_size)*100)}-{int(test_size*100)}")
print(f"  Training set: {X_train.shape[0]} samples")
print(f"  Test set: {X_test.shape[0]} samples")
print("\n  Class distribution (training):")
print(f"    No attrition (0): {n_stayed} ({n_stayed/n_train:.2%})")
print(f"    Attrition (1):    {n_left} ({n_left/n_train:.2%})")

# The scaler learns its statistics from the training part only and is then
# applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n[OK] Features scaled for distance-based models (K-NN, SVM)")


In [None]:
# Checking K (Elbow Method) — saves knn_elbow_method.png
# NOTE(review): each K is scored on the test split, so the curve is not an
# independent estimate; the value found here is only printed, the models
# below keep K=5.
rule = "=" * 80
print("\n" + rule)
print("FINDING OPTIMAL K (ELBOW METHOD)")
print(rule)

max_k = 40
k_values = range(1, max_k + 1)

# Misclassification rate on the test split for every K in 1..max_k.
error_rates = [
    np.mean(
        KNeighborsClassifier(n_neighbors=k)
        .fit(X_train_scaled, y_train)
        .predict(X_test_scaled) != y_test
    )
    for k in k_values
]

plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rates, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value (Elbow Method)')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.savefig('knn_elbow_method.png')
plt.show()

# First K reaching the lowest error (K is 1-based, list positions are 0-based).
lowest_error = min(error_rates)
optimal_k = error_rates.index(lowest_error) + 1
print(f"Minimum error rate: {lowest_error:.4f} at K={optimal_k}")
print("Chosen K=5 is a common default that often balances bias/variance well.")
print("[OK] Elbow Method plot saved to 'knn_elbow_method.png'")


In [None]:
# Containers filled by the model cells: fitted models and their metrics,
# both keyed by model name.
models, results = {}, {}


In [None]:
# MODEL 1: K-NN (k=5) — training + evaluation
banner = "=" * 80
print("\n" + banner)
print("MODEL 1: K-NEAREST NEIGHBORS (K=5)")
print(banner)

MODEL_NAME = "K-NN (k=5)"

# Fitted and evaluated on the standardized features.
model = KNeighborsClassifier(n_neighbors=5)
y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
models[MODEL_NAME] = model

# Test-set metrics, stored under the model name for the comparison cells.
scorers = {'Precision': precision_score, 'Recall': recall_score, 'F1-Score': f1_score}
metrics = {'Accuracy': accuracy_score(y_test, y_pred)}
metrics.update({name: fn(y_test, y_pred, zero_division=0) for name, fn in scorers.items()})
metrics['Confusion Matrix'] = confusion_matrix(y_test, y_pred)
results[MODEL_NAME] = metrics

print()
for label in ('Accuracy', 'Precision', 'Recall', 'F1-Score'):
    score = metrics[label]
    print(f"  {label + ':':<12}{score:.4f} ({score*100:.2f}%)")

cm = metrics['Confusion Matrix']
tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
print("\n  Confusion Matrix:")
print(f"    [[TN={tn:<4} FP={fp:<4}]")
print(f"     [FN={fn:<4} TP={tp:<4}]]")



In [None]:
# MODEL 2: K-NN Weighted (distance) — training + evaluation
banner = "=" * 80
print("\n" + banner)
print("MODEL 2: K-NEAREST NEIGHBORS WEIGHTED (DISTANCE)")
print(banner)

MODEL_NAME = "K-NN Weighted"

# Same K as model 1, but with weights='distance'; uses the standardized features.
model = KNeighborsClassifier(n_neighbors=5, weights='distance')
y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
models[MODEL_NAME] = model

# Test-set metrics, stored under the model name for the comparison cells.
scorers = {'Precision': precision_score, 'Recall': recall_score, 'F1-Score': f1_score}
metrics = {'Accuracy': accuracy_score(y_test, y_pred)}
metrics.update({name: fn(y_test, y_pred, zero_division=0) for name, fn in scorers.items()})
metrics['Confusion Matrix'] = confusion_matrix(y_test, y_pred)
results[MODEL_NAME] = metrics

print()
for label in ('Accuracy', 'Precision', 'Recall', 'F1-Score'):
    score = metrics[label]
    print(f"  {label + ':':<12}{score:.4f} ({score*100:.2f}%)")

cm = metrics['Confusion Matrix']
tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
print("\n  Confusion Matrix:")
print(f"    [[TN={tn:<4} FP={fp:<4}]")
print(f"     [FN={fn:<4} TP={tp:<4}]]")



In [None]:
# MODEL 3: SVM (RBF) — training + evaluation
banner = "=" * 80
print("\n" + banner)
print("MODEL 3: SUPPORT VECTOR MACHINE (RBF KERNEL)")
print(banner)

MODEL_NAME = "SVM (RBF)"

# Fitted and evaluated on the standardized features.
model = SVC(kernel='rbf', C=1.0, random_state=42)
y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
models[MODEL_NAME] = model

# Test-set metrics, stored under the model name for the comparison cells.
scorers = {'Precision': precision_score, 'Recall': recall_score, 'F1-Score': f1_score}
metrics = {'Accuracy': accuracy_score(y_test, y_pred)}
metrics.update({name: fn(y_test, y_pred, zero_division=0) for name, fn in scorers.items()})
metrics['Confusion Matrix'] = confusion_matrix(y_test, y_pred)
results[MODEL_NAME] = metrics

print()
for label in ('Accuracy', 'Precision', 'Recall', 'F1-Score'):
    score = metrics[label]
    print(f"  {label + ':':<12}{score:.4f} ({score*100:.2f}%)")

cm = metrics['Confusion Matrix']
tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
print("\n  Confusion Matrix:")
print(f"    [[TN={tn:<4} FP={fp:<4}]")
print(f"     [FN={fn:<4} TP={tp:<4}]]")



In [None]:
# MODEL 4: Naive Bayes (Gaussian) — training + evaluation
banner = "=" * 80
print("\n" + banner)
print("MODEL 4: NAIVE BAYES (GAUSSIAN)")
print(banner)

MODEL_NAME = "Naive Bayes"

# Fitted and evaluated on the unscaled feature frames.
model = GaussianNB()
y_pred = model.fit(X_train, y_train).predict(X_test)
models[MODEL_NAME] = model

# Test-set metrics, stored under the model name for the comparison cells.
scorers = {'Precision': precision_score, 'Recall': recall_score, 'F1-Score': f1_score}
metrics = {'Accuracy': accuracy_score(y_test, y_pred)}
metrics.update({name: fn(y_test, y_pred, zero_division=0) for name, fn in scorers.items()})
metrics['Confusion Matrix'] = confusion_matrix(y_test, y_pred)
results[MODEL_NAME] = metrics

print()
for label in ('Accuracy', 'Precision', 'Recall', 'F1-Score'):
    score = metrics[label]
    print(f"  {label + ':':<12}{score:.4f} ({score*100:.2f}%)")

cm = metrics['Confusion Matrix']
tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
print("\n  Confusion Matrix:")
print(f"    [[TN={tn:<4} FP={fp:<4}]")
print(f"     [FN={fn:<4} TP={tp:<4}]]")



In [None]:
# MODEL 5: Decision Tree — training + evaluation + top features
banner = "=" * 80
print("\n" + banner)
print("MODEL 5: DECISION TREE")
print(banner)

MODEL_NAME = "Decision Tree"

# Fitted and evaluated on the unscaled feature frames.
model = DecisionTreeClassifier(max_depth=10, min_samples_split=5, random_state=42)
y_pred = model.fit(X_train, y_train).predict(X_test)
models[MODEL_NAME] = model

# Test-set metrics, stored under the model name for the comparison cells.
scorers = {'Precision': precision_score, 'Recall': recall_score, 'F1-Score': f1_score}
metrics = {'Accuracy': accuracy_score(y_test, y_pred)}
metrics.update({name: fn(y_test, y_pred, zero_division=0) for name, fn in scorers.items()})
metrics['Confusion Matrix'] = confusion_matrix(y_test, y_pred)
results[MODEL_NAME] = metrics

print()
for label in ('Accuracy', 'Precision', 'Recall', 'F1-Score'):
    score = metrics[label]
    print(f"  {label + ':':<12}{score:.4f} ({score*100:.2f}%)")

cm = metrics['Confusion Matrix']
tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
print("\n  Confusion Matrix:")
print(f"    [[TN={tn:<4} FP={fp:<4}]")
print(f"     [FN={fn:<4} TP={tp:<4}]]")


# Feature importances reported by the fitted tree, largest first (top 5 printed)
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\n  Top 5 Important Features:")
top_features = feature_importance.head(5)
for feat_name, weight in zip(top_features['Feature'], top_features['Importance']):
    print(f"    {feat_name:<25} {weight:.4f}")


In [None]:
# MODEL 6: Random Forest — training + evaluation + top features
banner = "=" * 80
print("\n" + banner)
print("MODEL 6: RANDOM FOREST")
print(banner)

MODEL_NAME = "Random Forest"

# Fitted and evaluated on the unscaled feature frames.
forest_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)
model = RandomForestClassifier(**forest_params)
y_pred = model.fit(X_train, y_train).predict(X_test)
models[MODEL_NAME] = model

# Test-set metrics, stored under the model name for the comparison cells.
scorers = {'Precision': precision_score, 'Recall': recall_score, 'F1-Score': f1_score}
metrics = {'Accuracy': accuracy_score(y_test, y_pred)}
metrics.update({name: fn(y_test, y_pred, zero_division=0) for name, fn in scorers.items()})
metrics['Confusion Matrix'] = confusion_matrix(y_test, y_pred)
results[MODEL_NAME] = metrics

print()
for label in ('Accuracy', 'Precision', 'Recall', 'F1-Score'):
    score = metrics[label]
    print(f"  {label + ':':<12}{score:.4f} ({score*100:.2f}%)")

cm = metrics['Confusion Matrix']
tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
print("\n  Confusion Matrix:")
print(f"    [[TN={tn:<4} FP={fp:<4}]")
print(f"     [FN={fn:<4} TP={tp:<4}]]")


# Feature importances reported by the fitted forest, largest first (top 5 printed)
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\n  Top 5 Important Features:")
top_features = feature_importance.head(5)
for feat_name, weight in zip(top_features['Feature'], top_features['Importance']):
    print(f"    {feat_name:<25} {weight:.4f}")


In [None]:
# MODEL 7: Gradient Descent (SGDClassifier) — training + evaluation
banner = "=" * 80
print("\n" + banner)
print("MODEL 7: GRADIENT DESCENT (SGD CLASSIFIER)")
print(banner)

MODEL_NAME = "Gradient Descent"

# Fitted and evaluated on the standardized features.
model = SGDClassifier(loss='log_loss', max_iter=1000, random_state=42)
y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
models[MODEL_NAME] = model

# Test-set metrics, stored under the model name for the comparison cells.
scorers = {'Precision': precision_score, 'Recall': recall_score, 'F1-Score': f1_score}
metrics = {'Accuracy': accuracy_score(y_test, y_pred)}
metrics.update({name: fn(y_test, y_pred, zero_division=0) for name, fn in scorers.items()})
metrics['Confusion Matrix'] = confusion_matrix(y_test, y_pred)
results[MODEL_NAME] = metrics

print()
for label in ('Accuracy', 'Precision', 'Recall', 'F1-Score'):
    score = metrics[label]
    print(f"  {label + ':':<12}{score:.4f} ({score*100:.2f}%)")

cm = metrics['Confusion Matrix']
tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
print("\n  Confusion Matrix:")
print(f"    [[TN={tn:<4} FP={fp:<4}]")
print(f"     [FN={fn:<4} TP={tp:<4}]]")



In [None]:
# MODEL 8: K-Means (unsupervised) — manual inline implementation (no def/class)
print("\n" + "="*80)
print("MODEL 8: K-MEANS CLUSTERING (UNSUPERVISED)")
print("="*80)
print("  Using manual K-Means implementation (Windows compatibility)")

MODEL_NAME = "K-Means"
n_clusters = 2
max_iters = 100
np.random.seed(42)

Xtr = X_train_scaled
Xte = X_test_scaled

# Start from n_clusters distinct training rows picked at random.
n_samples = Xtr.shape[0]
indices = np.random.choice(n_samples, n_clusters, replace=False)
centroids = Xtr[indices].copy()

for _ in range(max_iters):
    # distances shape: (k, n_samples)
    distances = np.sqrt(((Xtr - centroids[:, np.newaxis])**2).sum(axis=2))
    labels = np.argmin(distances, axis=0)

    # Move each centroid to the mean of its members. A cluster that lost all
    # its members keeps its previous centroid: the mean of an empty selection
    # is NaN, which would poison the distances and never satisfy allclose.
    new_centroids = centroids.copy()
    for i in range(n_clusters):
        members = Xtr[labels == i]
        if len(members) > 0:
            new_centroids[i] = members.mean(axis=0)

    if np.allclose(centroids, new_centroids):
        break
    centroids = new_centroids

# Re-assign the training points against the final centroids. After a normal
# convergence this changes nothing; if the loop stopped on max_iters, the
# labels from the last pass would otherwise be stale relative to `centroids`.
distances = np.sqrt(((Xtr - centroids[:, np.newaxis])**2).sum(axis=2))
labels = np.argmin(distances, axis=0)

# predict on test
distances_test = np.sqrt(((Xte - centroids[:, np.newaxis])**2).sum(axis=2))
test_labels = np.argmin(distances_test, axis=0)

# align clusters with actual labels (majority vote over the training targets)
cluster_labels = {}
ytr_arr = y_train.values if hasattr(y_train, 'values') else y_train
for cluster_id in range(n_clusters):
    mask = labels == cluster_id
    if mask.sum() > 0:
        majority_label = np.bincount(ytr_arr[mask]).argmax()
        cluster_labels[cluster_id] = majority_label

# A cluster with no training members has no mapping and falls back to class 0.
y_pred = np.array([cluster_labels.get(c, 0) for c in test_labels])

# store centroids + mapping as a tuple (no class)
models[MODEL_NAME] = (centroids, cluster_labels)
metrics = {
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred, zero_division=0),
    'Recall': recall_score(y_test, y_pred, zero_division=0),
    'F1-Score': f1_score(y_test, y_pred, zero_division=0),
    'Confusion Matrix': confusion_matrix(y_test, y_pred)
}
results[MODEL_NAME] = metrics

print(f"\n  Accuracy:   {metrics['Accuracy']:.4f} ({metrics['Accuracy']*100:.2f}%)")
print(f"  Precision:  {metrics['Precision']:.4f} ({metrics['Precision']*100:.2f}%)")
print(f"  Recall:     {metrics['Recall']:.4f} ({metrics['Recall']*100:.2f}%)")
print(f"  F1-Score:   {metrics['F1-Score']:.4f} ({metrics['F1-Score']*100:.2f}%)")
cm = metrics['Confusion Matrix']
print("\n  Confusion Matrix:")
print(f"    [[TN={cm[0,0]:<4} FP={cm[0,1]:<4}]")
print(f"     [FN={cm[1,0]:<4} TP={cm[1,1]:<4}]]")


print("\n  Note: K-Means is unsupervised; performance may be lower than supervised methods.")


In [None]:
# MODEL 9: Logistic Regression — training + evaluation
banner = "=" * 80
print("\n" + banner)
print("MODEL 9: LOGISTIC REGRESSION")
print(banner)

MODEL_NAME = "Logistic Regression"

# Fitted and evaluated on the standardized features.
model = LogisticRegression(max_iter=1000, random_state=42, solver='lbfgs')
y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
models[MODEL_NAME] = model

# Test-set metrics, stored under the model name for the comparison cells.
scorers = {'Precision': precision_score, 'Recall': recall_score, 'F1-Score': f1_score}
metrics = {'Accuracy': accuracy_score(y_test, y_pred)}
metrics.update({name: fn(y_test, y_pred, zero_division=0) for name, fn in scorers.items()})
metrics['Confusion Matrix'] = confusion_matrix(y_test, y_pred)
results[MODEL_NAME] = metrics

print()
for label in ('Accuracy', 'Precision', 'Recall', 'F1-Score'):
    score = metrics[label]
    print(f"  {label + ':':<12}{score:.4f} ({score*100:.2f}%)")

cm = metrics['Confusion Matrix']
tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
print("\n  Confusion Matrix:")
print(f"    [[TN={tn:<4} FP={fp:<4}]")
print(f"     [FN={fn:<4} TP={tp:<4}]]")



In [None]:
# MODEL 10: XGBoost — training + evaluation (when available)
print("\n" + "="*80)
print("MODEL 10: XGBOOST")
print("="*80)

MODEL_NAME = "XGBoost"

if not XGBOOST_AVAILABLE:
    # Nothing is stored in `models` / `results` here. Storing and printing
    # must stay inside the `else` branch: otherwise the `metrics` left over
    # from the previous cell would be recorded and reported as XGBoost.
    print("  XGBoost not available - skipping")
else:
    # Fitted and evaluated on the unscaled feature frames.
    model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
        eval_metric='logloss'
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    models[MODEL_NAME] = model
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1-Score': f1_score(y_test, y_pred, zero_division=0),
        'Confusion Matrix': confusion_matrix(y_test, y_pred)
    }
    results[MODEL_NAME] = metrics

    print(f"\n  Accuracy:   {metrics['Accuracy']:.4f} ({metrics['Accuracy']*100:.2f}%)")
    print(f"  Precision:  {metrics['Precision']:.4f} ({metrics['Precision']*100:.2f}%)")
    print(f"  Recall:     {metrics['Recall']:.4f} ({metrics['Recall']*100:.2f}%)")
    print(f"  F1-Score:   {metrics['F1-Score']:.4f} ({metrics['F1-Score']*100:.2f}%)")
    cm = metrics['Confusion Matrix']
    print("\n  Confusion Matrix:")
    print(f"    [[TN={cm[0,0]:<4} FP={cm[0,1]:<4}]")
    print(f"     [FN={cm[1,0]:<4} TP={cm[1,1]:<4}]]")



In [None]:
# Model comparison table (Accuracy/Precision/Recall/F1) + CSV export
print("\n" + "="*80)
print("MODEL COMPARISON TABLE")
print("="*80)

# Rank on the raw accuracy values, best first. Sorting happens before the
# values are formatted, so the order does not depend on 2-decimal rounding;
# `sorted` is stable, so exact ties keep the order in which models were run.
ranked_results = sorted(
    results.items(),
    key=lambda item: item[1]['Accuracy'],
    reverse=True,
)

comparison_columns = ['Rank', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']
comparison_data = [
    {
        'Rank': rank,
        'Model': model_name,
        'Accuracy': f"{model_metrics['Accuracy']*100:.2f}%",
        'Precision': f"{model_metrics['Precision']*100:.2f}%",
        'Recall': f"{model_metrics['Recall']*100:.2f}%",
        'F1-Score': f"{model_metrics['F1-Score']*100:.2f}%"
    }
    for rank, (model_name, model_metrics) in enumerate(ranked_results, start=1)
]

# Explicit columns keep the table well-formed even when `results` is empty.
df_comparison = pd.DataFrame(comparison_data, columns=comparison_columns)

print("\n" + df_comparison.to_string(index=False))

output_file = 'model_comparison_results.csv'
df_comparison.to_csv(output_file, index=False)
print(f"\n[OK] Results saved to: {output_file}")

df_comparison


In [None]:
# Chart: accuracy comparison (horizontal bar chart) — saves model_comparison.png
comparison_plot = [
    {'Model': name, 'Accuracy': scores['Accuracy'] * 100}
    for name, scores in results.items()
]

# Ascending order, so the most accurate model ends up as the last (top) bar.
df_plot = pd.DataFrame(comparison_plot).sort_values('Accuracy', ascending=True)

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.barh(df_plot['Model'], df_plot['Accuracy'], color='steelblue')

# Color the best model
bars[-1].set_color('darkgreen')

ax.set_xlabel('Accuracy (%)', fontsize=12, fontweight='bold')
ax.set_ylabel('Model', fontsize=12, fontweight='bold')
ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_xlim(0, 100)

# Write each value just to the right of its bar.
for position, accuracy in enumerate(df_plot['Accuracy']):
    ax.text(accuracy + 1, position, f"{accuracy:.2f}%", va='center', fontsize=10)

fig.tight_layout()
fig.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
print("[OK] Comparison chart saved to: model_comparison.png")


In [None]:
# Charts: one confusion-matrix heatmap per model — saved under confusion_matrices/
import os
os.makedirs('confusion_matrices', exist_ok=True)

# File-name clean-up: spaces become underscores; parentheses and '=' are removed.
filename_table = str.maketrans({' ': '_', '(': None, ')': None, '=': None})
class_names = ['No Attrition', 'Attrition']

for model_name, metrics in results.items():
    cm = metrics['Confusion Matrix']

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm, annot=True, fmt='d', cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        cbar_kws={'label': 'Count'}
    )
    plt.title(f'Confusion Matrix: {model_name}', fontsize=14, fontweight='bold')
    plt.ylabel('True Label', fontsize=12)
    plt.xlabel('Predicted Label', fontsize=12)

    filename = f"confusion_matrices/{model_name.translate(filename_table)}.png"
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.show()

print("[OK] Confusion matrices saved to: confusion_matrices/")


## Fichiers générés
- `eda_plots/` (histogrammes, boxplots, corrélation, pairplot)  
- `feature_importance_all.png`  
- `knn_elbow_method.png`  
- `model_comparison_results.csv`  
- `model_comparison.png`  
- `confusion_matrices/` (PNG)  
- `logistic_regression_employee_attrition.pkl` (modèle sauvegardé par la dernière cellule)

In [None]:

# Save the Logistic Regression model (joblib)
# This cell stores the trained model so it can be reused later.

import joblib

MODEL_NAME = "Logistic Regression"

if MODEL_NAME not in models:
    raise ValueError("Le modèle Logistic Regression n'existe pas dans le dictionnaire models.")

logistic_model = models[MODEL_NAME]

model_filename = "logistic_regression_employee_attrition.pkl"
joblib.dump(logistic_model, model_filename)

print(f"[OK] Modèle Logistic Regression sauvegardé avec succès : {model_filename}")

# The model was fitted on encoded, standardized features, so the estimator
# alone cannot score new raw data. The fitted scaler, the per-column label
# encoders and the training column order are saved next to it.
# The Gender / OverTime 0-1 mappings are plain dict literals in the encoding
# cell and are not part of this bundle.
preprocessing_filename = "logistic_regression_employee_attrition_preprocessing.pkl"
joblib.dump(
    {
        'feature_names': list(X_train.columns),
        'scaler': scaler,
        'encoders': encoders,
    },
    preprocessing_filename,
)

print(f"[OK] Preprocessing artifacts saved to: {preprocessing_filename}")
