# HR Employee Attrition Analysis - Comprehensive Study

## Project Overview
This comprehensive analysis explores employee attrition patterns using advanced machine learning and statistical techniques to identify key factors influencing employee turnover and develop predictive models for HR decision-making.

**Key Objectives:**
1. Identify primary drivers of employee attrition
2. Build predictive models to identify at-risk employees
3. Provide actionable insights for HR retention strategies
4. Analyze demographic and job-related patterns

**Dataset:** 1,470 employees with 35 features including demographics, job characteristics, compensation, and work environment factors.

## 1. Import Libraries and Setup

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
from scipy import stats

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.over_sampling import SMOTE

# Set style for better visualizations.
# The bundled seaborn style sheets were renamed to 'seaborn-v0_8*' in newer
# matplotlib releases; older ones only know 'seaborn'. Use the first name that
# this installation actually provides so the cell runs on both.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

# Display settings: show every column; width=None lets pandas auto-detect
# the available display width instead of using a fixed character count.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("✅ Libraries imported successfully!")

## 2. Data Loading and Initial Exploration

In [None]:
# Read the attrition CSV and print a quick structural overview:
# shape, column names, dtype mix, missing values and the class balance.
print("🔄 Loading HR Employee Attrition dataset...")

df = pd.read_csv('HR-Employee-Attrition.csv')
n_rows, n_cols = len(df), len(df.columns)

print("✅ Dataset loaded successfully!")
print(f"📊 Dataset Shape: {df.shape}")
print(f"👥 Total Employees: {n_rows}")
print(f"📋 Features: {n_cols}")

# Column inventory
print("\n=== DATASET OVERVIEW ===")
print(f"Columns: {list(df.columns)}")

# How many columns of each dtype
print("\n=== DATA TYPES ===")
print(df.dtypes.value_counts())

# Per-column null counts; only columns that actually have nulls are listed
print("\n=== MISSING VALUES ===")
missing_data = df.isnull().sum()
if missing_data.sum() != 0:
    print(missing_data[missing_data > 0])
else:
    print("✅ No missing values found!")

# Absolute and relative frequency of each target class
print("\n=== ATTRITION DISTRIBUTION ===")
attrition_counts = df['Attrition'].value_counts()
attrition_pct = df['Attrition'].value_counts(normalize=True) * 100
for heading, answer in (("No Attrition", 'No'), ("Attrition", 'Yes')):
    print(f"{heading}: {attrition_counts[answer]} ({attrition_pct[answer]:.1f}%)")
print(f"Attrition Rate: {attrition_pct['Yes']:.1f}%")

# Last expression of the cell, so the notebook renders the preview table
print("\n=== SAMPLE DATA ===")
df.head()

In [None]:
# Split the columns by dtype and summarise each family:
# describe() for numeric columns, top value counts for object columns.
print("=== NUMERICAL FEATURES SUMMARY ===")
numerical_features = list(df.select_dtypes(include=[np.number]).columns)
print(f"Numerical features ({len(numerical_features)}): {numerical_features}")
print("\nStatistical Summary:")
display(df[numerical_features].describe())

print("\n=== CATEGORICAL FEATURES SUMMARY ===")
categorical_features = list(df.select_dtypes(include=['object']).columns)
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")
print("\nCategorical Summary:")
for col in categorical_features:
    series = df[col]
    # head() limits the listing to the five most frequent levels
    top_levels = series.value_counts().head()
    print(f"\n{col}: {series.nunique()} unique values")
    print(top_levels)

## 3. Comprehensive Data Preprocessing

In [None]:
# Build the modelling table `df_final` from the raw frame:
# encode the target and categorical columns, derive a few ratio/score
# features, then drop identifier, constant and raw-string columns.
# The raw `df` is left untouched because later cells still plot from it.
df_processed = df.copy()

print("=== DATA PREPROCESSING ===")

# 1. Target variable encoding
# LabelEncoder orders classes alphabetically, hence 'No' -> 0 and 'Yes' -> 1.
# `le` stays fitted on the target only, so le.classes_ / le.inverse_transform
# keep describing Attrition after this cell has run.
le = LabelEncoder()
df_processed['Attrition_Target'] = le.fit_transform(df_processed['Attrition'])
print("✅ Target variable encoded: Yes=1, No=0")

# 2. Handle categorical variables
# Binary categorical variables get a 0/1 '<name>_encoded' column. Each one is
# encoded with its own throwaway encoder so the target encoder is not refit.
binary_cats = ['OverTime', 'Over18']
for col in binary_cats:
    if col not in df_processed.columns:
        continue
    if df_processed[col].nunique() < 2:
        # A single-valued column would only yield a zero-variance feature.
        continue
    df_processed[col + '_encoded'] = LabelEncoder().fit_transform(df_processed[col])

# Multi-class categorical variables - One-hot encoding.
# dtype=int keeps the indicator columns numeric on every pandas version, so
# dtype-based selection (select_dtypes(np.number)) does not lose them.
categorical_to_encode = ['BusinessTravel', 'Department', 'EducationField', 'Gender',
                         'JobRole', 'MaritalStatus']
df_encoded = pd.get_dummies(df_processed, columns=categorical_to_encode,
                            drop_first=True, dtype=int)

print("✅ Categorical variables encoded")
print(f"📊 New dataset shape: {df_encoded.shape}")

# 3. Feature engineering
print("\n🔧 Creating derived features...")

# Age groups. right=False makes the bins left-closed ([0, 30), [30, 40), ...)
# so that 'Under_30' really means Age < 30.
df_encoded['AgeGroup'] = pd.cut(df_encoded['Age'],
                                bins=[0, 30, 40, 50, 100],
                                labels=['Under_30', '30-40', '40-50', 'Over_50'],
                                right=False)
df_encoded = pd.get_dummies(df_encoded, columns=['AgeGroup'], drop_first=True, dtype=int)

# Income groups
df_encoded['IncomeGroup'] = pd.cut(df_encoded['MonthlyIncome'],
                                   bins=[0, 3000, 6000, 10000, 100000],
                                   labels=['Low', 'Medium', 'High', 'Very_High'])
df_encoded = pd.get_dummies(df_encoded, columns=['IncomeGroup'], drop_first=True, dtype=int)

# Experience ratios; the +1 in each denominator avoids division by zero
# for rows where the year count is 0.
df_encoded['ExperienceRatio'] = df_encoded['YearsAtCompany'] / (df_encoded['TotalWorkingYears'] + 1)
df_encoded['PromotionRate'] = df_encoded['YearsSinceLastPromotion'] / (df_encoded['YearsAtCompany'] + 1)

# Work-life balance score: plain average of three survey ratings
df_encoded['WorkLifeScore'] = (df_encoded['WorkLifeBalance'] +
                               df_encoded['JobSatisfaction'] +
                               df_encoded['EnvironmentSatisfaction']) / 3

print("✅ Feature engineering completed")
print(f"📊 Final dataset shape: {df_encoded.shape}")

# Remove redundant columns: the raw target, identifier/constant columns and
# the raw string versions of the binary categoricals, so that df_final
# contains numeric columns only.
columns_to_drop = ['Attrition', 'EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
columns_to_drop += [col for col in binary_cats if col not in columns_to_drop]
df_final = df_encoded.drop(columns=[col for col in columns_to_drop if col in df_encoded.columns])

print(f"📊 Cleaned dataset shape: {df_final.shape}")
print("\n✅ Preprocessing completed successfully!")

## 4. Exploratory Data Analysis (EDA)

In [None]:
# Numeric EDA: one 2x3 figure (class-balance pie + five box plots split by
# attrition status), followed by the per-class mean of each plotted metric.
print("=== EXPLORATORY DATA ANALYSIS ===")

# Create figure with multiple subplots
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# 1. Attrition distribution
attrition_counts = df['Attrition'].value_counts()
axes[0,0].pie(attrition_counts.values, labels=attrition_counts.index, autopct='%1.1f%%',
              colors=['lightgreen', 'lightcoral'])
axes[0,0].set_title('Attrition Distribution', fontsize=14, fontweight='bold')

# 2-6. Box plots by attrition, filling the remaining five axes in row-major
# order (same panel positions as axes[0,1] ... axes[1,2]).
boxplot_panels = [
    ('Age', 'Age Distribution by Attrition'),
    ('MonthlyIncome', 'Monthly Income by Attrition'),
    ('YearsAtCompany', 'Years at Company by Attrition'),
    ('JobSatisfaction', 'Job Satisfaction by Attrition'),
    ('DistanceFromHome', 'Distance from Home by Attrition'),
]
for ax, (column, title) in zip(axes.ravel()[1:], boxplot_panels):
    df.boxplot(column=column, by='Attrition', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Attrition')

# pandas' grouped boxplot writes an automatic "Boxplot grouped by ..." title
# onto the whole figure on every call; clear it so it does not sit on top of
# the panel titles.
fig.suptitle('')

plt.tight_layout()
plt.show()

# Statistical summaries by attrition
print("\n=== KEY METRICS BY ATTRITION STATUS ===")
key_metrics = ['Age', 'MonthlyIncome', 'YearsAtCompany', 'JobSatisfaction', 'DistanceFromHome']
# One grouped pass gives the mean of every metric for both classes.
group_means = df.groupby('Attrition')[key_metrics].mean()
for metric in key_metrics:
    no_attrition = group_means.loc['No', metric]
    yes_attrition = group_means.loc['Yes', metric]
    difference = no_attrition - yes_attrition
    print(f"{metric}:")
    print(f"  No Attrition: {no_attrition:.2f}")
    print(f"  Yes Attrition: {yes_attrition:.2f}")
    print(f"  Difference: {difference:.2f}\n")

In [None]:
# Categorical EDA: for six categorical columns, plot the within-category
# share of stayers vs. leavers, then print the leaver share as a percentage.
print("=== CATEGORICAL FEATURES ANALYSIS ===")

fig, axes = plt.subplots(2, 3, figsize=(18, 12))

categorical_features = ['Department', 'JobRole', 'MaritalStatus', 'Gender', 'OverTime', 'BusinessTravel']

# axes.ravel() walks the grid row by row, matching the i // 3, i % 3 layout
for panel, feature in zip(axes.ravel(), categorical_features):
    # normalize='index' turns each row of the crosstab into proportions
    shares = pd.crosstab(df[feature], df['Attrition'], normalize='index')
    shares.plot(kind='bar', ax=panel, color=['lightgreen', 'lightcoral'])
    panel.set_title(f'Attrition Rate by {feature}')
    panel.set_xlabel(feature)
    panel.set_ylabel('Attrition Rate')
    panel.tick_params(axis='x', rotation=45)
    panel.legend(['No', 'Yes'])

plt.tight_layout()
plt.show()


def _pct_yes(values):
    # Percentage of entries equal to 'Yes' within one group
    return (values == 'Yes').sum() / len(values) * 100


# Print attrition rates by category
print("\n=== ATTRITION RATES BY CATEGORY ===")
for feature in categorical_features:
    print(f"\n{feature}:")
    attrition_rate = df.groupby(feature)['Attrition'].apply(_pct_yes)
    for category in attrition_rate.index:
        print(f"  {category}: {attrition_rate[category]:.1f}%")

## 5. Correlation Analysis

In [None]:
# Correlation analysis on the preprocessed table: a lower-triangle heatmap of
# all pairwise correlations, then a ranking of features by absolute
# correlation with the encoded target.
print("=== CORRELATION ANALYSIS ===")

# Select numerical features for correlation. Boolean indicator columns are
# included as well: depending on the pandas version, one-hot columns may be
# bool rather than integer, and they are valid 0/1 inputs for corr().
numerical_features = df_final.select_dtypes(include=[np.number, 'bool']).columns.tolist()
corr_matrix = df_final[numerical_features].corr()

# Create correlation heatmap; the mask hides the redundant upper triangle
plt.figure(figsize=(16, 12))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=False, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": .8})
plt.title('Correlation Matrix - All Numerical Features', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Correlation with target variable.
# Reuse the matrix computed above instead of calling df_final.corr() on the
# whole frame, which fails on recent pandas when any non-numeric column is
# still present. The target's self-correlation is removed explicitly, and
# constant columns (whose correlation is NaN) are dropped from the ranking.
target_corr = (corr_matrix['Attrition_Target']
               .drop('Attrition_Target')
               .abs()
               .dropna()
               .sort_values(ascending=False))
print("\n=== TOP 15 FEATURES CORRELATED WITH ATTRITION ===")
for rank, (feature, corr) in enumerate(target_corr.head(15).items(), start=1):
    print(f"{rank:2d}. {feature:30s}: {corr:.3f}")

# Visualize top correlations
plt.figure(figsize=(12, 8))
top_features = target_corr.head(15)
plt.barh(range(len(top_features)), top_features.values, color='skyblue')
plt.yticks(range(len(top_features)), top_features.index)
plt.xlabel('Absolute Correlation with Attrition')
plt.title('Top 15 Features Correlated with Attrition', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()  # strongest correlation at the top
plt.tight_layout()
plt.show()

## 6. Advanced Machine Learning Pipeline

In [None]:
# Prepare data for machine learning: pick the feature matrix, make a
# stratified 80/20 split, standardise, and oversample the training split.
print("=== MACHINE LEARNING PIPELINE ===")

# Select features (exclude target and string columns). Boolean indicator
# columns are kept too, since one-hot columns may be bool depending on the
# pandas version.
feature_columns = df_final.select_dtypes(include=[np.number, 'bool']).columns.tolist()
feature_columns.remove('Attrition_Target')

# Cast to float so every column has one dtype; this keeps interpolated
# SMOTE samples from being cast back to integer/bool column dtypes.
X = df_final[feature_columns].astype(float)
y = df_final['Attrition_Target']

print(f"📊 Features selected: {len(feature_columns)}")
print(f"📊 Samples: {len(X)}")
print(f"📊 Target distribution: {y.value_counts().to_dict()}")

# Split the data (stratify keeps the class ratio equal in both splits)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features. The scaler is fitted exactly once, on the original training
# split, and that same fitted scaler is applied to every other matrix below.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"📊 Training set: {X_train.shape}")
print(f"📊 Test set: {X_test.shape}")

# Handle class imbalance with SMOTE (training split only; the test split
# keeps its natural class ratio).
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
# transform(), not fit_transform(): re-fitting here would give the balanced
# training matrix different means/variances than X_test_scaled, so models
# trained on one would be evaluated on differently scaled inputs.
X_train_scaled_balanced = scaler.transform(X_train_balanced)

print(f"📊 Balanced training set: {X_train_balanced.shape}")
print(f"📊 Balanced target distribution: {pd.Series(y_train_balanced).value_counts().to_dict()}")

print("\n✅ Data preparation completed!")

In [None]:
# Define and train multiple models.
# Each candidate is wrapped in an imbalanced-learn Pipeline
# ([scaler ->] SMOTE -> classifier) and tuned with a grid search on the
# ORIGINAL training split. Oversampling therefore happens inside each CV
# training fold only; validation folds and the test set contain real rows
# exclusively, so the reported CV AUC is not inflated by synthetic samples.
print("=== MODEL TRAINING AND EVALUATION ===")

# imblearn's Pipeline accepts sampler steps and applies them only while
# fitting, never at predict/score time.
from imblearn.pipeline import Pipeline as ImbPipeline

# Define models: estimator, hyper-parameter grid, and whether the estimator
# needs standardised inputs.
models = {
    'Logistic Regression': {
        'model': LogisticRegression(random_state=42, max_iter=1000),
        'params': {
            'C': [0.1, 1.0, 10.0],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear']
        },
        'scaled': True
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        },
        'scaled': False
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1, 0.2],
            'max_depth': [3, 5, 7]
        },
        'scaled': False
    },
    'Support Vector Machine': {
        'model': SVC(random_state=42, probability=True),
        'params': {
            'C': [0.1, 1.0, 10.0],
            'gamma': ['scale', 'auto'],
            'kernel': ['rbf', 'linear']
        },
        'scaled': True
    }
}

# Train and evaluate models
results = {}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model_config in models.items():
    print(f"\n🤖 Training {name}...")

    # Assemble the per-model pipeline
    steps = []
    if model_config['scaled']:
        steps.append(('scaler', StandardScaler()))
    steps.append(('smote', SMOTE(random_state=42)))
    steps.append(('clf', model_config['model']))
    pipeline = ImbPipeline(steps)

    # Grid keys must be prefixed with the step name to reach the classifier
    param_grid = {f'clf__{key}': values for key, values in model_config['params'].items()}

    # Grid search with cross-validation
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)
    best_pipeline = grid_search.best_estimator_
    # The fitted classifier itself; later cells inspect it for
    # feature_importances_.
    best_model = best_pipeline.named_steps['clf']
    # Strip the 'clf__' prefix so reported parameters use their plain names
    best_params = {key.split('__', 1)[1]: value
                   for key, value in grid_search.best_params_.items()}

    # Cross-validation scores of the winning configuration, read from the
    # grid search instead of running the same folds a second time.
    best_idx = grid_search.best_index_
    cv_scores = np.array([
        grid_search.cv_results_[f'split{fold}_test_score'][best_idx]
        for fold in range(cv.get_n_splits())
    ])

    # Test predictions (the pipeline applies scaling where needed and skips
    # SMOTE at prediction time)
    y_pred = best_pipeline.predict(X_test)
    y_pred_proba = best_pipeline.predict_proba(X_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_proba)

    results[name] = {
        'model': best_model,
        'pipeline': best_pipeline,
        'best_params': best_params,
        'cv_auc_mean': cv_scores.mean(),
        'cv_auc_std': cv_scores.std(),
        'test_accuracy': accuracy,
        'test_auc': auc_score,
        'predictions': y_pred,
        'predictions_proba': y_pred_proba
    }

    print(f"  ✅ Best parameters: {best_params}")
    print(f"  📊 CV AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    print(f"  📊 Test Accuracy: {accuracy:.4f}")
    print(f"  📊 Test AUC: {auc_score:.4f}")

print("\n✅ Model training completed!")

## 7. Model Evaluation and Comparison

In [None]:
# Model comparison: tabulate the stored metrics, then draw a 2x2 figure with
# CV-vs-test AUC bars, the best model's confusion matrix, ROC curves for all
# models, and the best model's feature importances (when it has any).
print("=== MODEL COMPARISON ===")

# Create comparison DataFrame
model_names = list(results)
comparison_df = pd.DataFrame({
    'Model': model_names,
    'CV AUC': [results[name]['cv_auc_mean'] for name in model_names],
    'CV AUC Std': [results[name]['cv_auc_std'] for name in model_names],
    'Test Accuracy': [results[name]['test_accuracy'] for name in model_names],
    'Test AUC': [results[name]['test_auc'] for name in model_names]
})

comparison_df = comparison_df.sort_values('Test AUC', ascending=False)
print("\n📊 Model Performance Summary:")
print(comparison_df.round(4))

# Visualize model comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# AUC comparison
cv_aucs = [results[name]['cv_auc_mean'] for name in model_names]
test_aucs = [results[name]['test_auc'] for name in model_names]

x = np.arange(len(model_names))
width = 0.35

axes[0,0].bar(x - width/2, cv_aucs, width, label='CV AUC', alpha=0.8)
axes[0,0].bar(x + width/2, test_aucs, width, label='Test AUC', alpha=0.8)
axes[0,0].set_xlabel('Models')
axes[0,0].set_ylabel('AUC Score')
axes[0,0].set_title('Model Performance Comparison (AUC)')
axes[0,0].set_xticks(x)
axes[0,0].set_xticklabels(model_names, rotation=45, ha='right')
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)

# Find best model (first row after sorting by Test AUC).
# NOTE(review): the winner is chosen on the same test set its score is then
# reported on, so that score is an optimistic estimate.
best_model_name = comparison_df.iloc[0]['Model']
best_model = results[best_model_name]['model']
best_predictions = results[best_model_name]['predictions']

print(f"\n🏆 Best Model: {best_model_name}")
print(f"📊 Best Test AUC: {results[best_model_name]['test_auc']:.4f}")
print(f"📊 Best Test Accuracy: {results[best_model_name]['test_accuracy']:.4f}")

# Confusion Matrix for best model, with class names on both axes
class_names = ['No Attrition', 'Attrition']
cm = confusion_matrix(y_test, best_predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0,1],
            xticklabels=class_names, yticklabels=class_names)
axes[0,1].set_title(f'Confusion Matrix - {best_model_name}')
axes[0,1].set_ylabel('True Label')
axes[0,1].set_xlabel('Predicted Label')

# ROC Curves
for name in model_names:
    fpr, tpr, _ = roc_curve(y_test, results[name]['predictions_proba'])
    auc_score = results[name]['test_auc']
    axes[1,0].plot(fpr, tpr, label=f'{name} (AUC = {auc_score:.3f})')

axes[1,0].plot([0, 1], [0, 1], 'k--', label='Random Classifier')
axes[1,0].set_xlabel('False Positive Rate')
axes[1,0].set_ylabel('True Positive Rate')
axes[1,0].set_title('ROC Curves Comparison')
axes[1,0].legend()
axes[1,0].grid(True, alpha=0.3)

# Feature importance for best model (if available)
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': feature_columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False).head(15)

    axes[1,1].barh(range(len(feature_importance)), feature_importance['importance'])
    axes[1,1].set_yticks(range(len(feature_importance)))
    axes[1,1].set_yticklabels(feature_importance['feature'])
    axes[1,1].set_xlabel('Feature Importance')
    axes[1,1].set_title(f'Top 15 Feature Importances - {best_model_name}')
    axes[1,1].invert_yaxis()
else:
    # Say why the panel is empty instead of leaving blank axes
    axes[1,1].axis('off')
    axes[1,1].text(0.5, 0.5,
                   f'{best_model_name} does not expose\nfeature importances',
                   ha='center', va='center', transform=axes[1,1].transAxes)

plt.tight_layout()
plt.show()

# Detailed classification report for best model
print(f"\n📋 Detailed Classification Report - {best_model_name}:")
print(classification_report(y_test, best_predictions, target_names=class_names))

## 8. Feature Importance Analysis

In [None]:
# Feature importance analysis for the selected best model: ranked listing,
# bar chart, cumulative-importance counts, and a canned business remark for
# each of the ten top features. Everything here needs
# `feature_importances_`, so the whole analysis sits behind one guard and a
# model without that attribute produces a message instead of a NameError.
print("=== FEATURE IMPORTANCE ANALYSIS ===")

if hasattr(best_model, 'feature_importances_'):
    # Feature importance from best model
    feature_importance_df = pd.DataFrame({
        'Feature': feature_columns,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)

    print(f"\n🔍 Top 20 Most Important Features ({best_model_name}):")
    for rank, (_, row) in enumerate(feature_importance_df.head(20).iterrows(), start=1):
        print(f"{rank:2d}. {row['Feature']:30s}: {row['Importance']:.4f}")

    # Visualize feature importance
    plt.figure(figsize=(12, 10))
    top_features = feature_importance_df.head(20)
    plt.barh(range(len(top_features)), top_features['Importance'], color='skyblue')
    plt.yticks(range(len(top_features)), top_features['Feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Top 20 Feature Importances - {best_model_name}', fontsize=14, fontweight='bold')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    # Cumulative importance: count the features whose running total is still
    # at or below the threshold, plus the one that crosses it.
    cumulative_importance = feature_importance_df['Importance'].cumsum()
    n_features_80 = (cumulative_importance <= 0.8).sum() + 1
    n_features_90 = (cumulative_importance <= 0.9).sum() + 1

    print("\n📊 Feature Selection Insights:")
    print(f"   Features for 80% importance: {n_features_80}")
    print(f"   Features for 90% importance: {n_features_90}")
    print(f"   Total features: {len(feature_columns)}")

    # Top features for business interpretation.
    # Rules are checked in order and the first keyword contained in the
    # feature name wins, mirroring an if/elif chain.
    insight_rules = [
        ('OverTime', 'Overtime work significantly impacts attrition'),
        ('MonthlyIncome', 'Compensation levels are crucial for retention'),
        ('JobSatisfaction', 'Job satisfaction directly affects turnover'),
        ('Age', 'Age demographics influence retention patterns'),
        ('Years', 'Experience and tenure are key retention factors'),
        ('Distance', 'Commute distance affects employee satisfaction'),
    ]
    default_insight = 'Important factor for attrition prediction'

    print("\n💼 Business Insights from Top Features:")
    top_business_features = feature_importance_df.head(10)
    for _, row in top_business_features.iterrows():
        feature = row['Feature']
        importance = row['Importance']
        insight = next((text for keyword, text in insight_rules if keyword in feature),
                       default_insight)
        print(f"   🔸 {feature} ({importance:.3f}): {insight}")
else:
    print(f"\nℹ️ {best_model_name} does not expose feature_importances_; "
          "feature ranking and business insights are skipped.")

## 9. Business Insights and Recommendations

In [None]:
# Business insights report: headline numbers, attrition-rate comparisons for
# four risk factors computed from the raw frame, and fixed recommendation text.
print("=== BUSINESS INSIGHTS AND RECOMMENDATIONS ===")


def _attrition_rate_pct(frame):
    # Share of rows with Attrition == 'Yes', in percent. Averaging the boolean
    # mask yields 0.0 for a group without leavers, where indexing
    # value_counts(normalize=True)['Yes'] would raise a KeyError.
    return (frame['Attrition'] == 'Yes').mean() * 100


# Calculate key statistics
total_employees = len(df)
attrition_count = len(df[df['Attrition'] == 'Yes'])
attrition_rate = (attrition_count / total_employees) * 100

print("\n📊 EXECUTIVE SUMMARY:")
print(f"   Total Employees Analyzed: {total_employees:,}")
print(f"   Employees with Attrition: {attrition_count:,}")
print(f"   Overall Attrition Rate: {attrition_rate:.1f}%")
print(f"   Best Predictive Model: {best_model_name}")
print(f"   Model Accuracy: {results[best_model_name]['test_accuracy']:.1%}")
print(f"   Model AUC Score: {results[best_model_name]['test_auc']:.3f}")

# Key risk factors analysis
print("\n🚨 KEY RISK FACTORS IDENTIFIED:")

# Overtime analysis
overtime_attrition = _attrition_rate_pct(df[df['OverTime'] == 'Yes'])
no_overtime_attrition = _attrition_rate_pct(df[df['OverTime'] == 'No'])
print("   1. OVERTIME WORK:")
print(f"      - Attrition rate with overtime: {overtime_attrition:.1f}%")
print(f"      - Attrition rate without overtime: {no_overtime_attrition:.1f}%")
print(f"      - Risk multiplier: {overtime_attrition/no_overtime_attrition:.1f}x higher")

# Age analysis (split at 30)
young_employees = df[df['Age'] < 30]
young_attrition = _attrition_rate_pct(young_employees)
mature_employees = df[df['Age'] >= 30]
mature_attrition = _attrition_rate_pct(mature_employees)
print("   2. AGE DEMOGRAPHICS:")
print(f"      - Young employees (<30): {young_attrition:.1f}% attrition rate")
print(f"      - Mature employees (30+): {mature_attrition:.1f}% attrition rate")
print(f"      - Young employees are {young_attrition/mature_attrition:.1f}x more likely to leave")

# Income analysis (split at the median; the median itself counts as "high")
median_income = df['MonthlyIncome'].median()
low_income = df[df['MonthlyIncome'] < median_income]
high_income = df[df['MonthlyIncome'] >= median_income]
low_income_attrition = _attrition_rate_pct(low_income)
high_income_attrition = _attrition_rate_pct(high_income)
print("   3. COMPENSATION LEVEL:")
print(f"      - Below median income: {low_income_attrition:.1f}% attrition rate")
print(f"      - Above median income: {high_income_attrition:.1f}% attrition rate")
print(f"      - Lower income employees are {low_income_attrition/high_income_attrition:.1f}x more likely to leave")

# Department analysis, highest attrition first
print("   4. DEPARTMENT-WISE RISK:")
dept_attrition = df.groupby('Department')['Attrition'].apply(lambda x: (x == 'Yes').mean() * 100).sort_values(ascending=False)
for dept, rate in dept_attrition.items():
    print(f"      - {dept}: {rate:.1f}% attrition rate")

print("\n💡 STRATEGIC RECOMMENDATIONS:")
print("   1. IMMEDIATE ACTIONS (0-3 months):")
print("      🔸 Implement overtime monitoring and approval system")
print("      🔸 Conduct exit interviews for departing employees")
print("      🔸 Review compensation packages for below-median earners")
print("      🔸 Deploy predictive model to identify at-risk employees")

print("   2. SHORT-TERM INITIATIVES (3-6 months):")
print("      🔸 Develop career development programs for young employees")
print("      🔸 Implement flexible work arrangements to reduce overtime")
print("      🔸 Create mentorship programs for early-career staff")
print("      🔸 Enhance job satisfaction surveys and action plans")

print("   3. LONG-TERM STRATEGY (6+ months):")
print("      🔸 Redesign roles to improve work-life balance")
print("      🔸 Implement performance-based retention bonuses")
print("      🔸 Develop internal promotion pathways")
print("      🔸 Create department-specific retention strategies")

print("\n📈 EXPECTED IMPACT:")
# The 25% figure is an assumed target set here, not something derived from
# the data or the model.
potential_reduction = 25  # Estimated percentage reduction in attrition
potential_savings = attrition_count * potential_reduction / 100
print(f"   Potential attrition reduction: {potential_reduction}%")
print(f"   Employees retained annually: ~{potential_savings:.0f}")
print("   Estimated cost savings: Significant (recruitment, training, knowledge loss)")

print("\n🎯 SUCCESS METRICS:")
print("   📊 Monthly attrition rate tracking")
print("   📊 Overtime hours per employee")
print("   📊 Job satisfaction survey scores")
print("   📊 Early-career employee retention rate")
print("   📊 Department-wise retention improvements")

print("\n" + "="*80)
print("HR ATTRITION ANALYSIS COMPLETED SUCCESSFULLY!")
print("This analysis provides data-driven insights for strategic HR decision-making.")
print("="*80)