# IBM HR Analytics: Employee Attrition Analysis

## Complete Data Science Analysis with Visualizations

This notebook provides a comprehensive analysis of employee attrition using the IBM HR Analytics dataset.

## 1. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import warnings
# Keep rendered cell outputs free of library warning noise.
# NOTE(review): this suppresses every warning category, deprecations included.
warnings.simplefilter('ignore')

# Plot defaults: seaborn 'whitegrid' theme and a 12x8 inch default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
# Read the attrition CSV from the notebook's working directory
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

# Report (rows, columns), then leave the preview as the cell's last
# expression so it is rendered as a rich table.
print('Dataset shape: {}'.format(df.shape))
print('\nFirst few rows:')
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Structural overview: column dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
print('Dataset Info:')
df.info()

# Per-column count of missing values
print('\nMissing Values:')
print(df.isnull().sum())

# Class balance of the target column
print('\nAttrition Distribution:')
print(df['Attrition'].value_counts())

# Percentage of rows labelled 'Yes'. The mean of the boolean mask equals
# count('Yes') / len(df) and evaluates to 0.0 (instead of raising KeyError)
# when no row carries the 'Yes' label.
attrition_rate = df['Attrition'].eq('Yes').mean() * 100
print(f'\nAttrition Rate: {attrition_rate:.2f}%')

## 3. Visualizations

In [None]:
# Row counts per department, split into bars by attrition status
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Department', hue='Attrition', data=df, ax=ax)

# Axis labels in one call; the title is set separately for its font styling
ax.set(xlabel='Department', ylabel='Count')
ax.set_title('Employee Attrition by Department', fontweight='bold', fontsize=14)

fig.tight_layout()
plt.show()

In [None]:
# Overlaid age histograms (30 bins, with KDE curves), one per attrition status
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(x='Age', hue='Attrition', data=df, bins=30, kde=True, ax=ax)

# Axis labels in one call; the title is set separately for its font styling
ax.set(xlabel='Age', ylabel='Frequency')
ax.set_title('Age Distribution by Attrition Status', fontweight='bold', fontsize=14)

fig.tight_layout()
plt.show()

In [None]:
# Box plot of MonthlyIncome for each attrition status
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Attrition', y='MonthlyIncome', data=df, ax=ax)

# Only the y label is overridden; the x label stays as seaborn sets it
ax.set(ylabel='Monthly Income ($)')
ax.set_title('Monthly Income by Attrition Status', fontweight='bold', fontsize=14)

fig.tight_layout()
plt.show()

In [None]:
# Contingency table: rows are JobSatisfaction levels, columns are Attrition labels
job_satisfaction = pd.crosstab(index=df['JobSatisfaction'], columns=df['Attrition'])

# Grouped bars, one group per satisfaction level
fig, ax = plt.subplots(figsize=(10, 6))
job_satisfaction.plot.bar(ax=ax)

ax.set(xlabel='Job Satisfaction Level', ylabel='Count')
ax.set_title('Job Satisfaction Level vs Attrition', fontweight='bold', fontsize=14)

# Draw the category tick labels horizontally
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(0)

fig.tight_layout()
plt.show()

In [None]:
# Contingency table: rows are WorkLifeBalance ratings, columns are Attrition labels
wlb = pd.crosstab(index=df['WorkLifeBalance'], columns=df['Attrition'])

# Grouped bars, one group per rating
fig, ax = plt.subplots(figsize=(10, 6))
wlb.plot.bar(ax=ax)

ax.set(xlabel='Work-Life Balance Rating', ylabel='Count')
ax.set_title('Work-Life Balance vs Attrition', fontweight='bold', fontsize=14)

# Draw the category tick labels horizontally
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(0)

fig.tight_layout()
plt.show()

In [None]:
# Box plot of YearsAtCompany for each attrition status
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Attrition', y='YearsAtCompany', data=df, ax=ax)

# Only the y label is overridden; the x label stays as seaborn sets it
ax.set(ylabel='Years at Company')
ax.set_title('Years at Company by Attrition Status', fontweight='bold', fontsize=14)

fig.tight_layout()
plt.show()

In [None]:
# Contingency table: rows are OverTime values, columns are Attrition labels
overtime = pd.crosstab(index=df['OverTime'], columns=df['Attrition'])

# Grouped bars, one group per overtime value
fig, ax = plt.subplots(figsize=(10, 6))
overtime.plot.bar(ax=ax)

ax.set(xlabel='Works Overtime', ylabel='Count')
ax.set_title('Overtime vs Attrition', fontweight='bold', fontsize=14)

# Draw the category tick labels horizontally
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(0)

fig.tight_layout()
plt.show()

In [None]:
# Correlation Heatmap
# Pairwise Pearson correlation between the numeric columns of the raw frame.
numeric_cols = df.select_dtypes(include=[np.number]).columns

# A column with a single distinct value has zero variance, so its correlation
# with every other column is NaN and shows up as a blank row/column in the
# heatmap. Keep only columns that actually vary.
# NOTE(review): presumably this removes EmployeeCount / StandardHours here -- confirm.
has_variation = df[numeric_cols].nunique().gt(1).to_numpy()
numeric_cols = numeric_cols[has_variation]

correlation = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(14, 10))
# center=0 pins the midpoint of the diverging palette to "no correlation"
sns.heatmap(correlation, annot=False, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 4. Feature Engineering and Model Training

In [None]:
# Prepare data for modeling
# Work on a copy so the raw frame used by the other cells is not modified.
df_model = df.copy()

# Remove unnecessary columns before encoding so no encoder is fitted on a
# column that is about to be discarded. Names absent from the frame are skipped.
cols_to_drop = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
cols_to_drop = [col for col in cols_to_drop if col in df_model.columns]
df_model = df_model.drop(columns=cols_to_drop)

# Binary target: 'Yes' -> 1, 'No' -> 0.
# Series.map turns any label missing from the mapping into NaN, so fail loudly
# instead of letting NaN targets reach the model.
target_map = {'Yes': 1, 'No': 0}
unexpected_labels = set(df_model['Attrition'].unique()) - set(target_map)
if unexpected_labels:
    raise ValueError(
        f'Unexpected Attrition labels: {sorted(unexpected_labels, key=str)}'
    )
df_model['Attrition'] = df_model['Attrition'].map(target_map)

# Encode categorical variables as integer codes.
# Every non-numeric column is selected (rather than only `object` dtype) so
# text columns are still encoded if pandas stores them with a dedicated
# string dtype. The fitted encoders are kept in le_dict, keyed by column name,
# so the integer codes can be mapped back to labels later.
categorical_cols = df_model.select_dtypes(exclude=['number']).columns
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col])
    le_dict[col] = le

# Separate features and target
X = df_model.drop('Attrition', axis=1)
y = df_model['Attrition']

print(f'Features shape: {X.shape}')
print(f'Target distribution:\n{y.value_counts()}')

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize features: statistics are learned from the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Training set size: {X_train.shape[0]}')
print(f'Test set size: {X_test.shape[0]}')

In [None]:
# Fit a 100-tree random forest on the scaled training features.
# n_jobs=-1 uses all available cores; random_state fixes the seed.
rf_model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

# Hard class predictions and per-class probabilities for the test rows
y_pred = rf_model.predict(X_test_scaled)
class_probabilities = rf_model.predict_proba(X_test_scaled)
# Second probability column; presumably the Attrition == 1 class given the
# 0/1 target encoding -- verify against rf_model.classes_
y_pred_proba = class_probabilities[:, 1]

# Threshold-free ranking metric plus per-class precision / recall / F1
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'ROC-AUC Score: {roc_auc:.4f}')
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

In [None]:
# Feature Importance
# Pair each feature column name with the forest's importance score and rank
# from most to least important.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

top_features = feature_importance.head(15)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['feature'], top_features['importance'])
# barh places the first row at the bottom of the axis; flip the y-axis so the
# most important feature is drawn at the top of the chart.
ax.invert_yaxis()
ax.set_xlabel('Importance')
ax.set_title('Top 15 Most Important Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print('Top 10 Features:')
print(feature_importance.head(10))

## 5. Business Insights and Recommendations

In [None]:
# Headline figures. attrition_rate is the percentage computed in the EDA
# section of this notebook.
print('=== KEY INSIGHTS ===\n')
print(f'Overall Attrition Rate: {attrition_rate:.2f}%')
print(f'\nAverage Age: {df["Age"].mean():.2f} years')
print(f'Average Monthly Income: ${df["MonthlyIncome"].mean():.2f}')
print(f'Average Years at Company: {df["YearsAtCompany"].mean():.2f} years')

print(f'\nHigh Attrition Job Roles:')
# Number of leavers per role (roles with no leavers are absent from this Series)
attrition_by_role = df[df['Attrition']=='Yes']['JobRole'].value_counts()

# Rank roles by attrition *rate* rather than raw leaver count: a raw count
# mostly reflects how many people hold the role, not how likely they are to leave.
role_headcount = df['JobRole'].value_counts()
role_leavers = attrition_by_role.reindex(role_headcount.index, fill_value=0)
role_attrition_rate = (role_leavers / role_headcount * 100).sort_values(ascending=False)

for role, rate in role_attrition_rate.head().items():
    print(f'  - {role}: {role_leavers[role]} employees ({rate:.1f}% of {role_headcount[role]})')

In [None]:
# Numbered recommendation texts, printed in order below the section banner
recommendations = [
    '1. Focus on employee work-life balance and job satisfaction programs',
    '2. Review compensation packages for competitive alignment, especially for high-attrition roles',
    '3. Implement targeted retention programs for employees working overtime',
    '4. Conduct regular engagement surveys and feedback sessions',
    '5. Provide clear career development pathways and growth opportunities',
    '6. Monitor and optimize remote work policies and flexible arrangements',
    '7. Invest in manager training to improve employee relationships',
]

print('\n=== RECOMMENDATIONS ===\n')
# One recommendation per output line
print(*recommendations, sep='\n')