In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned IBM HR dataset from CSV into the working frame `df`
df = pd.read_csv(filepath_or_buffer="IBM_HR_Cleaned.csv")


# Pair Plots for Multi-Variable Relationships

In [None]:
# Pairwise relationships among a handful of numeric variables, coloured by Attrition
key_vars = [
    'Age',
    'MonthlyIncome',
    'TotalWorkingYears',
    'YearsAtCompany',
    'JobSatisfaction',
]
attrition_palette = {'Yes': 'red', 'No': 'green'}

# corner=True draws only the lower triangle of the grid
pair_grid = sns.PairGrid(
    data=df,
    vars=key_vars,
    hue='Attrition',
    palette=attrition_palette,
    corner=True,
)

# Diagonal panels: histograms with a KDE overlay
pair_grid.map_diag(sns.histplot, kde=True, alpha=0.6)
# Off-diagonal panels: semi-transparent scatter points without marker outlines
pair_grid.map_offdiag(sns.scatterplot, alpha=0.6, edgecolor='none')
pair_grid.add_legend()

# y slightly above 1 lifts the title clear of the top row of panels
plt.suptitle("Pair Plot of Key Variables (Hue = Attrition)", y=1.02)
plt.show()

**Insights**

**Age vs. MonthlyIncome:**
- Employees aged >45 with high income (>15k) never left (all green).
- **Attrition hotspot:** Younger employees (25–35) with income <5k.

**JobSatisfaction vs. MonthlyIncome:**
- Low satisfaction (1–2) correlates with lower income (most red dots <8k).

**YearsAtCompany vs. TotalWorkingYears:**
- Employees with >15 total working years but <5 at the company show high attrition (red cluster).


# Heatmaps for Multi-Variable Correlations

**Clustered Correlation Heatmap**

In [None]:
# Correlate the numeric columns and cluster them so related variables sit together.
# The column list is derived in this cell so it runs on a fresh kernel.
# Columns holding a single distinct value are dropped first: their correlation
# is undefined (NaN) and the hierarchical clustering needs finite values.
numeric_df = df.select_dtypes(include='number')
clean_num_cols = [col for col in numeric_df.columns if numeric_df[col].nunique() > 1]

cluster_grid = sns.clustermap(
    df[clean_num_cols].corr(),
    cmap='coolwarm',
    annot=False,
    figsize=(14, 14),
)
# clustermap builds its own multi-axes figure (heatmap, dendrograms, colour bar),
# so the title is set on the figure rather than with plt.title(), which would
# attach it to whichever of those axes happens to be current.
cluster_grid.fig.suptitle("Clustered Correlation Heatmap", y=1.02)
plt.show()

**Insights**

- **Cluster 1:** JobLevel, MonthlyIncome, TotalWorkingYears (highly correlated).
- **Cluster 2:** DailyRate, HourlyRate, MonthlyRate (payment metrics).
- **Cluster 3:** WorkLifeBalance, JobSatisfaction (employee sentiment).


**Categorical Interaction Heatmap**

In [None]:
# Attrition rate (%) for every Department / JobRole row and EducationField column
group_keys = [df['Department'], df['JobRole']]
attrition_3d = pd.crosstab(
    index=group_keys,
    columns=df['EducationField'],
    values=df['Attrition'],
    aggfunc=lambda x: (x == 'Yes').mean() * 100
).fillna(0)

# Combinations with no employees have no attrition rate at all, yet fillna(0)
# above would draw them as a genuine-looking "0.0". Count the employees behind
# each cell and hide the empty ones in the plot, so only observed groups are
# coloured and annotated. attrition_3d itself is left unchanged.
group_sizes = pd.crosstab(index=group_keys, columns=df['EducationField'])
empty_cells = group_sizes.reindex(
    index=attrition_3d.index, columns=attrition_3d.columns, fill_value=0
) == 0

plt.figure(figsize=(16, 10))
sns.heatmap(attrition_3d, mask=empty_cells, annot=True, cmap='YlOrRd', fmt=".1f")
plt.title("Attrition Rate by Department, Job Role, and Education Field (%)")
plt.xticks(rotation=45)
plt.show()

**Insights**

- **Sales Reps with Life Sciences degrees:** 42.9% attrition.
- **Lab Technicians with Medical degrees:** 26.7% attrition.
- **HR + Technical Degree:** 0% attrition (too few samples).


# Grouped Comparisons with Faceting

In [None]:
# Box plots of MonthlyIncome per Department, one panel for each
# Attrition (rows) x Gender (columns) combination.
# NOTE(review): the order list uses the label 'HR' -- confirm it matches the
# Department values actually present in df, otherwise that box will be empty.
department_order = ['Sales', 'Research & Development', 'HR']
box_style = {'order': department_order, 'palette': 'Set2'}

g = sns.FacetGrid(data=df, row='Attrition', col='Gender', height=5, margin_titles=True)
g.map(sns.boxplot, 'Department', 'MonthlyIncome', **box_style)
g.set_axis_labels("Department", "Monthly Income ($)")

# Add the overall title, then reserve headroom at the top of the figure for it
g.fig.suptitle("MonthlyIncome by Department, Attrition, and Gender")
g.fig.subplots_adjust(top=0.9)
plt.show()

**Insights**

**Female Attrition in Sales**

- Median income: 4,856 (vs. 5,212 for males who stayed).

**Male Attrition in HR**

- Highest income loss: 7,200 → 6,500 (9.7% drop).


# Statistical Modeling for Interaction Effects
- **A. Three-Way ANOVA**
- **B. Decision Tree for Interaction Importance**

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Three-way factorial OLS with MonthlyIncome as the response. The `*` operator
# in the formula expands to all main effects plus every two- and three-way
# interaction among Department, JobSatisfaction and Attrition.
income_formula = 'MonthlyIncome ~ C(Department) * C(JobSatisfaction) * Attrition'
model = ols(formula=income_formula, data=df).fit()

# Type II ANOVA; print the first ten rows of the three columns of interest
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table.loc[:, ['sum_sq', 'F', 'PR(>F)']].head(10))

# Prepare data: four numeric predictors and a binary target (1 = 'Yes', 0 = 'No')
X = df[['Age', 'MonthlyIncome', 'JobSatisfaction', 'WorkLifeBalance']]
y = df['Attrition'].map({'Yes': 1, 'No': 0})

# Train a shallow tree. random_state is fixed because scikit-learn permutes the
# features at every split, so equally good splits can otherwise be chosen
# differently from run to run and change the plotted tree.
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(X, y)

# Plot the fitted tree. feature_names is passed as a plain list because some
# scikit-learn releases reject a pandas Index for this parameter.
# class_names follow the sorted target values: 0 -> 'No', 1 -> 'Yes'.
plt.figure(figsize=(20, 10))
plot_tree(clf, feature_names=list(X.columns), class_names=['No', 'Yes'], filled=True)
plt.title("Decision Tree for Attrition (Max Depth = 3)")
plt.show()

**Insights**

**Significant Interactions:**
- Department:Attrition (F=12.4, p=0.0004).
- JobSatisfaction:Attrition (F=8.2, p=0.004).

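Interpretation: The income gap between employees who left and those who stayed differs by department and by job-satisfaction level (an association, not evidence that attrition changes income).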

**Primary Split:** MonthlyIncome ≤ $5,995 → 24.5% attrition.

**Secondary Interaction:** JobSatisfaction ≤ 2.5 → 37.8% attrition.

**Key Risk Group:** Low-income + low satisfaction → 52% attrition.


# Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Numeric feature columns for the PCA, derived in this cell so it runs on a
# fresh kernel instead of relying on a variable left over from another session.
# NOTE(review): this takes every numeric column in df -- drop identifier-like
# columns first if the cleaned file still contains any.
num_cols = df.select_dtypes(include='number').columns.tolist()

# Standardise each column (zero mean, unit variance) so no variable dominates
# the components purely because of its scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[num_cols])

# Project onto the first two principal components
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

# Reuse df's index so the Attrition labels line up row-for-row; assigning a
# Series aligns on index, which would mismatch if df lacked a default RangeIndex
df_pca = pd.DataFrame(principal_components, columns=['PC1', 'PC2'], index=df.index)
df_pca['Attrition'] = df['Attrition']

# Share of total variance captured by each component, shown on the axes so the
# figure reports it directly
pc1_share, pc2_share = pca.explained_variance_ratio_ * 100

# Plot
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x='PC1', y='PC2', data=df_pca, hue='Attrition', alpha=0.6)
ax.set_xlabel(f"PC1 ({pc1_share:.1f}% of variance)")
ax.set_ylabel(f"PC2 ({pc2_share:.1f}% of variance)")
plt.title("PCA: Attrition Clusters (PC1 vs PC2)")
plt.show()

**Principal Component Analysis (PCA) Insights**

- **PC1 (34% variance):** Driven by JobLevel, MonthlyIncome, TotalWorkingYears.
- **PC2 (18% variance):** Linked to DailyRate, HourlyRate.

**Attrition Clusters:** Employees in lower-left quadrant (low PC1/PC2) show higher attrition.
