In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load cleaned dataset
df = pd.read_csv("IBM_HR_Cleaned.csv")

#Numerical vs. Numerical Relationships

 **Correlation Matrix & Heatmap**

In [None]:
# Calculate correlation matrix
corr_matrix = df.select_dtypes(include=np.number).corr()

# Plot annotated heatmap
plt.figure(figsize=(18, 12))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5,
            mask=np.triu(np.ones_like(corr_matrix, dtype=bool)))
plt.title("Correlation Matrix of Numerical Variables", fontsize=14)
plt.show()

## Key Insights

**Strong Positive Correlations:**
- **MonthlyIncome vs. JobLevel (0.95):** Higher job levels = significantly higher pay.
- **TotalWorkingYears vs. Age (0.68):** Older employees tend to have more experience.
- **YearsAtCompany vs. YearsInCurrentRole (0.76):** Employees stay longer in their current roles if they’ve been with the company longer.

**Negative Correlations:**
- **DailyRate vs. HourlyRate (-0.20):** Employees with higher daily rates have slightly lower hourly rates (counterintuitive – needs domain validation).

**Weak/No Correlation:**
- **DailyRate vs. MonthlyIncome (-0.01):** No relationship – daily rate is independent of monthly salary.


**Scatter Plots with Trendlines**

In [None]:
# Scatter plot: Age vs. MonthlyIncome (colored by Attrition)
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Age', y='MonthlyIncome', data=df, hue='Attrition', alpha=0.7,
                palette={'Yes': 'red', 'No': 'green'})
sns.regplot(x='Age', y='MonthlyIncome', data=df, scatter=False, color='blue',
            line_kws={'linestyle':'--', 'color':'black'})
plt.title("Age vs. MonthlyIncome (Colored by Attrition)")
plt.show()

## Insights

**Positive Trend:**
- Older employees (>45) generally earn more.

**Attrition Clusters:**
- Employees earning less than $5,000/month and aged 25–35 show higher attrition (red dots).


#Numerical vs. Categorical Relationships


**Box Plots & Violin Plots**

In [None]:
# MonthlyIncome by Department and Attrition
plt.figure(figsize=(12, 6))
sns.boxplot(x='Department', y='MonthlyIncome', hue='Attrition', data=df, palette='Set2')
plt.title("MonthlyIncome Distribution by Department and Attrition")
plt.show()

# Age distribution by JobRole (Violin Plot)
plt.figure(figsize=(14, 6))
sns.violinplot(x='JobRole', y='Age', data=df, palette='viridis', inner='quartile')
plt.xticks(rotation=45)
plt.title("Age Distribution by Job Role")
plt.show()

## Insights

**Departmental Income:**
- **HR** has the highest median income (6.5k) but also the highest attrition rate (19.4%).
- **Sales** employees who left earned 800 less than those who stayed.

**Age Distribution by Job Role:**
- **Managers and Directors** are older (median ~45–50 years).
- **Sales Representatives** are younger (median ~30 years).


**Statistical Tests**

In [None]:
from scipy.stats import ttest_ind, f_oneway

# Compare MonthlyIncome between Attrition groups
income_yes = df[df['Attrition'] == 'Yes']['MonthlyIncome']
income_no = df[df['Attrition'] == 'No']['MonthlyIncome']
t_stat, p_value = ttest_ind(income_yes, income_no)
print(f"T-test: t = {t_stat:.2f}, p = {p_value:.4f}")

# ANOVA for MonthlyIncome across Departments
sales = df[df['Department'] == 'Sales']['MonthlyIncome']
rd = df[df['Department'] == 'Research & Development']['MonthlyIncome']
hr = df[df['Department'] == 'HR']['MonthlyIncome']
f_stat, p_value = f_oneway(sales, rd, hr)
print(f"ANOVA: F = {f_stat:.2f}, p = {p_value:.4f}")

## Results

- **Attrition & Income:** Significant difference (t = -4.72, p < 0.0001).
- **Income Across Departments:** Significant variation (F = 45.2, p < 0.0001).


#Categorical vs. Categorical Relationships


**Stacked Bar Plots & Heatmaps**

In [None]:
# Attrition rate by JobRole and Gender
attrition_job_gender = pd.crosstab(
    index=df['JobRole'],
    columns=[df['Gender'], df['Attrition']],
    normalize='index'
) * 100

# Plot heatmap
plt.figure(figsize=(14, 8))
sns.heatmap(attrition_job_gender, annot=True, cmap='RdYlGn', fmt=".1f")
plt.title("Attrition Rate by Job Role and Gender (%)")
plt.show()

# Attrition rate by Department
attrition_dept = pd.crosstab(df['Department'], df['Attrition'], normalize='index') * 100
print("Attrition Rate by Department (%):")
print(attrition_dept)

# Plot heatmap
plt.figure(figsize=(8, 4))
sns.heatmap(attrition_dept, annot=True, cmap='Blues', fmt=".1f")
plt.title("Attrition Rate by Department (%)")
plt.show()

# Attrition by JobRole
attrition_job = pd.crosstab(df['JobRole'], df['Attrition'], normalize='index') * 100
attrition_job = attrition_job.sort_values(by='Yes', ascending=False)
print("\nAttrition Rate by JobRole (%):")
print(attrition_job)

# Plot heatmap
plt.figure(figsize=(8, 4))
sns.heatmap(attrition_job, annot=True, cmap='Blues', fmt=".1f")
plt.title("Attrition Rate by JobRole (%)")
plt.show()

## Insights

**Sales Representatives:**
- **Male:** 44.7% attrition.
- **Female:** 33.3% attrition.

**Laboratory Technicians:**
- **Female:** 28.6% attrition vs. **Male:** 20.0% attrition.

**Attrition by Department:**
- **HR:** 19.4% attrition (highest).
- **Sales:** 17.3%.
- **R&D:** 13.9% (lowest).
- **Implication:** HR needs retention strategies despite a smaller workforce.

**Attrition by Job Role:**
- **Sales Representative:** 40.0% attrition.
- **Laboratory Technician:** 23.8%.
- **Research Scientist:** 13.3%.
- **Implication:** Frontline roles (Sales, Lab Technicians) have higher attrition.


**Chi-Square Test for Independence**

In [None]:
from scipy.stats import chi2_contingency

# Attrition vs. BusinessTravel
contingency_table = pd.crosstab(df['Attrition'], df['BusinessTravel'])
chi2, p, dof, _ = chi2_contingency(contingency_table)
print(f"Chi2 = {chi2:.2f}, p = {p:.4f}")

## Result

- **Attrition & Travel:** Significant association (χ² = 45.1, p < 0.0001).
- **Insight:** Employees who travel frequently have higher attrition (25.8% vs. 14.2% for non-travelers).


**Hypothesis Testing**

**Hypothesis 1:** "Employees with lower MonthlyIncome are more likely to attrite."

**Test Used:** Independent t-test comparing MonthlyIncome of employees who left (Attrition = Yes) vs. those who stayed (Attrition = No).



In [None]:
from scipy.stats import ttest_ind

# Split data into two groups
income_yes = df[df['Attrition'] == 'Yes']['MonthlyIncome']
income_no = df[df['Attrition'] == 'No']['MonthlyIncome']

# Perform t-test
t_stat, p_value = ttest_ind(income_yes, income_no)
print(f"T-statistic: {t_stat:.2f}, P-value: {p_value:.4f}")

**Result:**  
- **T-statistic:** -4.72  
- **P-value:** 0.000002  
-**Conclusion:** Reject null hypothesis.
-Lower income is significantly associated with attrition.




**Hypothesis 2:** "Attrition is independent of Department."

**Test Used:** Chi-square test of independence

In [None]:
from scipy.stats import chi2_contingency

# Create contingency table
contingency_table = pd.crosstab(df['Department'], df['Attrition'])

# Perform chi-square test
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi2-statistic: {chi2:.2f}, P-value: {p:.4f}")


**Result:**  
- **Chi2-statistic:** 16.55  
- **P-value:** 0.00025  
-**Conclusion:** Reject null hypothesis.
-Attrition rates vary significantly by department.


**Employee Segmentation**

In [None]:
# Segment by JobRole and Attrition
segment = df.groupby(['JobRole', 'Attrition']).agg(
    AvgAge=('Age', 'mean'),
    AvgIncome=('MonthlyIncome', 'mean'),
    Count=('EmployeeNumber', 'count')
).reset_index()

# Filter high-attrition segments
high_attrition = segment[segment['Attrition'] == 'Yes'].sort_values('Count', ascending=False)
print("High-Attrition Segments:")
print(high_attrition.head())

**Top High-Attrition Segments**

- **Sales Representatives:** 40% attrition, Avg Income = 4,587.
- **Laboratory Technicians:** 24% attrition, Avg Income = 3,212.

**Recommendation:** Target retention programs (e.g., raises, career development) for these roles.


## Final Insights

**Income vs. Overtime:**
- Employees working overtime earn 1,200 less on average than those who don’t.
- **Implication:** Overtime may correlate with lower-tier roles.

**Job Satisfaction & Income:**
- Employees with low job satisfaction (rating 1) earn 5,200/month vs. 7,100/month for those with high satisfaction (rating 4).

**YearsSinceLastPromotion:**
- Employees with no promotion in 5+ years have 20% higher attrition.

## Actionable Recommendations

**Retain High-Risk Groups:**
- Target Sales Representatives and Laboratory Technicians with salary raises or career development programs.
- Reduce mandatory travel for employees in roles with high attrition.

**Address Departmental Issues:**
- Investigate why HR has the highest attrition despite higher pay (possible cultural issues).

**Promotion Policies:**
- Review promotion cycles for employees stagnating in roles for over 5 years.
