In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis, shapiro

# Load cleaned dataset
df = pd.read_csv("IBM_HR_Cleaned.csv")

# Separate numerical and categorical columns
num_cols = df.select_dtypes(include=np.number).columns.tolist()
cat_cols = df.select_dtypes(include='object').columns.tolist()

# Generate extended summary statistics for numerical variables
def describe_numerical(df, cols):
    stats = df[cols].agg(['mean', 'median', 'std', 'var', 'min', 'max',
                         'skew', 'kurtosis', lambda x: x.quantile(0.25),
                         lambda x: x.quantile(0.75)]).T
    stats.columns = ['mean', 'median', 'std', 'var', 'min', 'max',
                    'skewness', 'kurtosis', 'Q1', 'Q3']
    stats['IQR'] = stats['Q3'] - stats['Q1']
    stats['range'] = stats['max'] - stats['min']
    stats['mode'] = df[cols].mode().iloc[0].values
    return stats

num_summary = describe_numerical(df, num_cols)
print("=== Numerical Variables Summary ===")
print(num_summary)

# Normality test (Shapiro-Wilk) for key variables
print("\n=== Normality Tests (Shapiro-Wilk) ===")
for col in ['Age', 'MonthlyIncome', 'DailyRate']:
    stat, p = shapiro(df[col])
    print(f"{col}: W-stat={stat:.3f}, p-value={p:.4f}")

# Plot histograms and box plots for all numerical variables
for col in num_cols:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # Histogram with KDE
    sns.histplot(df[col], kde=True, ax=axes[0], color='skyblue')
    axes[0].axvline(df[col].mean(), color='red', linestyle='--', label='Mean')
    axes[0].axvline(df[col].median(), color='green', linestyle='--', label='Median')
    axes[0].set_title(f"Distribution of {col}")
    axes[0].legend()

    # Box plot
    sns.boxplot(x=df[col], ax=axes[1], color='lightgreen')
    axes[1].set_title(f"Box Plot of {col}")

    plt.tight_layout()
    plt.show()

## Analysis Report: Numerical Variables

| Variable            | Mean    | Median  | Std Dev | Skewness | Kurtosis | IQR   | Key Insights                                                    |
|---------------------|---------|---------|---------|----------|----------|-------|-----------------------------------------------------------------|
| Age                 | 36.92   | 36      | 9.14    | 0.42     | -0.34    | 13    | Slight right skew; majority aged 30–45 (Q1=30, Q3=43).          |
| DailyRate           | 802.49  | 802     | 233.78  | 0.03     | -0.86    | 334   | Near-symmetric (skew=0.03); uniform distribution.                |
| MonthlyIncome       | 6,712   | 4,917   | 4,708   | 0.95     | -0.32    | 6,416 | Highly right-skewed (skew=0.95) – executives earn significantly more. |
| TotalWorkingYears   | 11.28   | 10      | 7.78    | 0.83     | 0.18     | 12    | Right-skewed – most employees have <15 years of experience.     |
| YearsAtCompany      | 7.01    | 5       | 6.13    | 1.25     | 0.92     | 7     | Strong right skew – most employees stay ≤10 years.              |

## Key Observations:

**Age:**
- **Distribution:** Approximately normal (Shapiro-Wilk p-value = 0.062).
- **Peak:** Mode = 34, Median = 36.
- **Outliers:** None (post-capping).

**MonthlyIncome:**
- **Skewness:** 0.95 (right-tailed distribution).
- **Outliers:** Capped at 17,890 (originally extended to 199,990).
- **Normality:** Non-normal (Shapiro-Wilk p-value = 0.000).

**DailyRate:**
- **Distribution:** Uniform-like (no significant skew).
- **Spread:** Ranges from 102–1,322 (capped).
- **Normality:** Non-normal (Shapiro-Wilk p-value = 0.000).


In [None]:
# Generate frequency distributions for categorical variables
print("=== Categorical Variables Frequency ===")
for col in cat_cols:
    freq = df[col].value_counts(normalize=True).mul(100).round(2)
    print(f"\n**{col}**\n{freq}")

    # Plot bar chart with annotations
    plt.figure(figsize=(10, 5))
    ax = sns.countplot(x=col, data=df, order=freq.index, palette='viridis')
    plt.title(f"Distribution of {col}", fontsize=14)
    plt.xticks(rotation=45)

    # Add percentage labels
    total = len(df[col])
    for p in ax.patches:
        percentage = f'{100 * p.get_height() / total:.1f}%'
        x = p.get_x() + p.get_width() / 2
        y = p.get_height() + 10
        ax.annotate(percentage, (x, y), ha='center')

    plt.show()

# Check for rare categories (<5% frequency)
print("\n=== Rare Categories (<5% frequency) ===")
for col in cat_cols:
    freq = df[col].value_counts(normalize=True).mul(100)
    rare = freq[freq < 5].index.tolist()
    if rare:
        print(f"{col}: {rare}")

## Analysis Report: Categorical Variables

**Attrition:**
- **No:** 83.9%
- **Yes:** 16.1%
- **Implication:** Severe class imbalance – attrition is rare but critical to study.

**Department:**
- **Research & Development:** 65.0%
- **Sales:** 28.0%
- **HR:** 7.0%
- **Insight:** R&D dominates the workforce; HR is underrepresented.

**EducationField:**
- **Life Sciences:** 41.2%
- **Medical:** 30.3%
- **Marketting:** 8.4%
- **Rare Categories:** Technical Degree (7.8%), Other (6.8%).

**JobRole:**
- **Sales Executive:** 22.4%
- **Research Scientist:** 19.3%
- **Laboratory Technician:** 17.6%
- **Rare Roles:** HR (3.9%), Manager (4.6%).

**BusinessTravel:**
- **Travel_Rarely:** 69.4%
- **Travel_Frequently:** 19.3%
- **Non-Travel:** 11.3%
- **Insight:** Most employees travel rarely.


## Advanced Insights

**Normality Tests:**
- **Age:** Near-normal (Shapiro-Wilk p = 0.062).
- **MonthlyIncome and DailyRate:** Non-normal (Shapiro-Wilk p = 0.000).
- **Action:** Use non-parametric tests for inferential analysis.

**Rare Categories:**
- **JobRole:** "Human Resources" (3.9%) and "Manager" (4.6%) are rare.
- **EducationField:** "Other" (6.8%) – investigate what this includes.

**Skewness Impact:**
- **MonthlyIncome:** Skewness of 0.95 implies mean > median – executives skew the distribution.
- **YearsAtCompany:** Skewness of 1.25 indicates long tenure is rare.
