# 03 — Feature Engineering & Preprocessing

## HumanForYou — Employee Attrition Prediction

---

### Objective

Transform raw merged data into **model-ready features**:

1. Handle missing values with justified strategies
2. Encode categorical variables (ordinal vs. one-hot)
3. Engineer new features from existing ones
4. Scale numerical features
5. Address class imbalance (SMOTE)
6. Export train/test splits for modeling

> This notebook expects `merged_data.csv` from **01_Data_Validation_Pipeline**.

## Section 1: Setup

In [None]:
# ==============================================================================
# IMPORTS
# ==============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Directory holding the pipeline artifacts; the merged dataset is read from it.
OUTPUT_DIR = "../outputs"

df = pd.read_csv(f"{OUTPUT_DIR}/merged_data.csv")

# Recode the target as 0/1: rows whose Attrition equals "Yes" become 1,
# every other value becomes 0.
df["Attrition"] = df["Attrition"].eq("Yes").astype(int)

print(f"Loaded: {df.shape[0]} rows × {df.shape[1]} columns")

## Section 2: Missing Value Treatment

**Strategy**:

- Survey columns (EnvironmentSatisfaction, JobSatisfaction, WorkLifeBalance): impute with **median** (ordinal data)
- Badge features: impute with **median** (continuous data)
- Justification: the median is robust to outliers and preserves the ordinal nature of survey scales

In [None]:
# ==============================================================================
# MISSING VALUE IMPUTATION
# ==============================================================================
# Fills NaN in the survey, badge and work-history columns with each column's
# median and reports what was filled. Columns that are absent from `df` or
# have no missing values are skipped.
#
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split performed later in this notebook, so test rows contribute
# to the imputation statistics. Consider imputing after the split.

print("MISSING VALUES BEFORE IMPUTATION")
print("=" * 65)
missing = df.isnull().sum()
missing = missing[missing > 0]
print(missing.to_string())

# Survey columns — median imputation
survey_cols = ["EnvironmentSatisfaction", "JobSatisfaction", "WorkLifeBalance"]
for col in survey_cols:
    if col in df.columns and df[col].isnull().any():
        # Count BEFORE filling; counting afterwards always reports 0.
        n_missing = int(df[col].isnull().sum())
        median_val = df[col].median()
        # Assign the filled column back rather than calling
        # `df[col].fillna(..., inplace=True)`: the chained in-place form acts
        # on an intermediate object and does not update `df` under pandas
        # Copy-on-Write.
        df[col] = df[col].fillna(median_val)
        print(f"  → {col}: imputed {n_missing} NaN with median = {median_val}")

# Badge features — median imputation
badge_cols = ["avg_arrival_hour", "avg_departure_hour", "avg_working_hours", "absence_rate", "late_arrival_rate"]
for col in badge_cols:
    if col in df.columns and df[col].isnull().any():
        median_val = df[col].median()
        df[col] = df[col].fillna(median_val)
        print(f"  → {col}: imputed with median = {median_val:.3f}")

# Work-history columns (NumCompaniesWorked, TotalWorkingYears) — median imputation
for col in ["NumCompaniesWorked", "TotalWorkingYears"]:
    if col in df.columns and df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Should print 0 unless a column outside the lists above contains NaN.
print(f"\nRemaining missing values: {df.isnull().sum().sum()}")

## Section 3: Feature Engineering

Create new meaningful features from existing data to improve model performance.

In [None]:
# ==============================================================================
# NEW FEATURES
# ==============================================================================
# Each derived column is created only when all of its source columns exist.

# Salary relative to hierarchy level
if {"MonthlyIncome", "JobLevel"}.issubset(df.columns):
    df["IncomePerJobLevel"] = df["MonthlyIncome"].div(df["JobLevel"])

# Time since last promotion as a fraction of tenure
# (the +1 keeps the denominator non-zero when YearsAtCompany is 0)
if {"YearsSinceLastPromotion", "YearsAtCompany"}.issubset(df.columns):
    df["PromotionStagnation"] = df["YearsSinceLastPromotion"].div(
        df["YearsAtCompany"].add(1)
    )

# Composite satisfaction: row-wise mean of whichever survey items are present
survey_items = ["EnvironmentSatisfaction", "JobSatisfaction", "WorkLifeBalance"]
existing_items = [item for item in survey_items if item in df.columns]
if existing_items:
    df["SatisfactionScore"] = df[existing_items].mean(axis=1)

# Time with the current manager as a fraction of tenure (same +1 offset)
if {"YearsWithCurrManager", "YearsAtCompany"}.issubset(df.columns):
    df["ManagerStability"] = df["YearsWithCurrManager"].div(
        df["YearsAtCompany"].add(1)
    )

# Overtime proxy: 1 when average working hours exceed 9, else 0
if "avg_working_hours" in df.columns:
    df["LongHours"] = df["avg_working_hours"].gt(9).astype(int)

# Keep only the engineered columns that were actually created above
new_features = [
    name
    for name in (
        "IncomePerJobLevel",
        "PromotionStagnation",
        "SatisfactionScore",
        "ManagerStability",
        "LongHours",
    )
    if name in df.columns
]
print(f"New features created: {new_features}")
df[new_features].describe().round(3)

## Section 4: Categorical Encoding

**Strategy**:

- **Ordinal encoding** for features with natural order (Education, BusinessTravel)
- **One-hot encoding** for nominal features (Department, EducationField, JobRole, Gender, MaritalStatus)

> **Ethical note**: Gender and MaritalStatus are encoded but will be monitored for fairness during model evaluation.

In [None]:
# ==============================================================================
# CATEGORICAL ENCODING
# ==============================================================================
# Ordinal-encodes BusinessTravel with an explicit mapping, then one-hot
# encodes every other object-dtype column (dropping the first level of each).
#
# Raises:
#     ValueError: if BusinessTravel holds a non-null value that has no entry
#         in `bt_map` (this includes re-running the cell on an already
#         encoded column).

# Identify categorical columns
# NOTE(review): this relies on string columns being stored with object dtype —
# confirm for the pandas version in use.
cat_cols = df.select_dtypes(include="object").columns.tolist()
print(f"Categorical columns to encode: {cat_cols}")

# Ordinal encoding for BusinessTravel
bt_map = {"Non-Travel": 0, "Travel_Rarely": 1, "Travel_Frequently": 2}
if "BusinessTravel" in df.columns:
    bt_encoded = df["BusinessTravel"].map(bt_map)
    # `Series.map` turns any value missing from the mapping into NaN. Fail
    # loudly here instead of letting silently introduced NaN reach the later
    # scaling / resampling steps.
    newly_missing = df["BusinessTravel"].notna() & bt_encoded.isna()
    if newly_missing.any():
        unmapped = df.loc[newly_missing, "BusinessTravel"].unique().tolist()
        raise ValueError(
            f"BusinessTravel contains values with no ordinal mapping: {unmapped}"
        )
    df["BusinessTravel"] = bt_encoded

# One-hot encoding for remaining categoricals
ohe_cols = [c for c in cat_cols if c != "BusinessTravel"]
df = pd.get_dummies(df, columns=ohe_cols, drop_first=True, dtype=int)

print(f"\nPost-encoding shape: {df.shape[0]} rows × {df.shape[1]} columns")

## Section 5: Train / Test Split & Scaling

In [None]:
# ==============================================================================
# TRAIN / TEST SPLIT
# ==============================================================================
# Separates the target from the features and makes an 80/20 split that is
# stratified on the target, with a fixed seed.

# EmployeeID is an identifier, not a feature: keep a copy aside, then remove
# the column from the frame in place.
if "EmployeeID" in df.columns:
    employee_ids = df["EmployeeID"].copy()
    del df["EmployeeID"]

y = df["Attrition"]
X = df.drop(columns="Attrition")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Report split sizes and the positive-class share of each split.
print(
    f"Train set: {X_train.shape[0]} samples ({y_train.mean()*100:.1f}% attrition)",
    f"Test set:  {X_test.shape[0]} samples ({y_test.mean()*100:.1f}% attrition)",
    sep="\n",
)

In [None]:
# ==============================================================================
# FEATURE SCALING — StandardScaler
# ==============================================================================
# The scaler is fitted on the training features only; the fitted scaler is
# then applied to both splits. Results are wrapped back into DataFrames so
# the original column names and row index are kept.

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

print("Scaling applied (StandardScaler — fit on train only).")

## Section 6: Class Imbalance — SMOTE

Apply SMOTE **only on the training set** to avoid data leakage.

In [None]:
# ==============================================================================
# SMOTE OVERSAMPLING (train set only)
# ==============================================================================
# Resamples the scaled training features and their labels with a seeded SMOTE
# instance; the test split is not passed to the sampler.

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_scaled,
    y_train,
)

# Class counts before and after resampling.
print(
    f"Before SMOTE: {y_train.value_counts().to_dict()}",
    f"After  SMOTE: {pd.Series(y_train_resampled).value_counts().to_dict()}",
    sep="\n",
)

## Section 7: Export Preprocessed Data

In [None]:
# ==============================================================================
# EXPORT
# ==============================================================================
# Writes the model-ready splits, the fitted scaler and the feature list to
# OUTPUT_DIR. All CSVs are written without the row index.
import joblib

# Resampled training data and the scaled test data
X_train_resampled.to_csv(f"{OUTPUT_DIR}/X_train.csv", index=False)
X_test_scaled.to_csv(f"{OUTPUT_DIR}/X_test.csv", index=False)
pd.Series(y_train_resampled, name="Attrition").to_csv(f"{OUTPUT_DIR}/y_train.csv", index=False)
y_test.to_csv(f"{OUTPUT_DIR}/y_test.csv", index=False)

# Scaled but not resampled training data, kept for fairness analysis
X_train_scaled.to_csv(f"{OUTPUT_DIR}/X_train_no_smote.csv", index=False)
y_train.to_csv(f"{OUTPUT_DIR}/y_train_no_smote.csv", index=False)

# Fitted scaler, persisted for reproducibility
joblib.dump(scaler, f"{OUTPUT_DIR}/scaler.joblib")

# Feature names, one per line, no header
feature_names = X_train.columns.tolist()
pd.Series(feature_names).to_csv(f"{OUTPUT_DIR}/feature_names.csv", index=False, header=False)

# Summary of what was written
print(
    f"Exported to {OUTPUT_DIR}/:",
    f"  X_train.csv          ({X_train_resampled.shape})",
    f"  X_test.csv           ({X_test_scaled.shape})",
    f"  y_train.csv          ({len(y_train_resampled)})",
    f"  y_test.csv           ({len(y_test)})",
    f"  feature_names.csv    ({len(feature_names)} features)",
    "  scaler.joblib",
    "\n✓ Preprocessing complete — proceed to 04_Model_Benchmark.ipynb",
    sep="\n",
)