# 03 — Feature Engineering & Preprocessing

## HumanForYou — Employee Attrition Prediction

---

### Objective

Transform raw merged data into **model-ready features**:

1. Handle missing values with justified strategies
2. Encode categorical variables (ordinal vs. one-hot)
3. Engineer new features from existing ones
4. Scale numerical features
5. Address class imbalance (SMOTE)
6. Export train/test splits for modeling

> This notebook expects `merged_data.csv` from **01_Data_Validation_Pipeline**.

## Section 1: Setup

In [None]:
# ==============================================================================
# IMPORTS
# ==============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# --- Path Configuration (same logic as notebook 01) ---
# The project root is the first of (working directory, its parent) that holds a
# `data/raw` folder; when neither does, a fixed absolute path is used instead.
_cwd = Path.cwd()
PROJECT_ROOT = next(
    (base for base in (_cwd, _cwd.parent) if (base / "data" / "raw").exists()),
    Path(r"c:\Users\yanis\Documents\CESI\A5\AI Project\HumanForYou"),
)

# Kept as a plain string because the rest of the notebook builds paths with f-strings.
OUTPUT_DIR = str(PROJECT_ROOT / "outputs")

df = pd.read_csv(f"{OUTPUT_DIR}/merged_data.csv")

# Binary target: 1 where Attrition equals "Yes", 0 for every other value.
df["Attrition"] = df["Attrition"].eq("Yes").astype(int)

_n_rows, _n_cols = df.shape
print(f"Loaded: {_n_rows} rows x {_n_cols} columns")

## Section 2: Feature Engineering & Encoding (before split)

Features that don't require fitting on data (no leakage risk) are created before the split.
Imputation happens **after** the split to avoid data leakage.

In [None]:
# ==============================================================================
# FEATURE ENGINEERING (safe before split — no fitted statistics)
# ==============================================================================
# Each derived column is computed row by row from existing columns and is only
# created when all of its source columns are present. The creation order is kept
# fixed because it determines the column order of the feature matrix.


def _has_all(*names):
    """Return True when every named column currently exists in ``df``."""
    return all(name in df.columns for name in names)


# Income per job level
if _has_all("MonthlyIncome", "JobLevel"):
    df["IncomePerJobLevel"] = df["MonthlyIncome"].div(df["JobLevel"])

# Promotion stagnation (the +1 keeps the denominator non-zero when tenure is 0)
if _has_all("YearsSinceLastPromotion", "YearsAtCompany"):
    tenure_plus_one = df["YearsAtCompany"].add(1)
    df["PromotionStagnation"] = df["YearsSinceLastPromotion"].div(tenure_plus_one)

# Satisfaction composite score: row-wise mean of whichever survey items exist
survey_items = ["EnvironmentSatisfaction", "JobSatisfaction", "WorkLifeBalance"]
existing_items = [item for item in survey_items if _has_all(item)]
if len(existing_items) > 0:
    df["SatisfactionScore"] = df.loc[:, existing_items].mean(axis="columns")

# Manager stability (same +1 denominator as above)
if _has_all("YearsWithCurrManager", "YearsAtCompany"):
    tenure_plus_one = df["YearsAtCompany"].add(1)
    df["ManagerStability"] = df["YearsWithCurrManager"].div(tenure_plus_one)

# Long hours proxy: 1 when the average working hours exceed 9, else 0
if _has_all("avg_working_hours"):
    df["LongHours"] = df["avg_working_hours"].gt(9).astype(int)

# Report only the engineered features that were actually created.
_candidate_features = (
    "IncomePerJobLevel",
    "PromotionStagnation",
    "SatisfactionScore",
    "ManagerStability",
    "LongHours",
)
new_features = [name for name in _candidate_features if name in df.columns]
print(f"New features created: {new_features}")

# ==============================================================================
# CATEGORICAL ENCODING (deterministic — no leakage)
# ==============================================================================

cat_cols = df.select_dtypes(include="object").columns.tolist()
print(f"Categorical columns to encode: {cat_cols}")

# Ordinal encoding for BusinessTravel
bt_map = {"Non-Travel": 0, "Travel_Rarely": 1, "Travel_Frequently": 2}
# Only map while the column still holds labels. If it is already numeric (for
# example because this cell was re-run), mapping again would turn every value
# into NaN, since Series.map yields NaN for anything that is not a key of bt_map.
if "BusinessTravel" in df.columns and not pd.api.types.is_numeric_dtype(df["BusinessTravel"]):
    bt_labels = df["BusinessTravel"]
    # Labels missing from bt_map become NaN and would later be median-imputed
    # without any trace, so report them here. print() is used because the setup
    # cell silences warnings with warnings.filterwarnings("ignore").
    unmapped_labels = sorted(set(bt_labels.dropna().unique()) - set(bt_map), key=str)
    if unmapped_labels:
        print(f"WARNING: BusinessTravel labels not in bt_map will become NaN: {unmapped_labels}")
    df["BusinessTravel"] = bt_labels.map(bt_map)

# One-hot encoding for remaining categoricals (first level dropped as the baseline)
ohe_cols = [c for c in cat_cols if c != "BusinessTravel"]
df = pd.get_dummies(df, columns=ohe_cols, drop_first=True, dtype=int)

print(f"Post-encoding shape: {df.shape[0]} rows x {df.shape[1]} columns")

## Section 3: Train / Test Split

Split **before** imputation to prevent data leakage.

In [None]:
# ==============================================================================
# TRAIN / TEST SPLIT (before imputation to avoid leakage)
# ==============================================================================

# Keep a copy of the identifiers, then remove them from the modelling frame.
if "EmployeeID" in df.columns:
    employee_ids = df["EmployeeID"].copy()
    df = df.drop(columns="EmployeeID")

# Target vector and feature matrix.
y = df["Attrition"]
X = df.drop(columns="Attrition")

# 80/20 split, stratified on the target, with a fixed seed.
_split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_options)

_train_rate = y_train.mean() * 100
_test_rate = y_test.mean() * 100
print(f"Train set: {len(X_train)} samples ({_train_rate:.1f}% attrition)")
print(f"Test set:  {len(X_test)} samples ({_test_rate:.1f}% attrition)")

# Total count of missing cells per split (imputation has not run yet).
_train_missing = X_train.isna().sum().sum()
_test_missing = X_test.isna().sum().sum()
print(f"Missing in train: {_train_missing}")
print(f"Missing in test:  {_test_missing}")

## Section 4: Imputation & Scaling (fit on train only)

**Strategy**:
- Median imputation for all columns with missing values (robust to outliers)
- **Fit on train, transform both** — no leakage

In [None]:
# ==============================================================================
# IMPUTATION — fit on train only, transform both
# ==============================================================================

# Identify columns with missing values in EITHER split. Looking at the training
# split alone would leave a column that is incomplete only in the test split
# un-imputed, and its NaNs would remain in X_test.
_na_in_train = X_train.isnull().any()
_na_in_test = X_test.isnull().any()
cols_with_na = X_train.columns[_na_in_train | _na_in_test].tolist()
print(f"Columns to impute: {cols_with_na}")

# Median imputer — the medians are always learned from the training split only.
imputer = SimpleImputer(strategy="median")
if cols_with_na:
    X_train[cols_with_na] = imputer.fit_transform(X_train[cols_with_na])
    X_test[cols_with_na] = imputer.transform(X_test[cols_with_na])
# With no incomplete column there is nothing to fit: SimpleImputer rejects a
# zero-column input, so the imputer is intentionally left unfitted in that case.

print(f"Imputation complete (median, fit on train only)")
print(f"Remaining NaN — train: {X_train.isnull().sum().sum()}, test: {X_test.isnull().sum().sum()}")

# ==============================================================================
# SCALING — fit on train only
# ==============================================================================

# Learn the scaling statistics from the training split only.
scaler = StandardScaler().fit(X_train)


def _scaled_frame(frame):
    """Apply the fitted scaler and restore the frame's column labels and index."""
    return pd.DataFrame(
        scaler.transform(frame),
        columns=frame.columns,
        index=frame.index,
    )


X_train_scaled = _scaled_frame(X_train)
X_test_scaled = _scaled_frame(X_test)

print("Scaling applied (StandardScaler — fit on train only).")

## Section 5: Class Imbalance — SMOTE

Apply SMOTE **only on the training set** to avoid data leakage.

In [None]:
# ==============================================================================
# SMOTE OVERSAMPLING (train set only)
# ==============================================================================

# Only the scaled training split is resampled; the test split is left untouched.
# NOTE(review): the feature matrix includes one-hot and ordinal columns, for which
# SMOTE presumably produces interpolated (non-integer) values — confirm this is acceptable.
smote = SMOTE(random_state=42)
_resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_resampled, y_train_resampled = _resampled

# Class counts before and after oversampling.
_counts_before = y_train.value_counts().to_dict()
_counts_after = pd.Series(y_train_resampled).value_counts().to_dict()
print(f"Before SMOTE: {_counts_before}")
print(f"After  SMOTE: {_counts_after}")

## Section 6: Export Preprocessed Data

In [ ]:
# ==============================================================================
# EXPORT
# ==============================================================================
import joblib

# CSV artefacts, written in this order and always without the index column:
#   - SMOTE-resampled + scaled training data and the scaled test data
#   - non-SMOTE versions for fairness analysis and honest CV
#   - pre-scaling train/test for fairness (unscaled binary columns)
_csv_exports = [
    ("X_train.csv", X_train_resampled),
    ("X_test.csv", X_test_scaled),
    ("y_train.csv", pd.Series(y_train_resampled, name="Attrition")),
    ("y_test.csv", y_test),
    ("X_train_no_smote.csv", X_train_scaled),
    ("y_train_no_smote.csv", y_train),
    ("X_train_unscaled.csv", X_train),
    ("X_test_unscaled.csv", X_test),
]
for _filename, _table in _csv_exports:
    _table.to_csv(f"{OUTPUT_DIR}/{_filename}", index=False)

# Save scaler and imputer for reproducibility
for _filename, _estimator in (("scaler.joblib", scaler), ("imputer.joblib", imputer)):
    joblib.dump(_estimator, f"{OUTPUT_DIR}/{_filename}")

# Save feature names (one per line, no header)
feature_names = X_train.columns.tolist()
pd.Series(feature_names).to_csv(f"{OUTPUT_DIR}/feature_names.csv", index=False, header=False)

# Summary of what was written, followed by a recap of the pipeline order.
_summary_lines = [
    f"Exported to {OUTPUT_DIR}/:",
    f"  X_train.csv              ({X_train_resampled.shape}) — SMOTE + scaled",
    f"  X_test.csv               ({X_test_scaled.shape}) — scaled",
    f"  X_train_unscaled.csv     ({X_train.shape}) — for fairness analysis",
    f"  X_test_unscaled.csv      ({X_test.shape}) — for fairness analysis",
    "  y_train.csv / y_test.csv",
    "  scaler.joblib / imputer.joblib",
    f"  feature_names.csv        ({len(feature_names)} features)",
    "\nPipeline: split -> impute (fit train) -> scale (fit train) -> SMOTE (train only)",
    "No data leakage.",
    "\n-> Proceed to 04_Model_Benchmark.ipynb",
]
print("\n".join(_summary_lines))