In [None]:
# 02_feature_engineering.ipynb
# -----------------------------------------------------------
# Purpose: Perform feature engineering on the cleaned dataset.
# Steps:
# 1. Load cleaned dataset
# 2. Handle missing values
# 3. Remove outliers (IQR method)
# 4. Create domain-specific features (e.g., debt-to-income ratio, credit utilization)
# 5. Encode categorical variables
# 6. Scale numerical features
# 7. Save processed dataset
# -----------------------------------------------------------

# Import libraries
from __future__ import annotations
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from typing import Tuple

# ---------------------------
# 1. Load Data
# ---------------------------

def load_cleaned_data(filepath: str) -> pd.DataFrame:
    """
    Read the cleaned dataset from disk.

    The file is parsed with pandas' default CSV settings (comma separator,
    first row used as the header).

    Args:
        filepath (str): Location of the cleaned CSV file.
    Returns:
        pd.DataFrame: The parsed contents of the file.
    """
    cleaned_frame = pd.read_csv(filepath)
    return cleaned_frame

# Location of the cleaned dataset, relative to the notebook's directory.
data_path: str = "../data/processed/credit_data_cleaned.csv"
df: pd.DataFrame = load_cleaned_data(data_path)

# Preview the first rows as a quick sanity check of the load.
display(df.head())
# DataFrame.info() prints its summary directly and returns None, so it is
# called on its own; wrapping it in display() only adds a stray "None" output.
df.info()

# ---------------------------
# 2. Handle Missing Values
# ---------------------------

def handle_missing_values(data: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing values column by column.

      - Float columns (any float width): filled with the column median.
      - All other columns (strings, categoricals, nullable integers, ...):
        filled with the most frequent value (first mode).
      - Columns in which every value is missing are left untouched, since
        neither a median nor a mode exists for them.

    The input frame is not modified; a filled copy is returned.

    Args:
        data (pd.DataFrame): Input DataFrame.
    Returns:
        pd.DataFrame: Copy of ``data`` with missing values handled.
    """
    # Work on a copy and assign whole columns back. The chained
    # ``data[col].fillna(..., inplace=True)`` pattern is deprecated and does
    # not update the parent frame when pandas Copy-on-Write is active.
    filled = data.copy()

    for col in filled.columns:
        series = filled[col]
        if not series.isna().any():
            continue

        if pd.api.types.is_float_dtype(series):
            # NumPy integer columns cannot hold NaN, so float dtypes are the
            # numeric columns that can actually reach this branch.
            fill_value = series.median()
        else:
            modes = series.mode(dropna=True)
            if modes.empty:
                # Entirely missing column: nothing sensible to impute.
                continue
            fill_value = modes.iloc[0]

        filled[col] = series.fillna(fill_value)

    return filled

# Impute missing values and rebind df to the returned frame.
df = handle_missing_values(df)

# ---------------------------
# 3. Handle Outliers
# ---------------------------

def remove_outliers_iqr(data: pd.DataFrame, columns: list[str], factor: float = 1.5) -> pd.DataFrame:
    """
    Drop rows whose values fall outside the IQR fences of the given columns.

    Columns are processed one after another, and each column's quartiles are
    computed on the rows that survived the previous columns' filters. A row is
    kept only if its value lies within
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (bounds inclusive).

    Args:
        data (pd.DataFrame): Input DataFrame.
        columns (list[str]): Numerical columns to check for outliers.
        factor (float): IQR multiplier for outlier detection.
    Returns:
        pd.DataFrame: DataFrame with outlier rows removed; the original index
        labels of the surviving rows are retained.
    """
    remaining = data
    for name in columns:
        values = remaining[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        low_fence = first_quartile - factor * spread
        high_fence = third_quartile + factor * spread
        # between() is inclusive on both ends by default.
        within_fences = values.between(low_fence, high_fence)
        remaining = remaining[within_fences]
    return remaining

# Collect every numeric column and filter outliers on each of them in turn.
# NOTE(review): this selection would also include any numeric identifier or
# label/target column the dataset may contain — confirm those should be
# subject to IQR filtering.
numeric_cols: list[str] = df.select_dtypes(include=[np.number]).columns.tolist()
df = remove_outliers_iqr(df, numeric_cols)

# ---------------------------
# 4. Feature Engineering
# ---------------------------

def create_domain_features(data: pd.DataFrame, *, eps: float = 1e-5) -> pd.DataFrame:
    """
    Create domain-specific ratio features when their source columns exist:

      - ``debt_to_income``     = total_debt / (annual_income + eps)
      - ``credit_utilization`` = current_balance / (total_credit_limit + eps)

    A feature is skipped when either of its source columns is absent. The
    input frame is not modified; the features are added to a copy. Writing
    into the argument directly would mutate the caller's frame and raises
    ``SettingWithCopyWarning`` when that frame is the result of a row filter.

    Args:
        data (pd.DataFrame): Input DataFrame.
        eps (float): Small constant added to each denominator so that a zero
            denominator does not produce a division by zero.
    Returns:
        pd.DataFrame: Copy of ``data`` with the new features added.
    """
    # Example features: Adjust column names as needed based on dataset
    featured = data.copy()

    if {'total_debt', 'annual_income'}.issubset(featured.columns):
        featured['debt_to_income'] = featured['total_debt'] / (featured['annual_income'] + eps)

    if {'current_balance', 'total_credit_limit'}.issubset(featured.columns):
        featured['credit_utilization'] = featured['current_balance'] / (featured['total_credit_limit'] + eps)

    return featured

# Add the ratio features (only those whose source columns are present).
df = create_domain_features(df)

# ---------------------------
# 5. Encode Categorical Variables
# ---------------------------

def encode_categorical_features(data: pd.DataFrame, categorical_cols: list[str]) -> Tuple[pd.DataFrame, OneHotEncoder]:
    """
    Encode categorical variables using OneHotEncoder.

    The encoder drops the first category of every column and ignores
    categories unseen during fitting. The listed columns are replaced by
    their one-hot columns, which are appended after the remaining columns.

    Args:
        data (pd.DataFrame): Input DataFrame.
        categorical_cols (list[str]): List of categorical columns.
    Returns:
        Tuple[pd.DataFrame, OneHotEncoder]: Transformed DataFrame and fitted encoder.
    """
    encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
    encoded = encoder.fit_transform(data[categorical_cols])
    # Reuse the input's index: pd.concat(axis=1) aligns on index labels, so a
    # default RangeIndex here would pair encoded rows with the wrong records
    # (and pad with NaN rows) whenever `data` has a non-contiguous index,
    # e.g. after rows were filtered out.
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=data.index,
    )

    data = pd.concat([data.drop(columns=categorical_cols), encoded_df], axis=1)
    return data, encoder

# Treat every column that is not of a numeric dtype as categorical and
# one-hot encode it; the fitted encoder is kept in ohe_encoder.
categorical_cols: list[str] = df.select_dtypes(exclude=[np.number]).columns.tolist()
df, ohe_encoder = encode_categorical_features(df, categorical_cols)

# ---------------------------
# 6. Scale Numerical Features
# ---------------------------

def scale_features(data: pd.DataFrame, columns: list[str]) -> Tuple[pd.DataFrame, StandardScaler]:
    """
    Standardize the selected columns with a StandardScaler.

    The scaler is fitted on ``data[columns]`` and the scaled values are
    written back into those same columns of ``data`` (the frame passed in is
    updated in place and also returned).

    Args:
        data (pd.DataFrame): Input DataFrame.
        columns (list[str]): Columns to scale.
    Returns:
        Tuple[pd.DataFrame, StandardScaler]: Scaled DataFrame and fitted scaler.
    """
    fitted_scaler = StandardScaler()
    standardized = fitted_scaler.fit_transform(data[columns])
    data[columns] = standardized
    return data, fitted_scaler

# Scale the columns listed in numeric_cols. That list was built before the
# domain features and one-hot columns were added, so those newer columns are
# not scaled here.
# NOTE(review): confirm that leaving the engineered ratio features unscaled
# is intended.
df, scaler = scale_features(df, numeric_cols)

# ---------------------------
# 7. Save Processed Data
# ---------------------------

def save_processed_data(data: pd.DataFrame, filepath: str) -> None:
    """
    Write the processed DataFrame to CSV and report where it was saved.

    The index is not written to the file. A confirmation line is printed
    once the write has completed.

    Args:
        data (pd.DataFrame): DataFrame to save.
        filepath (str): Destination file path.
    """
    data.to_csv(path_or_buf=filepath, index=False)
    confirmation = f"Processed feature data saved at {filepath}"
    print(confirmation)

# Write the final feature table to the processed-data directory.
save_processed_data(df, "../data/processed/credit_data_features.csv")
