## **Notebook Objective**

This notebook prepares the dataset for predictive modeling by:

* Encoding categorical variables
* Creating interaction and nonlinear features
* Transforming skewed targets
* Producing clean train/test datasets

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns

# Render floats with four decimal places in every DataFrame/Series display.
pd.options.display.float_format = "{:.4f}".format

In [None]:
# Relative path to the raw insurance dataset.
DATA_PATH = "../data/raw/insurance.csv"

# Load the raw CSV into the working DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Preview the first rows as a quick sanity check of the load.
df.head(n=5)

In [None]:
# Name of the raw prediction target column.
target = "charges"

# Raw input columns, grouped by type.
numerical_features = [
    "age",
    "bmi",
    "children",
]
categorical_features = [
    "sex",
    "smoker",
    "region",
]

# Display both lists for reference.
(numerical_features, categorical_features)

In [None]:
# Histogram (with KDE overlay) of the raw target, to inspect its shape
# before any transformation is applied.
raw_charges = df[target]
ax = sns.histplot(raw_charges, bins=50, kde=True)
ax.set_title("Original Charges Distribution")
plt.show()

In [None]:
# log(1 + charges): the transformed target column, used as `y` when the
# train/test split is built later in the notebook.
log_charges = np.log1p(df[target])
df["log_charges"] = log_charges

In [None]:
# Same histogram as for the raw target, now on the log1p-transformed column,
# so the two distributions can be compared visually.
transformed_charges = df["log_charges"]
ax = sns.histplot(transformed_charges, bins=50, kde=True)
ax.set_title("Log-Transformed Charges Distribution")
plt.show()

In [None]:
# Bucket BMI into weight classes using left-closed intervals:
#   [0, 18.5) underweight, [18.5, 25) normal, [25, 30) overweight,
#   [30, inf) obese.
# right=False places a BMI of exactly 30 in "obese", which matches the
# `bmi >= 30` threshold used by the smoker/obesity interaction feature
# (the default right-closed bins would label it "overweight").
# The open-ended top bin keeps extreme BMI values from turning into NaN.
df["bmi_category"] = pd.cut(
    df["bmi"],
    bins=[0, 18.5, 25, 30, np.inf],
    right=False,
    labels=["underweight", "normal", "overweight", "obese"],
)

In [None]:
# Binary flag: 1 when the row is a smoker AND has a BMI of 30 or more,
# otherwise 0.
is_smoker = df["smoker"] == "yes"
has_high_bmi = df["bmi"] >= 30
df["smoker_bmi_interaction"] = np.where(is_smoker & has_high_bmi, 1, 0)

In [None]:
# Squared age term: a nonlinear feature derived from `age`.
df["age_squared"] = df["age"].pow(2)

In [None]:
# Show each engineered column next to the raw column it was derived from.
preview_columns = [
    "age",
    "age_squared",
    "bmi",
    "bmi_category",
    "smoker",
    "smoker_bmi_interaction",
    "log_charges",
]
df[preview_columns].head()

In [None]:
# Numeric model inputs: the raw numeric columns plus the engineered
# `age_squared` and `smoker_bmi_interaction` columns.
numerical_features_extended = [
    "age",
    "age_squared",
    "bmi",
    "children",
    "smoker_bmi_interaction",
]

# Categorical model inputs (one-hot encoded by the preprocessor).
# `smoker` is kept here alongside the engineered `bmi_category`: the
# smoker/BMI interaction term is part of the numeric features, so the
# smoker main effect has to be available to the model as well.
categorical_features_extended = [
    "sex",
    "smoker",
    "region",
    "bmi_category",
]

In [None]:
# Feature matrix: the extended numeric columns followed by the categorical ones.
model_columns = numerical_features_extended + categorical_features_extended
X = df[model_columns]

# The modeling target is the log1p-transformed charges column.
y = df["log_charges"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

(X_train.shape, X_test.shape)

In [None]:
# Numeric branch: standardize every numeric column.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Categorical branch: one-hot encode, dropping the first level of each
# feature. Unknown categories at transform time are ignored rather than
# raising an error.
encoder = OneHotEncoder(drop="first", handle_unknown="ignore")
categorical_transformer = Pipeline([("onehot", encoder)])

# Route each column group to its branch. The branch names ("num", "cat") and
# step names ("scaler", "onehot") are looked up by name later in the notebook.
column_branches = [
    ("num", numeric_transformer, numerical_features_extended),
    ("cat", categorical_transformer, categorical_features_extended),
]
preprocessor = ColumnTransformer(column_branches)

In [None]:
# Fit the preprocessing on the training rows only, then apply the already
# fitted transformer to the test rows.
X_train_processed = preprocessor.fit_transform(X=X_train)
X_test_processed = preprocessor.transform(X=X_test)

# Display the shapes of both processed matrices.
tuple(matrix.shape for matrix in (X_train_processed, X_test_processed))

In [None]:
# Pull the fitted one-hot encoder out of the "cat" branch of the preprocessor
# and ask it for the names of the columns it generated.
fitted_cat_pipeline = preprocessor.named_transformers_["cat"]
fitted_onehot = fitted_cat_pipeline.named_steps["onehot"]
cat_features = fitted_onehot.get_feature_names_out(categorical_features_extended)

# Column order of the processed matrix: numeric columns first, then the
# one-hot columns.
feature_names = [*numerical_features_extended, *cat_features]

len(feature_names)

In [None]:
from pathlib import Path

import joblib

# Persist the fitted preprocessor so the same transformations can be reused
# outside this notebook.
PREPROCESSOR_PATH = "../data/processed/preprocessor.pkl"

# joblib.dump does not create missing directories; make sure the output
# folder exists so the cell also works on a fresh checkout.
Path(PREPROCESSOR_PATH).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(preprocessor, PREPROCESSOR_PATH)

In [None]:
# Summary table of the feature engineering done in this notebook.
# Counts and flags are derived from the feature lists instead of being
# hard-coded, so the table stays correct if those lists change.
n_encoded_columns = len(feature_names) - len(numerical_features_extended)

feature_summary = {
    "Original Numerical Features": len(numerical_features),
    # Size of the full extended numeric list (raw + engineered columns).
    "Engineered Numerical Features": len(numerical_features_extended),
    # One-hot columns produced by the categorical branch.
    "Categorical Encoded Features": n_encoded_columns,
    "Target Transformation": "log1p(charges)",
    "High-Risk Interaction Added": (
        "smoker_bmi_interaction" in numerical_features_extended
    ),
}

pd.DataFrame.from_dict(feature_summary, orient="index", columns=["Value"])

## **Key Takeaways**

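* Log-transforming the target (`log1p(charges)`) reduces the skew of the charges distribution; predictions made on this scale must be converted back with `np.expm1`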
* Interaction terms capture **nonlinear risk structures**
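* The fitted preprocessing pipeline is saved to `../data/processed/preprocessor.pkl`, so the same transformations can be reapplied reproducibly at inference time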
* Data is now **model-ready**