preprocessing

1. load data and define outcome
2. create custom winsorizer (addressing outliers)
3. feature engineering
4. feature selection
5. preprocessing pipelines
6. train/test split (leakage-aware)
7. baseline QA

In [10]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder

1. load data and define outcome

In [8]:
# name of the binary target column and location of the raw extract
DATA_PATH = "healthcare_patient_journey.csv"
outcome = "readmitted_30d"

readmit_data = pd.read_csv(DATA_PATH)
print(readmit_data.head())

# split the frame into predictors (X) and the target (y)
X = readmit_data.drop(columns=[outcome], errors="ignore")
y = readmit_data[outcome]

   patient_id  age  gender  chronic_condition admission_type  department  \
0           1   69    male                  0      scheduled   Neurology   
1           2   38    male                  0      emergency    Oncology   
2           3   81    male                  0      scheduled   Neurology   
3           4   67  female                  1      emergency          ER   
4           5   88    male                  1      emergency  Cardiology   

   wait_time_min  length_of_stay_days  procedures_count  medication_count  \
0             41                    2                 0                 3   
1             17                    3                 1                 2   
2             40                    2                 3                 2   
3              7                    4                 5                 9   
4             34                    3                 7                 5   

   complications discharge_status  readmitted_30d  total_cost_€  \
0            

2. create custom winsorizer

In [15]:
class Winsorizer(BaseEstimator, TransformerMixin):
    """
    Winsorization caps extreme values instead of removing them.

    Bounds are learned column-wise in ``fit`` (ignoring NaNs) and applied in
    ``transform``; NaNs pass through unchanged so a later imputer can fill them.

    Parameters
    ----------
    lower_q : float, default=0.05
        Quantile in [0, 1] used as the lower cap (0.05 = 5th percentile).
    upper_q : float, default=0.95
        Quantile in [0, 1] used as the upper cap (0.95 = 95th percentile).

    Attributes
    ----------
    lower_bounds_, upper_bounds_ : ndarray
        Per-column caps learned by ``fit``.
    feature_names_in_ : ndarray of object
        Column names seen during ``fit``; only set when X had a ``columns`` attribute.
    """
    def __init__(self, lower_q=0.05, upper_q=0.95):
        # only store the thresholds here; they are validated in fit so that
        # clone() / set_params() keep working as scikit-learn expects
        self.lower_q = lower_q
        self.upper_q = upper_q

    def fit(self, X, y=None):
        """Learn the per-column lower/upper caps from X.

        Raises
        ------
        ValueError
            If the quantiles are not ordered as 0 <= lower_q <= upper_q <= 1.
        """
        if not 0.0 <= self.lower_q <= self.upper_q <= 1.0:
            raise ValueError(
                "Quantiles must satisfy 0 <= lower_q <= upper_q <= 1, "
                f"got lower_q={self.lower_q!r}, upper_q={self.upper_q!r}."
            )

        # remember DataFrame column names before they are lost in the array conversion;
        # drop stale names when refitting on a plain array
        if hasattr(X, "columns"):
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        elif hasattr(self, "feature_names_in_"):
            del self.feature_names_in_

        X = self._to_array(X)
        self.lower_bounds_ = np.nanpercentile(X, self.lower_q * 100, axis=0)
        self.upper_bounds_ = np.nanpercentile(X, self.upper_q * 100, axis=0)
        return self

    def transform(self, X):
        """Return a new float array with every column clipped to the learned caps."""
        # np.clip allocates its own output, so the input is never modified
        return np.clip(self._to_array(X), self.lower_bounds_, self.upper_bounds_)

    def get_feature_names_out(self, input_features=None):
        """Return output column names; winsorizing keeps columns one-to-one."""
        if input_features is not None:
            return np.asarray(input_features, dtype=object)
        if hasattr(self, "feature_names_in_"):
            return self.feature_names_in_
        # fall back to generic names x0, x1, ... when fitted on a plain array
        n_columns = np.size(self.lower_bounds_)
        return np.asarray([f"x{i}" for i in range(n_columns)], dtype=object)

    def _to_array(self, X):
        # force a float array so integer / object-typed numeric input and NaNs
        # are handled uniformly by nanpercentile and clip
        return np.asarray(X, dtype=float)

3. feature engineering

In [16]:
def engineer_features(X: pd.DataFrame) -> pd.DataFrame:
    """Derive model features from the raw patient-journey columns.

    Requires the columns ``age``, ``procedures_count``, ``medication_count``,
    ``length_of_stay_days``, ``total_cost_€`` and ``satisfaction_score``.
    ``patient_id`` is dropped when present.

    Parameters
    ----------
    X : pd.DataFrame
        Raw predictors. The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the original columns (minus ``patient_id``) plus
        ``is_elderly``, ``age_group``, ``procedures_per_day``, ``meds_per_day``,
        ``log_LOS``, ``long_stay``, ``cost_per_day`` and ``low_satisfaction``.
    """
    # drop() already returns a new frame, so the caller's data is left untouched
    X = X.drop(columns=["patient_id"], errors="ignore")

    los = X["length_of_stay_days"]

    # age features
    X["is_elderly"] = (X["age"] >= 65).astype(int)
    # bins are right-closed: (0, 40], (40, 65], (65, 80], (80, 120];
    # include_lowest=True keeps age 0 in "young" instead of turning it into NaN.
    # NOTE(review): age 65 lands in "adult" here while is_elderly flags it — confirm intended.
    X["age_group"] = pd.cut(
        X["age"],
        bins=[0, 40, 65, 80, 120],
        labels=["young", "adult", "senior", "elderly"],
        include_lowest=True,
    )

    # utilization intensity features (+1 avoids dividing by zero for same-day stays)
    X["procedures_per_day"] = X["procedures_count"] / (los + 1)
    X["meds_per_day"] = X["medication_count"] / (los + 1)

    # length of stay features
    X["log_LOS"] = np.log1p(los)
    X["long_stay"] = (los > 7).astype(int)

    # cost feature: stays of zero (or invalid negative) days become NaN rather than
    # +/-inf, so the value can be imputed downstream instead of breaking scaling
    X["cost_per_day"] = X["total_cost_€"] / los.where(los > 0)

    # satisfaction feature
    X["low_satisfaction"] = (X["satisfaction_score"] <= 3).astype(int)

    return X

# wrap the function as a transformer so it can be used as a Pipeline step
feature_eng = FunctionTransformer(func=engineer_features)

4. feature selection

In [17]:
# fit feature engineering
X_fe = feature_eng.fit_transform(X) # fit() learns parameters, transform() applies learned parameters to data

num_cols = make_column_selector(dtype_include=np.number)(X_fe)
cat_cols = make_column_selector(dtype_include=["object","category","bool"])(X_fe)

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

Numeric columns: ['age', 'chronic_condition', 'wait_time_min', 'length_of_stay_days', 'procedures_count', 'medication_count', 'complications', 'total_cost_€', 'satisfaction_score', 'is_elderly', 'procedures_per_day', 'meds_per_day', 'log_LOS', 'long_stay', 'cost_per_day', 'low_satisfaction']
Categorical columns: ['gender', 'admission_type', 'department', 'discharge_status', 'age_group']


5. preprocessing pipelines

In [18]:
# numeric pipeline: winsorization -> impute -> scale
# numeric branch: cap outliers, then fill gaps with the median, then standardize
numeric_transformer = Pipeline(
    steps=[
        ("winsor", Winsorizer(lower_q=0.05, upper_q=0.95)),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# categorical branch: fill gaps with the mode, then one-hot encode
# (categories unseen during fit are encoded as all zeros)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# route each column family to its branch
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

6. train/test split

In [19]:
# hold out 20% of rows for testing; stratifying on y keeps the class balance in both splits
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

7. baseline QA

In [14]:
# the three rates should be close to each other if the stratified split worked
rate_sources = (
    ("Overall", readmit_data[outcome]),
    ("Train", y_train),
    ("Test", y_test),
)
for split_name, target in rate_sources:
    print(f"{split_name} readmission rate:", target.mean())

Overall readmission rate: 0.23466666666666666
Train readmission rate: 0.23458333333333334
Test readmission rate: 0.235


full pipeline

In [None]:
# end-to-end model: raw frame -> engineered features -> preprocessing -> classifier.
# Fitting the whole pipeline on the training split means the winsorization bounds,
# imputation values and scaling statistics are learned from X_train only.
pipeline = Pipeline([
    ("feature_eng", feature_eng),
    ("preprocessing", preprocessor),
    # class_weight="balanced" reweights classes inversely to their frequency
    ("model", LogisticRegression(max_iter=1000, class_weight="balanced"))
])

pipeline.fit(X_train, y_train)