data preprocessing

In [None]:
# import libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

1. load data

In [6]:
readmit_data = pd.read_csv("healthcare_patient_journey.csv")
readmit_data.head()

Unnamed: 0,patient_id,age,gender,chronic_condition,admission_type,department,wait_time_min,length_of_stay_days,procedures_count,medication_count,complications,discharge_status,readmitted_30d,total_cost_€,satisfaction_score
0,1,69,male,0,scheduled,Neurology,41,2,0,3,1,referred,1,1440,2
1,2,38,male,0,emergency,Oncology,17,3,1,2,0,recovered,0,2060,3
2,3,81,male,0,scheduled,Neurology,40,2,3,2,0,recovered,0,2110,3
3,4,67,female,1,emergency,ER,7,4,5,9,0,recovered,0,4070,3
4,5,88,male,1,emergency,Cardiology,34,3,7,5,0,recovered,1,3800,3


2. identify outcome and features

In [7]:
# identify outcome
outcome = "readmitted_30d"

# identify feature types
numeric_features = readmit_data.select_dtypes(include=['int64', 'float64']).columns
categorical_features = readmit_data.select_dtypes(include=['object', 'category', 'str']).columns

print("Outcome:", outcome)
print("Numeric:", numeric_features)
print("Categorical:", categorical_features)

Outcome: readmitted_30d
Numeric: Index(['patient_id', 'age', 'chronic_condition', 'wait_time_min',
       'length_of_stay_days', 'procedures_count', 'medication_count',
       'complications', 'readmitted_30d', 'total_cost_€',
       'satisfaction_score'],
      dtype='str')
Categorical: Index(['gender', 'admission_type', 'department', 'discharge_status'], dtype='str')


3. feature engineering

In [None]:
def engineer_features(X: pd.DataFrame) -> pd.DataFrame:
    X = readmit_data.copy()

    # age features
    X["is_elderly"] = (X["age"] >= 65).astype(int)
    X["age_group"] = pd.cut(
        X["age"],
        bins=[0, 40, 65, 80, 120],
        labels=["young", "adult", "senior", "elderly"]
    )

    # utilization intensity features
    X["procedures_per_day"] = X["procedures_count"] / (X["length_of_stay_days"] + 1)
    X["meds_per_day"] = X["medication_count"] / (X["length_of_stay_days"] + 1)

    # length of stay features
    X["log_LOS"] = np.log1p(X["length_of_stay_days"])
    X["long_stay"] = (X["length_of_stay_days"] > 7).astype(int)

    # cost feature
    X["cost_per_day"] = X["total_cost_€"] / (X["length_of_stay_days"])

    # satisfaction feature
    X["low_satisfaction"] = (X["satisfaction_score"] <= 3).astype(int)

    return X

feature_eng = FunctionTransformer(engineer_features, feature_names_out = "one-to-one")

4. feature selection

5. EDA for engineered features

In [None]:
# numeric preprocessing
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")), # impute missing values with median
    ("scaler", StandardScaler()) # normalize features
])

# categorical preprocessing
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")), # replace missing category with most common category
    ("encoder", OneHotEncoder(handle_unknown="ignore")) # if new category appears in test data, fill in with 0s
])

# combine preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

x. split into training and test data

In [None]:
outcome = "readmitted_30d"

X = readmit_data.drop(outcome, axis=1)
y = readmit_data[outcome]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
    stratify=y # for classification
)