Data preprocessing for the patient readmission dataset

In [None]:
# import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
# read the patient-journey CSV (path is relative to the working directory)
readmit_data = pd.read_csv(filepath_or_buffer="healthcare_patient_journey.csv")

# preview the first five rows
readmit_data.head(n=5)

Unnamed: 0,patient_id,age,gender,chronic_condition,admission_type,department,wait_time_min,length_of_stay_days,procedures_count,medication_count,complications,discharge_status,readmitted_30d,total_cost_€,satisfaction_score
0,1,69,male,0,scheduled,Neurology,41,2,0,3,1,referred,1,1440,2
1,2,38,male,0,emergency,Oncology,17,3,1,2,0,recovered,0,2060,3
2,3,81,male,0,scheduled,Neurology,40,2,3,2,0,recovered,0,2110,3
3,4,67,female,1,emergency,ER,7,4,5,9,0,recovered,0,4070,3
4,5,88,male,1,emergency,Cardiology,34,3,7,5,0,recovered,1,3800,3


In [10]:
# list each column's dtype and non-null count to spot missing values
readmit_data.info()

<class 'pandas.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   patient_id           3000 non-null   int64
 1   age                  3000 non-null   int64
 2   gender               3000 non-null   str  
 3   chronic_condition    3000 non-null   int64
 4   admission_type       3000 non-null   str  
 5   department           3000 non-null   str  
 6   wait_time_min        3000 non-null   int64
 7   length_of_stay_days  3000 non-null   int64
 8   procedures_count     3000 non-null   int64
 9   medication_count     3000 non-null   int64
 10  complications        3000 non-null   int64
 11  discharge_status     3000 non-null   str  
 12  readmitted_30d       3000 non-null   int64
 13  total_cost_€         3000 non-null   int64
 14  satisfaction_score   3000 non-null   int64
dtypes: int64(11), str(4)
memory usage: 351.7 KB


In [13]:
# identify features and outcome
outcome = "readmitted_30d"

# patient_id is a row identifier (1, 2, 3, ... in the preview), not a
# measurement; left in X it would be selected as a numeric feature and scaled
id_columns = ["patient_id"]

y = readmit_data[outcome]
X = readmit_data.drop(columns=[outcome, *id_columns])

# identify variable types
# "number" matches every numeric dtype (int32, float32, ...), not only
# int64/float64, so a numeric column is not missed because of its width
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(include=['object', 'category', 'str']).columns

# ColumnTransformer drops any column it is not given (remainder="drop" is the
# default), so fail loudly if a feature landed in neither group
unassigned = set(X.columns) - set(numeric_features) - set(categorical_features)
assert not unassigned, f"columns with no preprocessing assigned: {sorted(unassigned)}"

In [None]:
# Numeric columns: fill missing values with the column median, then
# standardize (StandardScaler removes the mean and scales to unit variance).
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode; levels unseen during fit are encoded as all zeros
# (handle_unknown="ignore") instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own pipeline and concatenate the results.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])