In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# === 1. Load Dataset ===


In [None]:
# Read the exported CSV, discard the three columns that are not carried
# forward, and store participant identifiers as strings.
df = (
    pd.read_csv("to_prepareation_phase.csv")
    .drop(columns=['AdmissionTime', 'AgeGroup', 'BMI'])
    .assign(Participant_Id=lambda frame: frame['Participant_Id'].astype(str))
)

# === 2. Handle Vital Sign Outliers and Missing Values ===


In [None]:
# Every column whose name begins with one of these prefixes is treated as a
# vital-sign measurement.
vital_prefixes = ['HeartRate_', 'PulseRate_', 'BreathingRate_', 'SpO2_', 'SkinTemp_', 'SysBp_', 'DiasBp_', 'MapBp_']
vital_cols = [name for name in df.columns if name.startswith(tuple(vital_prefixes))]

# Tukey fences, column by column: any reading more than 1.5 * IQR outside the
# quartiles is blanked to NaN (the row itself is kept) so that it is
# re-estimated by the imputer below.
for col in vital_cols:
    readings = df[col]
    q1 = readings.quantile(0.25)
    q3 = readings.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    low_cut = q1 - whisker
    high_cut = q3 + whisker
    is_outlier = (readings < low_cut) | (readings > high_cut)
    df[col] = readings.mask(is_outlier, np.nan)

# Fill every missing vital-sign value (original gaps and blanked outliers)
# using a 5-nearest-neighbour imputer fitted on the vital-sign columns only.
knn_imputer = KNNImputer(n_neighbors=5)
imputed_vitals = knn_imputer.fit_transform(df[vital_cols])
df[vital_cols] = imputed_vitals


# === 3. One-Hot Encoding for Categorical Columns ===


In [None]:
# One-hot encode the sex and surgery labels. dtype='int' produces 0/1
# indicator columns, matching the integer encoding requested by the other
# get_dummies call in this notebook (the pandas default would be bool).
df = pd.get_dummies(df, columns=['SexLabel', 'SurgeryLabel'], dtype='int')


# === 4. Derive Age/BMI Categories, Label ASA Score, and One-Hot Encode ===


In [None]:
# Descriptive labels for the numeric ASA score codes.
asa_map = {
    1.0 : 'Healthy_patient',
    2.0 : 'Mild_systemic_disease',
    3.0 : 'Severe_systemic_disease',
    4.0 : 'Constant_life_threat',
    5.0 : 'Moribund_patient'
}

# Age bands are left-closed (right=False): [0, 40), [40, 55), [55, 70),
# [70, 85) and [85, inf). The last edge is open-ended so that ages of 100 and
# above still land in '85+' instead of silently becoming NaN.
age_bins = [0, 40, 55, 70, 85, np.inf]
age_labels = ['<40', '40-55', '55-70', '70-85', '85+']
df['Age_Group'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

# NOTE(review): `categorize_bmi` is neither defined nor imported in the visible
# part of this notebook, and the 'BMI' column is dropped when the dataset is
# loaded. Confirm where the helper comes from and which columns it reads;
# on a fresh kernel this line raises NameError unless it is defined first.
df['BMI_category'] = df.apply(categorize_bmi, axis=1)

# replace() swaps the known codes for their labels and leaves any value that
# is not a key of asa_map unchanged.
df['ASA_Score'] = df['ASA_Score'].replace(asa_map).astype('category')

# One-hot encode the categorical features as 0/1 integers. SexLabel and
# SurgeryLabel are already expanded by the earlier get_dummies call, so only
# the columns that still exist are encoded; requesting a column that is no
# longer present would raise KeyError.
categorical_cols = ['SexLabel', 'SurgeryLabel', 'ASA_Score', 'BMI_category', 'Age_Group']
pending_cols = [name for name in categorical_cols if name in df.columns]
df = pd.get_dummies(df, columns=pending_cols, dtype='int')


# === 5. Drop Columns Not Used in Modeling ===


In [None]:
# Columns left out of the modelling table, each with the author's stated reason:
#   Outcome      - already used to define ICU_Admission
#   ICU_Type     - redundant with ICU_Admission
#   ICU_Label    - same as ICU_Admission
#   Sex          - already encoded via SexLabel
#   SurgeryType  - already encoded via SurgeryLabel
drop_cols = ['Outcome', 'ICU_Type', 'ICU_Label', 'Sex', 'SurgeryType']

# Raises KeyError if any listed column is absent from the frame.
df = df.drop(labels=drop_cols, axis='columns')

# === 6. Save Prepared Data ===


In [None]:
# Write the prepared table to disk without the index column, then confirm.
# The leading character of the message is the check-mark emoji; it was stored
# as mis-decoded UTF-8 bytes and has been restored.
df.to_csv("prepared_clinical_dataset.csv", index=False)
print("✅ Data preparation completed and saved to 'prepared_clinical_dataset.csv'")