In [1]:
import pandas as pd
import numpy as np

In [2]:
# Display setting: passing None removes the cap on how many columns are shown.
pd.set_option("display.max_columns", None)

# Read the raw CSV (relative path) into the working frame `df`.
raw_csv_path = "../data/post-operative-data.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Normalise the decision labels and keep only the 'A' and 'S' rows.
decision_col = 'decision ADM-DECS'

# Strip surrounding whitespace so padded labels compare equal to the bare ones.
df[decision_col] = df[decision_col].str.strip()

# Rows with any other decision label (or a missing one) are dropped.
keep_rows = df[decision_col].isin(['A', 'S'])
df = df[keep_rows]


In [4]:
# Give the target column a short name, then normalise every column name:
# lower-case, hyphens replaced by underscores, surrounding whitespace removed.
df = df.rename(columns={"decision ADM-DECS": "decision"})

lowercased_names = df.columns.str.lower()
underscored_names = lowercased_names.str.replace("-", "_")
df.columns = underscored_names.str.strip()

In [5]:
# Comfort cleaning: '?' is turned into NaN, the column is cast to float,
# and the resulting gaps are filled with the median of the remaining values.
comfort_as_float = df['comfort'].replace('?', np.nan).astype(float)
df['comfort'] = comfort_as_float.fillna(comfort_as_float.median())

In [6]:
# Encode the target as integers: 'A' -> 1, 'S' -> 0.
# Note: this cell is not idempotent. Running it a second time would map the
# already-encoded 1/0 values to NaN because they are not keys of the mapping.
decision_codes = {'A': 1, 'S': 0}
df['decision'] = df['decision'].map(decision_codes)

In [7]:
# Integer encodings for the categorical feature columns.
# Any label that is not a key of its mapping becomes NaN after .map().

# NOTE(review): 'low' and 'high' share code 1 here, so this mapping is a
# mid-vs-not-mid flag rather than an ordinal low < mid < high scale.
# Presumably intentional (normal vs. abnormal reading) — TODO confirm.
risk_map = {'mid': 0, 'low': 1, 'high': 1}
stability_map = {'stable': 0, 'mod-stable': 1, 'unstable': 2}
o2_map = {'excellent': 0, 'good': 1}

# One table describing which mapping applies to which column.
column_encodings = {
    'l_core': risk_map,
    'l_surf': risk_map,
    'l_bp': risk_map,
    'l_o2': o2_map,
    # surf_stbl uses its own two-level mapping (no 'mod-stable' level).
    'surf_stbl': {'stable': 0, 'unstable': 1},
    'core_stbl': stability_map,
    'bp_stbl': stability_map,
}

for col, encoding in column_encodings.items():
    df[col] = df[col].map(encoding)

In [8]:
# Final check: per-column count of missing values after cleaning and encoding.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Series.map() yields NaN for any label that is not a key of its mapping, so a
# non-zero count here means a category was left unencoded (or an imputation
# step was skipped). Stop here instead of letting the save cell write
# incomplete data.
columns_with_missing = missing_per_column[missing_per_column > 0].index.tolist()
assert not columns_with_missing, (
    "Missing values remain after encoding in columns: {}".format(columns_with_missing)
)

l_core       0
l_surf       0
l_o2         0
l_bp         0
surf_stbl    0
core_stbl    0
bp_stbl      0
comfort      0
decision     0
dtype: int64


In [9]:
# Write the cleaned, encoded frame to CSV; the row index is not written.
cleaned_csv_path = "../data/cleaned_encoded_data.csv"
df.to_csv(cleaned_csv_path, index=False)