In [24]:
# 1. Imports
import pandas as pd
import numpy as np

In [25]:
# 2. Load raw data
df = pd.read_csv("../data/raw/hospital_readmissions.csv")
df.replace("?", np.nan, inplace=True)  # Clean up missing placeholders

print("Initial shape:", df.shape)

Initial shape: (101766, 50)


In [26]:
# 3. Drop irrelevant columns (deduced from eda)
columns_to_drop = [
    'encounter_id', 'patient_nbr', 'weight', 'payer_code', 'medical_specialty',
    'examide', 'citoglipton',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone'
]

df.drop(columns=columns_to_drop, axis=1, inplace=True)
print("After dropping columns:", df.shape)

After dropping columns: (101766, 40)


In [27]:
# 4. Handle missing values

# Fill missing race with "Unknown"
df.fillna({'race': 'Unknown'}, inplace=True)

# Drop invalid gender entries (if any)
df = df[df['gender'] != 'Unknown']

# Fill missing diagnoses with "Unknown"
df.fillna({'diag_1': 'Unknown', 'diag_2': 'Unknown', 'diag_3': 'Unknown'}, inplace=True)

In [28]:
# 5. Encode target (readmission) variable
df['readmitted_binary'] = df['readmitted'].apply(lambda x: 1 if x == '<30' else 0)
df.drop(columns=['readmitted'], axis=1, inplace=True)

In [29]:
# 6. Encode categorical features

# Diabetes med usage
df['diabetesMed'] = df['diabetesMed'].map({'Yes': 1, 'No': 0})
df['change'] = df['change'].map({'Ch': 1, 'No': 0})

# A1C and glucose serum
df.fillna({'max_glu_serum': "None", 'AlCresult': "None"}, inplace=True)

# Age buckets → ordinal scale
age_order = {
    '[0-10)': 0, '[10-20)': 1, '[20-30)': 2, '[30-40)': 3, '[40-50)': 4,
    '[50-60)': 5, '[60-70)': 6, '[70-80)': 7, '[80-90)': 8, '[90-100)': 9
}
df['age'] = df['age'].map(age_order)

# Confirm it worked
print(df['age'].unique())

# Encode insulin/metformin: ordinal
ordinal_map = {'No': 0, 'Down': 1, 'Steady': 2, 'Up': 3}
df['insulin'] = df['insulin'].map(ordinal_map)
df['metformin'] = df['metformin'].map(ordinal_map)

[0 1 2 3 4 5 6 7 8 9]


In [30]:
# 7. One-hot encode the following:

categorical_cols = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
                    'max_glu_serum', 'A1Cresult']

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [31]:
df.columns

Index(['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'change', 'diabetesMed', 'readmitted_binary', 'race_Asian',
       'race_Caucasian', 'race_Hispanic', 'race_Other', 'race_Unknown',
       'gender_Male', 'gender_Unknown/Invalid', 'admission_type_id_2',
       'admission_type_id_3', 'admission_type_id_4', 'admission_type_id_5',
       'admission_type_id_6', 'admission_type_id_7', 'admission_type_id_8',
       'discharge_disposition_id_2', 'discharge_disposition_id_3',
       'discharge_disposition_id_4', 'discharge

In [32]:
df.to_csv("../data/processed/cleaned_data.csv", index=False)
print("✅ Cleaned data saved to data/processed/cleaned_data.csv")

✅ Cleaned data saved to data/processed/cleaned_data.csv


In [33]:
df.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,...,admission_source_id_14,admission_source_id_17,admission_source_id_20,admission_source_id_22,admission_source_id_25,max_glu_serum_>300,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>8,A1Cresult_Norm
0,0,1,41,0,1,0,0,0,250.83,Unknown,...,False,False,False,False,False,False,True,False,False,False
1,1,3,59,0,18,0,0,0,276.0,250.01,...,False,False,False,False,False,False,True,False,False,False
2,2,2,11,5,13,2,0,1,648.0,250,...,False,False,False,False,False,False,True,False,False,False
3,3,2,44,1,16,0,0,0,8.0,250.43,...,False,False,False,False,False,False,True,False,False,False
4,4,1,51,0,8,0,0,0,197.0,157,...,False,False,False,False,False,False,True,False,False,False
