In [1]:
import pandas as pd

admissions = pd.read_csv("hospital_admissions.csv")

In [2]:
admissions.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [3]:
admissions = admissions.copy()

# standardize columns
admissions.columns = admissions.columns.str.lower().str.strip().str.replace(" ", "_")

# drop PII & noise
admissions = admissions.drop(
    columns=[
        "name", "doctor", "insurance_provider",
        "medication", "test_results", "room_number"
    ],
    errors="ignore"
)

# clean text columns
text_cols = ["medical_condition", "admission_type", "hospital"]

admissions[text_cols] = admissions[text_cols].apply(
    lambda col: col.str.lower().str.strip()
)

# convert dates
admissions["date_of_admission"] = pd.to_datetime(admissions["date_of_admission"], errors="coerce")
admissions["discharge_date"] = pd.to_datetime(admissions["discharge_date"], errors="coerce")

# length of stay
admissions["length_of_stay"] = (
    admissions["discharge_date"] - admissions["date_of_admission"]
).dt.days

# remove invalid rows
admissions = admissions[admissions["length_of_stay"] >= 0]

# handle missing
admissions = admissions.dropna(
    subset=["medical_condition", "admission_type", "date_of_admission", "discharge_date"]
)


In [4]:
admissions.head()

Unnamed: 0,age,gender,blood_type,medical_condition,date_of_admission,hospital,billing_amount,admission_type,discharge_date,length_of_stay
0,30,Male,B-,cancer,2024-01-31,sons and miller,18856.281306,urgent,2024-02-02,2
1,62,Male,A+,obesity,2019-08-20,kim inc,33643.327287,emergency,2019-08-26,6
2,76,Female,A-,obesity,2022-09-22,cook plc,27955.096079,emergency,2022-10-07,15
3,28,Female,O+,diabetes,2020-11-18,"hernandez rogers and vang,",37909.78241,elective,2020-12-18,30
4,43,Female,AB+,cancer,2022-09-19,white-white,14238.317814,urgent,2022-10-09,20


In [5]:
from pathlib import Path

Path("cleaned_data").mkdir(parents=True, exist_ok=True)


In [6]:
admissions.to_csv("cleaned_data/hospital_admissions_clean.csv", index=False)
