## Notebook Objectives

1. Load and inspect the dataset
2. Understand schema and variable types
3. Identify missing values and inconsistencies
4. Clean and standardize columns
5. Handle outliers and invalid records
6. Save a cleaned dataset for downstream analysis

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

# Notebook-wide display preferences: show every column of wide frames and
# render floats with thousands separators and two decimal places.
display_options = {
    "display.max_columns": None,
    "display.float_format": "{:,.2f}".format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Location of the raw discharge extract; the path is relative, so it is
# resolved against the notebook's current working directory.
DATA_PATH: Path = Path("../data/raw/hospital_inpatient_discharges.csv")

# Load the whole CSV into memory using pandas' default parsing options.
df: pd.DataFrame = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the raw data (pandas default: 5 rows).
df.head()

In [None]:
# Dataset size as (row count, column count).
df.shape

In [None]:
# Column dtypes, non-null counts and memory usage of the data as loaded.
df.info()

In [None]:
# Normalise column labels: trim surrounding whitespace, lower-case them,
# then turn spaces and hyphens into underscores.
normalized_labels = df.columns.str.strip().str.lower()
for separator in (" ", "-"):
    normalized_labels = normalized_labels.str.replace(separator, "_")
df.columns = normalized_labels

In [None]:
# List the current column names to confirm the renaming took effect.
df.columns.tolist()

In [None]:
# Count missing values per column and list the columns with the most
# missing values first.
null_counts = df.isna().sum()
missing_summary = null_counts.sort_values(ascending=False)
missing_summary

In [None]:
# Share of missing values per column, as a percentage rounded to 2 decimals.
missing_pct = df.isna().mean().mul(100).round(2)

# Combine absolute counts and percentages into one table; the two Series
# are matched on their column-name index.
pd.concat(
    {"missing_count": missing_summary, "missing_percent": missing_pct},
    axis=1,
)

In [None]:
# Number of rows that repeat an earlier row across all columns.
df.duplicated().sum()

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each
# (pandas default).
df = df.drop_duplicates()

In [None]:
# Columns that must be numeric for the range checks and outlier logic
# applied later in this notebook.
numeric_columns = [
    "length_of_stay",
    "total_charges",
]

for col in numeric_columns:
    if col not in df.columns:
        # Stay best-effort (no exception), but make the skip visible.
        print(f"Column '{col}' not found; skipping numeric conversion.")
        continue

    converted = pd.to_numeric(df[col], errors="coerce")

    # errors="coerce" replaces every unparseable entry with NaN. Count the
    # entries that held a value before conversion but are NaN afterwards,
    # so the data loss is reported instead of passing silently.
    n_coerced = int((converted.isna() & df[col].notna()).sum())
    if n_coerced:
        print(f"{col}: {n_coerced:,} non-numeric value(s) coerced to NaN")

    df[col] = converted

In [None]:
# Summary statistics for the converted numeric columns.
# NOTE(review): unlike the conversion loop, this selection is not guarded;
# it raises KeyError if any name in numeric_columns is absent from df.
df[numeric_columns].describe()

In [None]:
# Distribution of length of stay. The call sits inside an `if` block, so it
# is not the cell's final top-level expression and a notebook would not
# display it automatically; print it explicitly instead.
if "length_of_stay" in df.columns:
    print(df["length_of_stay"].describe())

In [None]:
# Collect the records whose length of stay is zero or negative and show
# how many there are as (rows, columns).
non_positive_stay = df["length_of_stay"] <= 0
invalid_los = df.loc[non_positive_stay]
invalid_los.shape

In [None]:
# Keep only records with a strictly positive length of stay. The comparison
# evaluates to False for NaN, so rows with a missing length of stay are
# dropped as well.
# .copy() makes the result an independent frame, so the column assignments
# made in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[df["length_of_stay"] > 0].copy()

In [None]:
# Distribution of total charges. The call sits inside an `if` block, so it
# is not the cell's final top-level expression and a notebook would not
# display it automatically; print it explicitly instead.
if "total_charges" in df.columns:
    print(df["total_charges"].describe())

In [None]:
# Box plot of total charges to inspect the spread and the extreme values.
plt.figure(figsize=(8, 4))
sns.boxplot(x=df["total_charges"])
# The title's en dash had been mis-encoded (UTF-8 bytes read as cp1252);
# restored to the intended character.
plt.title("Total Charges – Outlier Inspection")
plt.show()

In [None]:
# Flag unusually high charges with the 1.5 x IQR rule: anything above
# Q3 + 1.5 * (Q3 - Q1) is marked as an outlier. Rows are flagged, not removed.
charges = df["total_charges"]
Q1, Q3 = charges.quantile(0.25), charges.quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR

df["high_charge_outlier"] = charges.gt(upper_bound)

# How many records fall on each side of the threshold.
df["high_charge_outlier"].value_counts()

In [None]:
def _strip_if_text(value):
    """Return *value* without surrounding whitespace if it is a string.

    Any other value (numbers, booleans, NaN/None) is returned unchanged.
    """
    return value.strip() if isinstance(value, str) else value


# Object-dtype columns hold the text fields, but may also contain
# non-string cells (missing values, numbers in mixed-type columns).
categorical_cols = df.select_dtypes(include="object").columns

for col in categorical_cols:
    # The .str accessor would replace non-string cells with NaN and raises
    # on object columns that contain no strings at all; mapping element-wise
    # trims the text and leaves every other value as it was.
    df[col] = df[col].map(_strip_if_text)

In [None]:
# Re-check column dtypes and non-null counts after the cleaning steps.
df.info()

In [None]:
# Summary statistics for every column (numeric and non-numeric), transposed
# so that each column of df becomes one row of the report.
df.describe(include="all").transpose()

In [None]:
# Destination of the cleaned dataset, relative to the working directory.
OUTPUT_PATH = Path("../data/processed/hospital_inpatient_discharges_cleaned.csv")

# to_csv does not create missing folders and fails if the target directory
# is absent (e.g. on a fresh checkout), so make sure it exists first.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written as a column.
df.to_csv(OUTPUT_PATH, index=False)

In [None]:
# Confirm where the cleaned file was written.
print(f"Cleaned dataset saved to: {OUTPUT_PATH}")

## Outputs of This Notebook

* Standardized column names
* Corrected numeric data types
* Removed records with invalid (non-positive) length-of-stay (LOS) values
* Flagged charge outliers
* Removed duplicates
* Saved cleaned dataset for analysis notebooks