In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.data import normalize_column_names

# 1. Load and Inspect the Data

### Load the data

In [None]:
# Read the raw CSV and pass it straight through the project helper that
# normalizes column names (its exact renaming rules live in src.data).
df = pd.read_csv("../data/raw/diabetic_data.csv").pipe(normalize_column_names)

### Basic info

In [None]:
# Dimensions of the loaded frame, printed as a (rows, columns) tuple.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

In [None]:
# First rows, flipped so each column of the wide frame becomes a row.
df.head().T

In [None]:
# Summary of column dtypes and non-null counts for the loaded frame.
df.info()

In [None]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so each original column is one row of the result.
df.describe(include="all").T

# 2. Missing Values Analysis

### Missing value counts

In [None]:
# Per-column null tally and its share of all rows, restricted to columns that
# actually contain nulls and ordered from most to least affected.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)
missing_df = (
    pd.concat(
        [missing.rename("Missing Count"), missing_pct.rename("Missing %")],
        axis=1,
    )
    .loc[lambda tbl: tbl["Missing Count"] > 0]
    .sort_values("Missing %", ascending=False)
)

missing_df

# 3. Target Variable Distribution

In [None]:
# Binary target: 1 where `readmitted` is exactly "<30", 0 for every other
# value (missing entries compare unequal and therefore also map to 0).
# Vectorized comparison instead of a per-row Python lambda.
df["readmitted_30_days"] = df["readmitted"].eq("<30").astype("int64")

### Class balance

In [None]:
# Bar chart of the two target classes, titled through the Axes that seaborn
# returns; the cell's displayed value is the class proportions.
target_ax = sns.countplot(data=df, x="readmitted_30_days")
target_ax.set_title("Readmitted Within 30 Days (Target Variable)")
df["readmitted_30_days"].value_counts(normalize=True)

# 4. Demographics

### Age bins

In [None]:
# Frequency of each age label, ordered by label rather than by count.
age_counts = df["age"].value_counts().sort_index()
age_counts.plot.bar(title="Age Distribution")

### Gender

In [None]:
# Frequency of each gender value; missing entries get their own bar.
gender_counts = df["gender"].value_counts(dropna=False)
gender_counts.plot.bar(title="Gender Distribution")

### Race

In [None]:
# Frequency of each race value; missing entries get their own bar.
(
    df["race"]
    .value_counts(dropna=False)
    .plot.bar(title="Race Distribution")
)

# 5. Admission & Discharge Details

### Admission type

In [None]:
# Frequency of each admission type code, most common first.
admission_type_counts = df["admission_type_id"].value_counts()
admission_type_counts.plot.bar(title="Admission Type")

### Discharge disposition

In [None]:
# Keep only the ten most frequent discharge disposition codes
# (value_counts already sorts by descending frequency).
top_dispositions = df["discharge_disposition_id"].value_counts().iloc[:10]
top_dispositions.plot.bar(title="Top 10 Discharge Dispositions")

### Admission source

In [None]:
# Ten most frequent admission source codes, in descending frequency.
(
    df["admission_source_id"]
    .value_counts()
    .iloc[:10]
    .plot.bar(title="Top 10 Admission Sources")
)

# 6. Hospitalization History

### Time in hospital

In [None]:
# Histogram of time_in_hospital. Every other figure in this notebook carries a
# title, so one is added here to let the plot stand alone when skimmed.
# NOTE(review): bins=14 presumably matches the number of distinct values in
# this column - confirm against the data.
stay_ax = sns.histplot(df["time_in_hospital"], kde=False, bins=14)
stay_ax.set_title("Time in Hospital");  # trailing ';' hides the Text repr

### Number of diagnoses

In [None]:
# Histogram of number_diagnoses, titled for consistency with the other figures.
# NOTE(review): if this column is integer-valued with more than 15 distinct
# values, a fixed bins=15 merges neighbouring counts - consider discrete=True.
diagnoses_ax = sns.histplot(df["number_diagnoses"], bins=15)
diagnoses_ax.set_title("Number of Diagnoses");  # trailing ';' hides the Text repr

### Prior utilization

In [None]:
# One histogram panel per prior-utilization counter.
util_cols = [
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
]
df.hist(column=util_cols, bins=20, figsize=(12, 6))

# 7. Clinical Features

### Number of lab procedures, medications, procedures

In [None]:
# One histogram panel per clinical volume counter.
clinical_cols = [
    "num_lab_procedures",
    "num_medications",
    "num_procedures",
]
df.loc[:, clinical_cols].hist(figsize=(12, 6), bins=20)

### Glucose and A1C results

In [None]:
# Count every level of max_glu_serum, including missing entries, matching the
# dropna=False used for gender and race above. value_counts() drops NaN by
# default, so a "not measured" level would otherwise vanish from the table.
# NOTE(review): pandas' default NA parsing in read_csv treats the literal
# string "None" as NaN - confirm whether the raw file uses that token here.
df["max_glu_serum"].value_counts(dropna=False)

In [None]:
# Count every level of a1cresult, including missing entries, matching the
# dropna=False used for gender and race above. value_counts() drops NaN by
# default, so a "not measured" level would otherwise vanish from the table.
# NOTE(review): pandas' default NA parsing in read_csv treats the literal
# string "None" as NaN - confirm whether the raw file uses that token here.
df["a1cresult"].value_counts(dropna=False)

# 8. Medications

In [None]:
# Medication columns summarised below (names as they appear in the frame).
med_cols = [
    "metformin",
    "repaglinide",
    "nateglinide",
    "chlorpropamide",
    "glimepiride",
    "acetohexamide",
    "glipizide",
    "glyburide",
    "tolbutamide",
    "pioglitazone",
    "rosiglitazone",
    "acarbose",
    "miglitol",
    "troglitazone",
    "tolazamide",
    "examide",
    "citoglipton",
    "insulin",
    "glyburide-metformin",
    "glipizide-metformin",
    "glimepiride-pioglitazone",
    "metformin-rosiglitazone",
    "metformin-pioglitazone",
]
# One row per medication, one column per observed value, cells = row counts.
# A value that never occurs for a medication comes back as NaN, which also
# upcasts every count to float; fill those gaps with 0 and restore integers so
# the table reads as plain counts.
df[med_cols].apply(pd.Series.value_counts).fillna(0).astype(int).transpose()

# 9. Missing Data & Unknowns

In [None]:
# Percentage of rows holding the "?" placeholder, for each column that
# contains at least one. Only "?" is counted here; real nulls are covered by
# the missing-value table in section 2.
# The mask is built once and reused, and the per-column figure lives in its
# own name so the `missing_pct` Series from section 2 is not overwritten by a
# float.
is_question_mark = df.isin(["?"])
missing_cols = df.columns[is_question_mark.any()]
for col in missing_cols:
    question_mark_pct = is_question_mark[col].mean() * 100
    print(f"{col}: {question_mark_pct:.2f}% '?' values")

# 10. Correlation & Feature Interactions

### Correlation matrix

In [None]:
# Pairwise correlations across all numeric columns, drawn on a fixed
# [-1, 1] colour scale without per-cell annotations.
# NOTE(review): numeric identifier/code columns are included as-is; their
# correlations may not be meaningful - confirm which columns should be kept.
numeric_cols = df.select_dtypes("number")
corr = numeric_cols.corr()

sns.heatmap(data=corr, vmin=-1, vmax=1, cmap="coolwarm", annot=False)