# Diabetes Dataset — Exploratory Data Analysis (EDA)

This notebook explores the diabetes dataset in `data/raw/Healthcare-Diabetes.csv` and produces clean, reproducible analysis outputs.

**Project goals**
- Validate data quality (types, missingness, duplicates, outliers)
- Perform exploratory analysis and visualization
- Produce a cleaned dataset in `data/processed/`
- (Optional) Train a baseline model to set a performance benchmark

> Tip: Before publishing this repo publicly, confirm that the dataset's license and privacy terms allow it to be redistributed.


In [None]:
import pandas as pd
import numpy as np

from src.config import Paths
from src.data_load import load_csv, save_csv
from src.cleaning import treat_zeros_as_missing, impute_numeric_median
import src.viz as viz

# Build the project path configuration; later cells read data_raw,
# data_processed and reports_figures from this object.
paths = Paths()
# Full location of the raw CSV inside the directory exposed as paths.data_raw.
raw_path = paths.data_raw / "Healthcare-Diabetes.csv"
# NOTE(review): load_csv presumably returns a pandas DataFrame (later cells call
# .describe / .isna / .groupby on the result) — confirm in src.data_load.
df = load_csv(raw_path)

# Trailing top-level expression, so the notebook renders the result of head().
df.head()


In [None]:
# Quick structural overview: dimensions, per-column dtypes, then summary statistics
display(df.shape)
display(df.dtypes)

# include="all" asks describe() to summarise every column, not only numeric ones;
# the result is transposed so that each dataset column becomes one row.
describe_table = df.describe(include="all")
display(describe_table.transpose())


In [None]:
# Missingness checks: per-column NaN counts and per-column counts of literal zeros,
# both ordered from most to least affected
# (note: some columns may use 0 to mean missing)
missing = df.isnull().sum().sort_values(ascending=False)
zeros = df.eq(0).sum().sort_values(ascending=False)

# Show the NaN counts first, then the zero counts.
for counts in (missing, zeros):
    display(counts)


In [None]:
# Cleaning pipeline: treat 0 as missing for physiological measurements,
# then impute so the frame can be used for EDA/modeling
df_clean = treat_zeros_as_missing(df)
df_imputed = impute_numeric_median(df_clean)

# Sanity check: the ten columns with the highest remaining missing-value counts.
remaining_missing = df_imputed.isnull().sum()
remaining_missing.sort_values(ascending=False).iloc[:10]


In [None]:
# Save cleaned dataset
# The imputed frame is written under paths.data_processed using the fixed
# file name below.
processed_path = paths.data_processed / "diabetes_clean_imputed.csv"
# NOTE(review): index=False is forwarded to save_csv — presumably passed through
# to DataFrame.to_csv to omit the row index; confirm in src.data_load.
save_csv(df_imputed, processed_path, index=False)

# Trailing top-level expression, so the notebook echoes the output path.
processed_path


In [None]:
# Outcome distribution: plotted from the raw frame and saved as a figure,
# skipped entirely when the dataset has no "Outcome" column
has_outcome = "Outcome" in df.columns
if has_outcome:
    viz.plot_outcome_distribution(df, "Outcome")
    outcome_fig_path = paths.reports_figures / "outcome_distribution.png"
    viz.save_current_fig(outcome_fig_path)


In [None]:
# Correlation heatmap restricted to the numeric columns of the imputed frame
numeric_frame = df_imputed.select_dtypes(include=[np.number])
viz.plot_corr_heatmap(numeric_frame, title="Correlation heatmap (imputed)")

# Persist the figure that was just drawn.
heatmap_path = paths.reports_figures / "correlation_heatmap.png"
viz.save_current_fig(heatmap_path)


In [None]:
# Feature distributions (zeros treated as missing): one saved histogram per feature
for col in ("Glucose", "BMI", "Age", "Insulin", "BloodPressure", "SkinThickness"):
    if col not in df_clean.columns:
        continue  # feature not present in this dataset; nothing to plot

    hist_title = f"{col} distribution (zeros treated as missing)"
    viz.plot_hist(df_clean, col, bins=30, title=hist_title)

    # File name is the lower-cased column name plus a fixed suffix.
    hist_path = paths.reports_figures / f"{col.lower()}_hist.png"
    viz.save_current_fig(hist_path)


In [None]:
# Group comparisons by Outcome (if present)
# Builds a table of per-class feature means: one row per numeric feature,
# one column per Outcome value.
if "Outcome" in df_imputed.columns:
    # Every numeric column except the target itself.
    numeric_cols = [c for c in df_imputed.select_dtypes(include=[np.number]).columns if c != "Outcome"]
    # Rows are ordered by the mean within the class labelled 1.
    # NOTE(review): by=1 assumes Outcome contains the label 1 — confirm for other datasets.
    summary_by_outcome = df_imputed.groupby("Outcome")[numeric_cols].mean().T.sort_values(by=1, ascending=False)
    # A bare expression nested inside an `if` body is not echoed by the notebook
    # (only a top-level trailing expression is), so render the table explicitly,
    # as the overview and missingness cells already do.
    display(summary_by_outcome.head(15))


In [None]:
# Optional: baseline model (logistic regression)
from src.modeling import train_logistic_regression

# Train and report the baseline only when a target column is available.
outcome_available = "Outcome" in df_imputed.columns
if outcome_available:
    # Hand the trainer an independent copy of the imputed frame.
    model_df = df_imputed.copy(deep=True)
    baseline_result = train_logistic_regression(model_df, target_col="Outcome")
    # The helper's result unpacks into the fitted model, a report, and an AUC value.
    model, report, auc = baseline_result
    print(report)
    print("ROC AUC:", auc)
