## Objectives
This notebook investigates missing data patterns with a focus on
blood glucose measurements.

Key goals:
- Quantify and visualize missingness
- Determine whether missingness is random or systematic
- Define clinically meaningful cohorts:
  - Patients WITH glucose measurements
  - Patients WITHOUT glucose measurements
- Establish analytical implications for modeling and bias

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno
from scipy.stats import mannwhitneyu

# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Relative path of the CSV that this notebook analyses.
DATA_PATH = "../data/processed/clean_baseline.csv"
# Read the file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(DATA_PATH)

# Cell output: (number of rows, number of columns).
df.shape

In [None]:
# Per-column missingness: absolute number of nulls plus their share of all
# rows (in percent), ordered from the most to the least incomplete column.
missing_summary = pd.DataFrame({"missing_count": df.isna().sum()})
missing_summary["missing_pct"] = missing_summary["missing_count"] / len(df) * 100
missing_summary = missing_summary.sort_values(by="missing_pct", ascending=False)

# Cell output: the 15 columns with the highest missing percentage.
missing_summary.iloc[:15]

In [None]:
# Bar chart with one bar per variable; bar length is the percentage of rows
# in which that variable is missing (order follows missing_summary).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=missing_summary["missing_pct"],
    y=missing_summary.index,
    color="steelblue",
    ax=ax,
)
ax.set_xlabel("Percentage Missing")
ax.set_ylabel("Variable")
ax.set_title("Missingness Percentage by Variable")
fig.tight_layout()
plt.show()

In [None]:
# missingno nullity matrix for the whole frame; the title is attached to
# whichever axes are current once msno.matrix returns.
msno.matrix(df, figsize=(12, 6))
plt.gca().set_title("Missingness Matrix")
plt.show()

In [None]:
# missingno heatmap of how missingness co-occurs between variables; the
# title is attached to whichever axes are current once msno.heatmap returns.
msno.heatmap(df, figsize=(10, 6))
plt.gca().set_title("Correlation of Missingness Between Variables")
plt.show()

In [None]:
# Columns whose name mentions glucose. The match is case-insensitive so that
# spellings such as "Glucose" or "GLU_mean" are found as well. Testing for
# "glu" alone is sufficient: every name containing "glucose" also contains it.
# NOTE(review): "glu" can also match unrelated names (e.g. "glutamine") —
# inspect the displayed list before relying on it.
glucose_columns = [
    col for col in df.columns
    if "glu" in str(col).lower()
]

glucose_columns

In [None]:
# Number of rows per value of the glucose flag. dropna=False also lists rows
# whose flag is missing; such rows match neither an `== 1` nor an `== 0`
# filter, so they would otherwise disappear from the cohort totals unnoticed.
df["has_glucose_measurement"].value_counts(dropna=False)

In [None]:
# Share of rows per value of the glucose flag. dropna=False keeps rows with a
# missing flag in the denominator, so the proportions refer to all rows of df
# rather than only to rows where the flag is present.
df["has_glucose_measurement"].value_counts(normalize=True, dropna=False)

In [None]:
# Split df on the glucose flag: rows where it equals 1 form `with_glucose`,
# rows where it equals 0 form `without_glucose`.
with_glucose = df.loc[df["has_glucose_measurement"].eq(1)]
without_glucose = df.loc[df["has_glucose_measurement"].eq(0)]

# Report the size of each cohort.
print(f"With glucose: {len(with_glucose)} records")
print(f"Without glucose: {len(without_glucose)} records")

In [None]:
# Column-wise share of missing values (in percent) within each cohort,
# placed side by side for comparison.
cohort_missingness = pd.concat(
    {
        "with_glucose_missing_pct": with_glucose.isna().mean() * 100,
        "without_glucose_missing_pct": without_glucose.isna().mean() * 100,
    },
    axis=1,
)

# Cell output: the 15 columns that are most incomplete in the with-glucose cohort.
cohort_missingness.sort_values(
    by="with_glucose_missing_pct", ascending=False
).iloc[:15]

In [None]:
# Number of missing fields in each row, stored on df as a new column.
df["missing_count_per_row"] = df.isna().sum(axis="columns")

# Box plot of the per-row missing count, one box per value of the glucose flag.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(
    data=df,
    x="has_glucose_measurement",
    y="missing_count_per_row",
    ax=ax,
)
ax.set_xlabel("Has Glucose Measurement")
ax.set_ylabel("Number of Missing Values")
ax.set_title("Missing Value Burden by Cohort")
plt.show()

In [None]:
# Two-sided Mann-Whitney U test: does the per-row missing count differ between
# rows with and without a glucose measurement?
#
# The counts are read from df rather than from with_glucose / without_glucose.
# Those cohort frames were sliced out of df before "missing_count_per_row" was
# added to it, so they do not carry the column and indexing them with that
# name raises KeyError on a fresh top-to-bottom run.
glucose_flag = df["has_glucose_measurement"]

stat, p_value = mannwhitneyu(
    df.loc[glucose_flag == 1, "missing_count_per_row"],
    df.loc[glucose_flag == 0, "missing_count_per_row"],
    alternative="two-sided",
)

# Cell output: the p-value of the test.
p_value