## Objectives
This notebook explores clinical and laboratory characteristics of patients
with and without blood glucose measurements.

Key goals:
- Compare distributions of key numeric variables
- Identify statistically significant cohort differences
- Explore correlations among metabolic and hematologic markers
- Establish whether glucose-tested patients are clinically distinct from untested patients

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import mannwhitneyu

# Show every column when a DataFrame is displayed (None removes pandas'
# column-count limit, so wide tables are not truncated with "...").
pd.set_option("display.max_columns", None)

In [None]:
# Load the baseline table and split it into the two cohorts that the rest of
# the notebook compares, keyed on the has_glucose_measurement flag.
DATA_PATH = "../data/processed/clean_baseline.csv"
df = pd.read_csv(DATA_PATH)

# Evaluate the flag column once and reuse it for both masks.
glucose_flag = df["has_glucose_measurement"]
with_glucose = df.loc[glucose_flag.eq(1)]
without_glucose = df.loc[glucose_flag.eq(0)]

# Cohort sizes (row counts).
print(f"With glucose: {len(with_glucose)}")
print(f"Without glucose: {len(without_glucose)}")

In [None]:
# Numeric columns to analyse: everything with a numeric dtype except the
# cohort flag itself and the per-row missingness counter.
exclude_cols = ["has_glucose_measurement", "missing_count_per_row"]

numeric_cols = (
    df.select_dtypes(include=np.number)
    .columns
    # errors="ignore": an excluded name that is absent (or non-numeric) is fine.
    .drop(exclude_cols, errors="ignore")
    .tolist()
)

len(numeric_cols)

In [None]:
# Overlay the kernel density estimate of each variable for the two cohorts,
# one figure per variable. Only the first 10 numeric columns are drawn to
# keep the notebook output readable.
cohorts = (
    ("With Glucose", with_glucose),
    ("Without Glucose", without_glucose),
)

for col in numeric_cols[:10]:
    fig, ax = plt.subplots(figsize=(7, 4))
    for cohort_label, cohort in cohorts:
        sns.kdeplot(cohort[col], label=cohort_label, fill=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.legend()
    fig.tight_layout()
    plt.show()

In [None]:
# Compare every numeric variable between the two cohorts with a two-sided
# Mann-Whitney U test and collect the cohort medians plus the p-value.

# Both cohorts must have MORE than this many non-missing values for a
# variable to be tested.
MIN_GROUP_SIZE = 30

RESULT_COLUMNS = [
    "variable",
    "with_glucose_median",
    "without_glucose_median",
    "p_value",
]

results = []

for col in numeric_cols:
    x = with_glucose[col].dropna()
    y = without_glucose[col].dropna()

    # Skip variables that are too sparsely measured in either cohort.
    if len(x) <= MIN_GROUP_SIZE or len(y) <= MIN_GROUP_SIZE:
        continue

    p = mannwhitneyu(x, y, alternative="two-sided").pvalue
    results.append({
        "variable": col,
        "with_glucose_median": x.median(),
        "without_glucose_median": y.median(),
        "p_value": p,
    })

# Explicit columns keep the frame well-formed when no variable passed the
# size filter; without them sort_values("p_value") raises KeyError on an
# empty frame.
stats_df = pd.DataFrame(results, columns=RESULT_COLUMNS).sort_values("p_value")

# One test is run per variable, so raw p-values overstate significance.
# Add Benjamini-Hochberg (FDR) adjusted p-values:
#   adjusted_(i) = min over j >= i of  p_(j) * m / j,  capped at 1
# where p_(1) <= ... <= p_(m). stats_df is already sorted ascending by
# p_value, so the row position (1-based) is the rank.
n_tests = len(stats_df)
bh_scaled = (
    stats_df["p_value"].to_numpy(dtype=float)
    * n_tests
    / np.arange(1, n_tests + 1)
)
# Running minimum from the largest p-value downwards enforces monotonicity.
stats_df["p_value_bh"] = np.minimum(
    np.minimum.accumulate(bh_scaled[::-1])[::-1],
    1.0,
)

stats_df.head(10)

In [None]:
# Raw difference in cohort medians for each variable (with - without), in the
# variable's own units. Positive means the with-glucose cohort's median is
# higher.
median_gap = stats_df["with_glucose_median"].sub(
    stats_df["without_glucose_median"]
)
stats_df["median_diff"] = median_gap

# Ten variables with the largest positive difference.
largest_first = stats_df.sort_values(by="median_diff", ascending=False)
largest_first.iloc[:10]

In [None]:
# Restrict the correlation analysis to variables that are observed for more
# than 90% of the with-glucose cohort.
coverage = with_glucose[numeric_cols].notna().mean()

corr_cols = [col for col in numeric_cols if coverage[col] > 0.9]

# Pairwise correlations (pandas default method) within the with-glucose cohort.
corr_matrix = with_glucose.loc[:, corr_cols].corr()

In [None]:
# Heatmap of the correlation matrix; the diverging palette is centred on 0 so
# positive and negative correlations get opposite colours.
fig, ax = plt.subplots(figsize=(12, 10))
heatmap_style = {"cmap": "coolwarm", "center": 0, "square": True}
sns.heatmap(corr_matrix, ax=ax, **heatmap_style)
ax.set_title("Correlation Matrix – With Glucose Cohort")
fig.tight_layout()
plt.show()

In [None]:
# Columns in the correlation set whose name contains "glu". A separate
# "glucose" check is unnecessary: any name containing "glucose" also
# contains "glu".
# NOTE(review): the match is case-sensitive and purely by substring, so it
# would also select names such as "glutamine" if present - confirm against
# the actual column names.
glucose_related = [col for col in corr_cols if "glu" in col]

glucose_related

In [None]:
# Rank all variables by their correlation with the first glucose-related
# column (descending) and show the top 10. The first row is that column
# itself, since its correlation with itself is 1.
glucose_corr = corr_matrix[glucose_related]

if glucose_related:
    glucose_corr = glucose_corr.sort_values(
        by=glucose_related[0],
        ascending=False,
    )
else:
    # Without this guard glucose_related[0] raises IndexError when no
    # glucose-related column is present in the correlation set.
    print("No glucose-related columns in the correlation set; nothing to rank.")

glucose_corr.head(10)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Two-component PCA of the with-glucose cohort on the high-coverage columns.
# Rows with any missing value among those columns are dropped first.
pca_data = with_glucose.loc[:, corr_cols].dropna(axis=0, how="any")

# Standardise each column before PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(pca_data)

pca = PCA(n_components=2)
components = pca.fit_transform(X_scaled)

# NOTE: the frame gets a fresh 0..n-1 index, not pca_data's original index.
pca_df = pd.DataFrame({"PC1": components[:, 0], "PC2": components[:, 1]})

In [None]:
# Scatter of the first two principal components; points are semi-transparent
# so dense regions remain visible.
fig, ax = plt.subplots(figsize=(7, 6))
ax.scatter(pca_df["PC1"], pca_df["PC2"], alpha=0.4)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("PCA Projection – With Glucose Cohort")
plt.show()