## Objectives
This notebook performs the initial inspection and cleaning of the
*Dataset With / Without Blood Glucose Values*.

Key goals:
- Understand dataset structure and variable types
- Quantify missingness without imputing prematurely
- Identify data quality issues (duplicates, invalid values)
- Preserve clinically meaningful missingness
- Output a clean baseline dataset for downstream analysis

In [None]:
import pandas as pd
import numpy as np

# Notebook-wide pandas display settings: show every column when a frame is
# rendered, and format floats with three decimal places.
display_settings = {
    "display.max_columns": None,
    "display.float_format": "{:.3f}".format,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [None]:
# Raw input file; the relative path is resolved against the current working
# directory of the notebook kernel.
DATA_PATH = "../data/raw/Attachment_1_Test_Data_with_Blood_Glucose_Values.csv"

# Load the CSV with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [None]:
# Unpack the frame's dimensions once, then report them.
n_rows, n_cols = df.shape
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Print the column names, non-null counts and dtypes as parsed by read_csv.
df.info()

In [None]:
# Normalise column labels to lower-case, underscore-separated names:
#   1. trim surrounding whitespace
#   2. lower-case
#   3. turn each space into an underscore
#   4. drop every remaining character that is not a word character
# `regex` is passed explicitly on both replacements because the default of
# `str.replace` changed from True to False in pandas 2.0; spelling it out
# keeps the result (and the absence of warnings) the same on either version.
df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(" ", "_", regex=False)
      # `\W` matches exactly what `[^\w_]` did: `\w` already includes "_".
      .str.replace(r"\W", "", regex=True)
)

In [None]:
# Mark every row that exactly repeats an earlier row, then count the marks.
is_repeat = df.duplicated()
duplicate_rows = is_repeat.sum()
print(f"Duplicate rows: {duplicate_rows}")

In [None]:
# Keep the first occurrence of each repeated row; when nothing repeats the
# frame is left untouched (no copy is made).
df = df.drop_duplicates() if duplicate_rows > 0 else df

In [None]:
# Per-column missingness: absolute count of NaN cells, and that count as a
# percentage of the number of rows. Nothing is imputed here.
missing_summary = df.isna().sum().to_frame("missing_count")
missing_summary["missing_pct"] = missing_summary["missing_count"] / len(df) * 100

# Most incomplete columns first.
missing_summary = missing_summary.sort_values("missing_pct", ascending=False)

missing_summary

In [None]:
# Columns holding glucose readings, selected by name.
#
# "glu" is a substring of "glucose", so one substring test covers both
# spellings the original condition checked.
# NOTE(review): "glu" may also match unrelated column names — inspect the
# list displayed below before relying on it.
#
# The derived flag column "has_glucose_measurement" itself contains
# "glucose". It is excluded so that re-running this cell after the flag has
# been created does not feed the flag back into its own computation (it is
# never NaN, which would mark every row as measured).
glucose_columns = [
    col for col in df.columns
    if "glu" in col and col != "has_glucose_measurement"
]

glucose_columns

In [None]:
# Binary indicator: 1 when at least one glucose column holds a value on the
# row, 0 otherwise. Expressed as "not every glucose column is missing".
# The underlying NaNs are left in place; only the indicator is added.
all_glucose_missing = df[glucose_columns].isna().all(axis=1)
df["has_glucose_measurement"] = (~all_glucose_missing).astype(int)

In [None]:
# Fraction of rows for each value of the glucose indicator (normalize=True
# returns proportions instead of raw counts).
df["has_glucose_measurement"].value_counts(normalize=True)

In [None]:
# Every column pandas parsed with a numeric dtype.
numeric_cols = df.select_dtypes(include=np.number).columns

# Map each numeric column to its number of negative entries, keeping only
# the columns that have at least one. NaN compares as False and is not counted.
invalid_counts = {}
for numeric_name in numeric_cols:
    n_negative = (df[numeric_name] < 0).sum()
    if n_negative > 0:
        invalid_counts[numeric_name] = n_negative

invalid_counts

In [None]:
# Re-parse each numeric column; anything that cannot be parsed becomes NaN.
# NOTE(review): numeric_cols was selected by numeric dtype, so these columns
# are already numeric — confirm whether text columns were the intended target.
for numeric_name in numeric_cols:
    coerced = pd.to_numeric(df[numeric_name], errors="coerce")
    df[numeric_name] = coerced

In [None]:
import os

# Destination of the cleaned baseline, relative to the working directory.
OUTPUT_PATH = "../data/processed/clean_baseline.csv"

# `to_csv` does not create missing folders and raises OSError when the target
# directory is absent (e.g. on a fresh checkout), so make sure it exists.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# index=False: the row index is not written to the file.
df.to_csv(OUTPUT_PATH, index=False)