### Jupyter-ready, fully reproducible pipeline that:

1. reads shen_demo.csv (Shenzhen metadata file) from the given path,

2. maps classes (Normal→0, TB→1),

3. compares the class counts against the expected values (Normal=326, TB=336) and prints a warning if they differ,

4. makes a stratified, deterministic 70/20/10 split per class (exact counts),

5. writes label_train.csv, label_valid.csv, label_test.csv (two columns, no header),

6. and validates no leakage plus prints per-split class counts.

The split is stratified per class with exact counts (using rounding for train/valid, remainder to test) for determinism and balance.

Output files are exactly two columns, no header: Filename,Label where Label ∈ {0 (normal), 1 (tuberculosis)}.

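Input problems (missing columns, unknown class tokens, missing or duplicate filenames) raise a ValueError with a clear message, and the leakage/coverage checks are assertions, so the cell stops as soon as anything is off. The only exception is a class-count mismatch, which prints a warning and continues with the counts actually found.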

In [None]:
from __future__ import annotations

from pathlib import Path
from typing import Tuple, Dict
import pandas as pd

# ---------------- Config ----------------
# Input metadata file; must provide the 'Filename' and 'Class' columns.
SOURCE_CSV = Path("/shen_demo.csv")
# Destination directory for the label files; created (with parents) when the
# cell runs, and left untouched if it already exists.
OUT_DIR    = Path("/dataset")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# One two-column, header-less label file per split.
TRAIN_CSV = OUT_DIR / "label_train.csv"
VALID_CSV = OUT_DIR / "label_valid.csv"
TEST_CSV  = OUT_DIR / "label_test.csv"

# Fixed random_state passed to every DataFrame.sample call so that repeated
# runs on the same input produce the same split.
SEED = 2025  # deterministic

# Class token -> integer label. Keys are matched against the 'Class' column
# after it has been stripped and lower-cased.
CLASS_MAP: Dict[str, int] = {
    "normal": 0,
    "tb": 1,
}

# Reference class counts; the run section compares the loaded data against
# them and only prints a warning on mismatch.
EXPECTED_NORMAL = 326
EXPECTED_TB = 336

# --------------- Helpers ---------------
def _load_and_map(source_csv: Path) -> pd.DataFrame:
    """Read the metadata CSV and return a two-column [Filename, Label] frame.

    The ``Class`` column is stripped and lower-cased, then translated to an
    integer label through ``CLASS_MAP``.

    Raises:
        ValueError: if ``Filename`` or ``Class`` is absent, if a class token is
            not a key of ``CLASS_MAP``, if any ``Filename`` is missing, or if
            any ``Filename`` occurs more than once.
    """
    raw = pd.read_csv(source_csv)

    # Both columns are required before anything else can be checked.
    if not {"Filename", "Class"}.issubset(raw.columns):
        raise ValueError("Input CSV must contain columns: 'Filename' and 'Class'.")

    # Normalise the class tokens, then reject anything CLASS_MAP cannot translate.
    tokens = raw["Class"].astype(str).str.strip().str.lower()
    recognised = tokens.isin(CLASS_MAP.keys())
    if (~recognised).any():
        bad = sorted(set(tokens.tolist()) - set(CLASS_MAP.keys()))
        raise ValueError(f"Found unknown class tokens: {bad}. Expected one of {list(CLASS_MAP)}")

    # assign() returns a new frame, so the freshly read data is not mutated.
    labelled = raw.assign(Label=tokens.map(CLASS_MAP).astype(int))

    # Filenames identify samples across splits: they must be present and unique.
    filenames = labelled["Filename"]
    if filenames.isna().any():
        raise ValueError("Found missing Filename values.")
    repeated = filenames.duplicated()
    if repeated.any():
        dups = filenames[repeated].tolist()[:10]
        raise ValueError(f"Found duplicate Filenames (first 10): {dups}")

    return labelled[["Filename", "Label"]]

def _compute_target_counts(n: int, ratios: Tuple[float, float, float]) -> Tuple[int, int, int]:
    """Return exact (train, valid, test) sizes for a class containing ``n`` rows.

    Train and valid sizes are ``round(ratio * n)``; test receives the
    remainder, so the three sizes always add up to ``n``.  If both rounded
    sizes were rounded up and together exceed ``n`` (for example ``n=3`` with
    ratios ``(0.5, 0.5, 0.0)``), the surplus row is taken back from whichever
    of train/valid was rounded up the most (valid on a tie) instead of failing.

    Args:
        n: Number of rows in the class; must be non-negative.
        ratios: ``(train, valid, test)`` fractions, each non-negative and
            summing to 1.0 (within 1e-8).

    Returns:
        ``(n_train, n_valid, n_test)``: non-negative ints that sum to ``n``.

    Raises:
        ValueError: if the ratios do not sum to 1.0, if ``n`` or any ratio is
            negative, or if a negative size would still result.
    """
    r_train, r_valid, r_test = ratios
    if abs(r_train + r_valid + r_test - 1.0) > 1e-8:
        raise ValueError("Ratios must sum to 1.0")
    # Reject bad inputs up front: a slightly negative ratio can round to 0 and
    # would otherwise slip through the size check at the end.
    if n < 0 or min(r_train, r_valid, r_test) < 0:
        raise ValueError(
            f"n and all ratios must be non-negative; got n={n}, ratios={tuple(ratios)}"
        )

    n_train = int(round(r_train * n))
    n_valid = int(round(r_valid * n))

    # Rounding drift: when both buckets round up, their sum can exceed n by one.
    # Give the surplus row back from the bucket that gained the most from
    # rounding; on a tie trim valid so the training set keeps the row.
    if n_train + n_valid > n:
        gain_train = n_train - r_train * n
        gain_valid = n_valid - r_valid * n
        if gain_train > gain_valid:
            n_train -= 1
        else:
            n_valid -= 1

    n_test = n - n_train - n_valid  # remainder

    # Defensive: should be unreachable after the validation and correction above.
    if n_train < 0 or n_valid < 0 or n_test < 0:
        raise ValueError(f"Negative split sizes computed: {(n_train, n_valid, n_test)} for n={n}")
    return n_train, n_valid, n_test

def _stratified_split_exact(df: pd.DataFrame, ratios=(0.7, 0.2, 0.1)) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split ``df`` into (train, valid, test) with exact per-class counts.

    Every ``Label`` group is shuffled with the fixed ``SEED`` and cut into three
    consecutive slices whose sizes come from ``_compute_target_counts``.  The
    per-class slices of each split are then concatenated and shuffled once
    more (same ``SEED``) so the classes are interleaved.

    Asserts that the three splits are pairwise disjoint by ``Filename`` and
    that together they contain every input ``Filename``.
    """
    def _merge_and_shuffle(pieces):
        stacked = pd.concat(pieces, axis=0)
        return stacked.sample(frac=1.0, random_state=SEED).reset_index(drop=True)

    per_split = ([], [], [])  # train, valid, test pieces, one entry per class

    for _label, members in df.groupby("Label", sort=True):
        shuffled = members.sample(frac=1.0, random_state=SEED).reset_index(drop=True)
        n_train, n_valid, n_test = _compute_target_counts(len(shuffled), ratios)

        # Consecutive slice boundaries: [0, a) train, [a, b) valid, [b, c) test.
        bounds = (0, n_train, n_train + n_valid, n_train + n_valid + n_test)
        for bucket, lo, hi in zip(per_split, bounds, bounds[1:]):
            bucket.append(shuffled.iloc[lo:hi])

    train_df, valid_df, test_df = (_merge_and_shuffle(pieces) for pieces in per_split)

    # leakage checks: no filename may appear in two splits
    s_tr = set(train_df["Filename"])
    s_va = set(valid_df["Filename"])
    s_te = set(test_df["Filename"])
    pair_checks = (
        (s_tr, s_va, "Leakage: train ∩ valid is non-empty"),
        (s_tr, s_te, "Leakage: train ∩ test is non-empty"),
        (s_va, s_te, "Leakage: valid ∩ test is non-empty"),
    )
    for left, right, message in pair_checks:
        assert left.isdisjoint(right), message

    # coverage check: nothing dropped, nothing invented
    assert set(df["Filename"]) == (s_tr | s_va | s_te), "Mismatch: union(train,valid,test) != all input Filenames"

    return train_df, valid_df, test_df

def _save_label_csv(df: pd.DataFrame, path: Path) -> None:
    """Write only the Filename and Label columns of ``df`` to ``path``.

    The file has no header row and no index column, i.e. one
    ``Filename,Label`` pair per line.
    """
    two_columns = df.loc[:, ["Filename", "Label"]]
    two_columns.to_csv(path, header=False, index=False)

def _print_counts_from_csv(path: Path) -> None:
    """Read a header-less Filename,Label file and print its per-class totals.

    Prints one line with the file name, the number of rows, and how many rows
    carry label 0 and label 1 (0 when a label does not occur).
    """
    rows = pd.read_csv(path, header=None, names=["Filename", "Label"])
    tally = rows["Label"].value_counts().sort_index()
    # .get with a default covers a split that lacks one of the classes.
    n0, n1 = (int(tally.get(label, 0)) for label in (0, 1))
    print(f"{path.name}: total={len(rows)} | normal(0)={n0} | tuberculosis(1)={n1}")

# run

# 1) Load the metadata and map Class -> Label (0 = normal, 1 = tuberculosis)
df_all = _load_and_map(SOURCE_CSV)

# 2) Report the actual class counts and compare them with the expected ones
n_total = len(df_all)
n_normal = int((df_all["Label"] == 0).sum())
n_tb = int((df_all["Label"] == 1).sum())
print(f"Total={n_total} | Normal={n_normal} | TB={n_tb}")

# A mismatch is reported but deliberately not fatal: the split below always
# works from the counts actually present in the file.
if n_normal == EXPECTED_NORMAL and n_tb == EXPECTED_TB:
    # Built from the constants so the message cannot drift from the check.
    print(f"✅ Class counts match expected: Normal={EXPECTED_NORMAL}, TB={EXPECTED_TB}.")
else:
    print(f"⚠️ Class counts differ from expected (Normal={EXPECTED_NORMAL}, TB={EXPECTED_TB}). "
          "Proceeding with computed counts above.")

# 3) Stratified exact split (70/20/10 per class)
train_df, valid_df, test_df = _stratified_split_exact(df_all, ratios=(0.7, 0.2, 0.1))

# 4) Save CSVs (two columns, no header)
_save_label_csv(train_df, TRAIN_CSV)
_save_label_csv(valid_df, VALID_CSV)
_save_label_csv(test_df,  TEST_CSV)

print(f"Saved:\n- {TRAIN_CSV}\n- {VALID_CSV}\n- {TEST_CSV}")

# 5) Final verification: read each file back and print per-split class counts
_print_counts_from_csv(TRAIN_CSV)
_print_counts_from_csv(VALID_CSV)
_print_counts_from_csv(TEST_CSV)

# 6) Extra leakage check on the written files (defensive): this re-validates
#    what is on disk rather than the in-memory frames checked during the split.
df_tr = pd.read_csv(TRAIN_CSV, header=None, names=["Filename","Label"])
df_va = pd.read_csv(VALID_CSV, header=None, names=["Filename","Label"])
df_te = pd.read_csv(TEST_CSV,  header=None, names=["Filename","Label"])

st, sv, se = set(df_tr["Filename"]), set(df_va["Filename"]), set(df_te["Filename"])
assert st.isdisjoint(sv), "Leakage after save: train ∩ valid non-empty"
assert st.isdisjoint(se), "Leakage after save: train ∩ test non-empty"
assert sv.isdisjoint(se), "Leakage after save: valid ∩ test non-empty"
print("✅ No filename leakage across train/valid/test.")


### Expected output

Total=662 | Normal=326 | TB=336

✅ Class counts match expected: Normal=326, TB=336.

Saved:
- /dataset/label_train.csv

- /dataset/label_valid.csv

- /dataset/label_test.csv

label_train.csv: total=463 | normal(0)=228 | tuberculosis(1)=235

label_valid.csv: total=132 | normal(0)=65 | tuberculosis(1)=67

label_test.csv: total=67 | normal(0)=33 | tuberculosis(1)=34

✅ No filename leakage across train/valid/test.


## END OF CODE