# 10 - Advanced Sample Size and Cluster Analysis

**Purpose**: This notebook performs a detailed sample size analysis, extending the calculations to account for clustered data. It calculates the required number of individual subjects (`N`) and the required number of clusters (`K`) to achieve a desired precision (95% CI half-width) for sensitivity and specificity.

**Inputs**:
- Pre-computed threshold data (`thresh_emcp`, `thresh_yt`, `thresh_emcp_yt`) from `early_markers.cribsy.common.thresholds`, which contains sensitivity and specificity values at various Z-score thresholds.

**Outputs**:
- `cribsy_sample_size.xlsx`: A comprehensive Excel workbook with multiple sheets detailing:
    - Sensitivity and Specificity at different Z-Thresholds.
    - Required sample size (N) for different CI half-widths.
    - Required number of clusters (K) for different ICC values and mean subjects per cluster.

### 10.1 Setup, Calculation Functions, and Data Generation

This cell contains the complete logic for the sample size and cluster analysis...


In [None]:
%reload_ext autoreload
%autoreload 2

from pathlib import Path
import math
from scipy.stats import norm


from polars import DataFrame
import polars as pl
from xlsxwriter import Workbook

from early_markers.cribsy.common.thresholds import (
    thresh_emcp,
    thresh_emcp_yt,
    thresh_yt,
)


# Base data directory and the sub-directory the Excel report is written to.
ROOT_DIR = Path("/Volumes/secure/data/early_markers/cribsy")
XLSX_DIR = ROOT_DIR / "xlsx"

# Disabled settings, kept for reference; nothing below reads them.
# TEST_PCT = 0.25
# METRIC_RANGE = [0.75, 0.8, 0.85, 0.9, 1.0]
# Grids swept by the sample-size and cluster-size loops below:
#   HW_RANGE      - target CI half-widths
#   THRESH_RANGE  - Z-score thresholds looked up in the threshold tables
#   ICC_RANGE     - values passed as `icc` to k_for_proportion
#   CLUSTER_RANGE - values passed as `mean_n_per_k` to k_for_proportion
HW_RANGE = [0.025, 0.05, 0.075, 0.10, 0.125, 0.15, 0.175, 0.20, 0.225]
THRESH_RANGE = [-1.6, -1.4, -1.2, -1.0, -0.8, -0.6, -0.4]
ICC_RANGE = [0.8, 0.85, 0.9, 0.95, 0.99]
CLUSTER_RANGE = [1.0, 1.25, 1.5, 1.75, 2.0]

# Model key -> imported threshold data for that model.
THRESH_MAP = {
    "emcp": thresh_emcp,
    "yt": thresh_yt,
    "emcp_yt": thresh_emcp_yt,
}

# Internal key -> human-readable label used in worksheet names and headings.
LABEL_MAP = {
    "emcp": "EMCP",
    "yt": "Youtube",
    "emcp_yt": "EMCP & Youtube",
    "sensitivity": "Sensitivity",
    "specificity": "Specificity",
}

# Internal column name -> display header for the "Z-Thresholds" worksheet.
# The commented-out entries are columns that are not exported.
COL_MAP_THRESH = {
    "threshold": "Z-Threshold",
    "sensitivity": "Sensitivity",
    "sens_ci": "Se 95% CI",
    # "sens_ci_lb": "Se 95% CI LB",
    # "sens_ci_ub": "Se 95% CI UB",
    "sens_ci_half_width": "Se ½-Width",
    "specificity": "Specificity",
    "spec_ci": "Sp 95% CI",
    # "spec_ci_lb": "Sp 95% CI LB",
    # "spec_ci_ub": "Sp 95% CI UB",
    "spec_ci_half_width": "Sp ½-Width",
}
# Disabled per-metric column maps, kept for reference; nothing below reads them.
#
# COL_MAP_SENS = {
#     "sensitivity": "Sensitivity",
#     "sens_ci_lb": "Se 95% CI LB",
#     "sens_ci_ub": "Se 95% CI UB",
#     "sens_ci_half_width": "Se 95% CI ½-Width",
#     "n": "N",
# }
#
# COL_MAP_SPEC = {
#     "specificity": "Specificity",
#     "spec_ci_lb": "Sp 95% CI LB",
#     "spec_ci_ub": "Sp 95% CI UB",
#     "spec_ci_half_width": "Sp 95% CI ½-Width",
#     "n": "N",
# }

# Internal column name -> display header for the per-model sample-size ("N") tables.
COL_MAP_SAMPLE_SIZE = {
    'threshold': "Z-Threshold",
    'half_width': "95% CI Half_Width",
    'sensitivity': "Sensitivity",
    "sens_ci": "Se 95% CI",
    'specificity': "Specificity",
    "spec_ci": "Sp 95% CI",
    'n_sens': "N Sensitivity",
    'n_spec': "N Specificity",
    'n_train': "N Train",
    'n_test': "N Test",
    'n_study': "N Study",
}

# Internal column name -> display header for the per-model cluster-size ("K") tables.
COL_MAP_CLUSTER_SIZE = {
    'threshold': "Z-Threshold",
    'half_width': "95% CI Half_Width",
    "mean_n_per_k": "Mean N per K",
    "icc": "ICC",
    'sensitivity': "Sensitivity",
    "sens_ci": "Se 95% CI",
    'specificity': "Specificity",
    "spec_ci": "Sp 95% CI",
    'k_sens': "K Sensitivity",
    'k_spec': "K Specificity",
    'k_train': "K Train",
    'k_test': "K Test",
    'k_study': "K Study",
}

def _ci_half_width(estimate: str, lower: str, upper: str) -> pl.Expr:
    """Return the larger of the two one-sided CI widths around ``estimate``.

    The interval need not be symmetric, so the half-width is taken as the
    maximum of (estimate - lower bound) and (upper bound - estimate).
    """
    centre = pl.col(estimate)
    return pl.max_horizontal(centre - pl.col(lower), pl.col(upper) - centre)


# Columns kept for each model's threshold table, in output order.
_THRESH_COLUMNS = [
    "threshold",
    "sensitivity",
    "sens_ci",
    "sens_ci_half_width",
    "specificity",
    "spec_ci",
    "spec_ci_half_width",
]

# Model key -> threshold table augmented with sensitivity/specificity CI half-widths.
DF_THRESH_MAP = {
    label: DataFrame(thresh)
    .with_columns(
        sens_ci_half_width=_ci_half_width("sensitivity", "sens_ci_lb", "sens_ci_ub"),
        spec_ci_half_width=_ci_half_width("specificity", "spec_ci_lb", "spec_ci_ub"),
    )
    .select(_THRESH_COLUMNS)
    for label, thresh in THRESH_MAP.items()
}


def n_for_proportion(proportion: float, half_width: float, confidence_level: float) -> int:
    """Return the sample size needed to estimate a proportion to a given precision.

    Applies the normal-approximation formula
    ``n = z**2 * p * (1 - p) / half_width**2``, where ``z`` is the two-sided
    standard-normal critical value for ``confidence_level``.

    Args:
        proportion: Anticipated proportion ``p``.
        half_width: Desired half-width of the confidence interval.
        confidence_level: Two-sided confidence level, e.g. ``0.95``.

    Returns:
        The computed ``n`` rounded up to the next whole number.
    """
    # Two-sided critical value: upper (1 - alpha/2) quantile of N(0, 1).
    z = norm.ppf(1 - (1 - confidence_level) / 2)
    numerator = z**2 * proportion * (1 - proportion)
    return math.ceil(numerator / (half_width**2))

def k_for_proportion(proportion: float, half_width: float, confidence_level: float, icc: float, mean_n_per_k: float) -> int:
    """Return the number of clusters needed to estimate a proportion under clustering.

    The simple-random-sample size ``z**2 * p * (1 - p) / half_width**2`` is
    inflated by the design effect ``1 + (mean_n_per_k - 1) * icc`` to give the
    total number of subjects required, and that total is then divided by the
    mean number of subjects per cluster to give the number of clusters.

    Args:
        proportion: Anticipated proportion ``p`` (e.g. sensitivity or specificity).
        half_width: Desired half-width of the confidence interval.
        confidence_level: Two-sided confidence level, e.g. ``0.95``.
        icc: Intracluster correlation coefficient.
        mean_n_per_k: Mean number of subjects per cluster.

    Returns:
        The required number of clusters, rounded up to a whole number.
    """
    # Two-sided critical value: upper (1 - alpha/2) quantile of N(0, 1).
    z = norm.ppf(1 - (1 - confidence_level) / 2)
    # Binomial variance of the proportion, inflated for within-cluster correlation.
    design_effect = 1 + (mean_n_per_k - 1) * icc
    adjusted_variance = proportion * (1 - proportion) * design_effect
    # Total subjects required once clustering is accounted for.
    num_subjects = (z**2 * adjusted_variance) / (half_width**2)
    # Convert subjects to clusters; without this division the result is a
    # subject count and overstates K whenever mean_n_per_k > 1.
    return math.ceil(num_subjects / mean_n_per_k)

# Per-model, per-metric sample sizes for every (threshold, half-width) pair.
# SAMPLE_MAP[model][metric] is a list of row dicts; DF_METRIC_SAMPLE_MAP holds
# the same rows as DataFrames keyed by "<model>_<metric>".
SAMPLE_MAP = {}
for model, df_thresh in DF_THRESH_MAP.items():
    SAMPLE_MAP[model] = {}
    for metric in ["sensitivity", "specificity"]:
        # Named `metric_rows` (not `map`) so the builtin `map` is not rebound
        # in the notebook's global namespace.
        metric_rows = SAMPLE_MAP[model][metric] = []
        for thresh in THRESH_RANGE:
            # The estimate depends only on the threshold, so look it up once
            # per threshold rather than once per half-width.
            val = df_thresh.filter(pl.col("threshold") == thresh).select(metric).item()
            for half_width in HW_RANGE:
                train = n_for_proportion(val, half_width, 0.95)
                # The test set is sized equal to the training set; the study
                # total is their sum.
                test = train
                study = train + test
                metric_rows.append(
                    {
                        "model": model,
                        "metric": metric,
                        "z": thresh,
                        "estimate": val,
                        "half_width": half_width,
                        # Target interval, clipped to the valid [0, 1] range.
                        "lb": max(0.0, val - half_width),
                        "ub": min(1.0, val + half_width),
                        "n_train": train,
                        "n_test": test,
                        "n_study": study,
                    }
                )

DF_METRIC_SAMPLE_MAP = {
    f"{model}_{metric}": DataFrame(rows)
    for model, metric_map in SAMPLE_MAP.items()
    for metric, rows in metric_map.items()
}

# Per-model table combining sensitivity and specificity requirements: for each
# (threshold, half-width) pair the training size is the larger of the two
# per-metric sizes, so both metrics reach the target precision.
DF_SAMPLE_SIZE_MAP = {}
for model in THRESH_MAP:
    df_model_sens = DF_METRIC_SAMPLE_MAP[f"{model}_sensitivity"]
    df_model_spec = DF_METRIC_SAMPLE_MAP[f"{model}_specificity"]
    data = []
    for z in THRESH_RANGE:
        for half_width in HW_RANGE:
            # Each filter must match exactly one row; the single-element
            # unpacking raises ValueError otherwise.
            (sens,) = df_model_sens.filter(pl.col("z") == z, pl.col("half_width") == half_width).to_dicts()
            (spec,) = df_model_spec.filter(pl.col("z") == z, pl.col("half_width") == half_width).to_dicts()
            train = max(sens["n_train"], spec["n_train"])
            # The test set is sized equal to the training set.
            test = train
            study = train + test
            data.append(
                {
                    "threshold": z,
                    "half_width": half_width,
                    "sensitivity": sens["estimate"],
                    # Single-quoted keys inside the f-string keep this valid on
                    # Python versions before 3.12.
                    "sens_ci": f"[{sens['lb']:.3f}, {sens['ub']:.3f}]",
                    "specificity": spec["estimate"],
                    "spec_ci": f"[{spec['lb']:.3f}, {spec['ub']:.3f}]",
                    "n_sens": sens["n_train"],
                    "n_spec": spec["n_train"],
                    "n_train": train,
                    "n_test": test,
                    "n_study": study,
                }
            )
    DF_SAMPLE_SIZE_MAP[model] = DataFrame(data).sort("threshold", "half_width", descending=[True, False]).rename(COL_MAP_SAMPLE_SIZE)

# Per-model, per-metric cluster counts for every combination of threshold,
# half-width, mean subjects per cluster and ICC. CLUSTER_MAP[model][metric] is
# a list of row dicts; DF_METRIC_CLUSTER_MAP holds the same rows as DataFrames
# keyed by "<model>_<metric>".
CLUSTER_MAP = {}
for model, df_thresh in DF_THRESH_MAP.items():
    CLUSTER_MAP[model] = {}
    for metric in ["sensitivity", "specificity"]:
        # Named `metric_rows` (not `map`) so the builtin `map` is not rebound
        # in the notebook's global namespace.
        metric_rows = CLUSTER_MAP[model][metric] = []
        for thresh in THRESH_RANGE:
            # The estimate depends only on the threshold, so look it up once
            # per threshold instead of inside the three inner loops.
            val = df_thresh.filter(pl.col("threshold") == thresh).select(metric).item()
            for half_width in HW_RANGE:
                for cluster in CLUSTER_RANGE:
                    for icc in ICC_RANGE:
                        train = k_for_proportion(val, half_width, 0.95, icc, cluster)
                        # The test set is sized equal to the training set; the
                        # study total is their sum.
                        test = train
                        study = train + test
                        metric_rows.append(
                            {
                                "model": model,
                                "metric": metric,
                                "z": thresh,
                                "mean_n_per_k": cluster,
                                "icc": icc,
                                "estimate": val,
                                "half_width": half_width,
                                # Target interval, clipped to the valid [0, 1] range.
                                "lb": max(0.0, val - half_width),
                                "ub": min(1.0, val + half_width),
                                "k_train": train,
                                "k_test": test,
                                "k_study": study,
                            }
                        )

DF_METRIC_CLUSTER_MAP = {
    f"{model}_{metric}": DataFrame(rows)
    for model, metric_map in CLUSTER_MAP.items()
    for metric, rows in metric_map.items()
}

# Per-model table combining sensitivity and specificity cluster requirements:
# for each (threshold, mean N per K, ICC, half-width) combination the training
# cluster count is the larger of the two per-metric counts.
DF_CLUSTER_SIZE_MAP = {}
for model in THRESH_MAP:
    df_model_sens = DF_METRIC_CLUSTER_MAP[f"{model}_sensitivity"]
    df_model_spec = DF_METRIC_CLUSTER_MAP[f"{model}_specificity"]
    data = []
    for z in THRESH_RANGE:
        for mean_n_per_k in CLUSTER_RANGE:
            for icc in ICC_RANGE:
                for half_width in HW_RANGE:
                    # Each filter must match exactly one row; the
                    # single-element unpacking raises ValueError otherwise.
                    (sens,) = df_model_sens.filter(pl.col("z") == z, pl.col("mean_n_per_k") == mean_n_per_k, pl.col("icc") == icc, pl.col("half_width") == half_width).to_dicts()
                    (spec,) = df_model_spec.filter(pl.col("z") == z, pl.col("mean_n_per_k") == mean_n_per_k, pl.col("icc") == icc, pl.col("half_width") == half_width).to_dicts()
                    train = max(sens["k_train"], spec["k_train"])
                    # The test set is sized equal to the training set.
                    test = train
                    study = train + test
                    data.append(
                        {
                            "threshold": z,
                            "mean_n_per_k": sens["mean_n_per_k"],
                            "icc": sens["icc"],
                            "half_width": half_width,
                            "sensitivity": sens["estimate"],
                            # Single-quoted keys inside the f-string keep this
                            # valid on Python versions before 3.12.
                            "sens_ci": f"[{sens['lb']:.3f}, {sens['ub']:.3f}]",
                            "specificity": spec["estimate"],
                            "spec_ci": f"[{spec['lb']:.3f}, {spec['ub']:.3f}]",
                            "k_sens": sens["k_train"],
                            "k_spec": spec["k_train"],
                            "k_train": train,
                            "k_test": test,
                            "k_study": study,
                        }
                    )
    DF_CLUSTER_SIZE_MAP[model] = DataFrame(data).sort("threshold", "mean_n_per_k", "icc", "half_width", descending=[True, False, False, False]).rename(COL_MAP_CLUSTER_SIZE)


def set_workbook_formats(wb: Workbook) -> dict:
    """Register the report's shared cell formats on ``wb`` and return them by name.

    Args:
        wb: The xlsxwriter workbook the formats are added to.

    Returns:
        A dict with keys ``"bold"``, ``"heading"``, ``"header"``, ``"code"``,
        ``"desc"``, ``"pct"`` and ``"blue_pct"``, each mapped to the format
        object created by ``wb.add_format``.
    """
    dark_blue, mid_blue, light_blue = "#4F81BD", "#95B3D7", "#DCE6F1"

    # Name -> format properties. Formats are created in this order.
    specs = {
        "bold": {"bold": True},
        "heading": {"bold": True, "font_size": 13, "align": "left"},
        "header": {"bold": True, "font_color": "white", "align": "center", "bg_color": dark_blue},
        "code": {"bold": True, "bg_color": mid_blue},
        "desc": {"bg_color": light_blue},
        "pct": {"num_format": "0.00%"},
        "blue_pct": {"num_format": "0.00%", "bg_color": light_blue},
    }
    return {name: wb.add_format(props) for name, props in specs.items()}


### 10.2 Generate Excel Report

This final cell takes all the generated DataFrames and writes them into a multi-sheet Excel workbook...


In [None]:
# Write every result table into one workbook: a combined "Z-Thresholds" sheet,
# then one "N - <model>" and one "K - <model>" sheet per model.
wb = Workbook(XLSX_DIR / "cribsy_sample_size.xlsx")
formats = set_workbook_formats(wb)

TABLE_STYLE = "Table Style Medium 9"


def _bold_columns(*names: str) -> dict:
    """Return a fresh ``column_formats`` mapping that bolds each named column."""
    return {name: {"bold": True} for name in names}


def _sens_spec_scale() -> dict:
    """Return a fresh 3-colour-scale rule spanning Sensitivity and Specificity."""
    return {("Sensitivity", "Specificity"): {"type": "3_color_scale"}}


# --- Sheet 1: threshold tables for all models, stacked vertically ---
ws = wb.add_worksheet("Z-Thresholds")
row, col = 0, 0
for key, df in DF_THRESH_MAP.items():
    title = f"Sensitivity & Specificity With 95% CI for {LABEL_MAP[key]}"
    ws.merge_range(row, col, row, col + 4, title, formats["heading"])
    df.rename(COL_MAP_THRESH).write_excel(
        wb,
        ws,
        position=(row + 1, col),
        table_style=TABLE_STYLE,
        autofit=True,
        column_formats=_bold_columns("Z-Threshold", "Sensitivity", "Specificity"),
        conditional_formats=_sens_spec_scale(),
    )
    # Heading row + table height + spacing before the next model's heading.
    row += df.height + 4

ws.set_column(0, 6, 15)

# --- One sheet per model: required sample size (N) ---
for model, df in DF_SAMPLE_SIZE_MAP.items():
    ws = wb.add_worksheet(f"N - {LABEL_MAP[model]}")
    row, col = 0, 0
    title = f"Sample Size Estimates by Z-Threshold and 95% CI Half-Width [{LABEL_MAP[model]}]"
    ws.merge_range(row, col, row, col + 4, title, formats["heading"])
    n_rules = _sens_spec_scale()
    # Colour the study total with explicit min/mid/max colours.
    n_rules["N Study"] = {
        "type": "3_color_scale",
        "max_color": "#f8696b",
        "mid_color": "#ffeb83",
        "min_color": "#63be7b",
    }
    df.write_excel(
        wb,
        ws,
        position=(row + 1, col),
        table_style=TABLE_STYLE,
        autofit=True,
        column_formats=_bold_columns("Z-Threshold", "Sensitivity", "Specificity", "N Study"),
        conditional_formats=n_rules,
    )
    ws.set_column(0, 0, 15)

# --- One sheet per model: required number of clusters (K) ---
for model, df in DF_CLUSTER_SIZE_MAP.items():
    ws = wb.add_worksheet(f"K - {LABEL_MAP[model]}")
    row, col = 0, 0
    title = f"Clustered Sample Size Estimates by Z-Threshold, Mean N/K, ICC and 95% CI Half-Width [{LABEL_MAP[model]}]"
    ws.merge_range(row, col, row, col + 5, title, formats["heading"])
    df.write_excel(
        wb,
        ws,
        position=(row + 1, col),
        table_style=TABLE_STYLE,
        autofit=True,
        column_formats=_bold_columns("Z-Threshold", "Sensitivity", "Specificity"),
        conditional_formats=_sens_spec_scale(),
    )
    ws.set_column(0, 0, 15)
    # Keep the heading and table header rows visible while scrolling.
    ws.freeze_panes(2, 0)

wb.close()