In [1]:
# 第三问：检测策略优化代码（基于第二问的 NLME 结果）
# 要求：已存在变量 df（DataFrame）、fit（MixedLM 拟合结果）、bmi_cuts（分组界点）
# 输出：每个 BMI 组的最优策略（初测周、是否重测及重测周）、对应成功概率与期望成本
# 并把结果导出为 CSV： BMI_group_strategy.csv

import numpy as np
import pandas as pd
from scipy.stats import norm
from tqdm import tqdm

# -----------------------------
# 参数与准备（如需可以调整）
# -----------------------------
threshold = 0.04         # Y浓度判定阈值（题目给定）
GA_min_allowed = 10.0    # 最早检测周
GA_max_allowed = 25.0    # 最晚检测周
grid_step = 0.1          # GA 网格步长（越小计算越精细但更慢）
retest_delay_options = [1.0, 2.0]  # 若支持重测，重测在初测后经过的周数（可尝试多个）
# 成本权重（可以根据临床/竞赛题设修改）
c_test = 1.0     # 一次检测的直接成本（基准）
c_retest = 1.5   # 重测的额外成本（相对较高）
c_fail = 50.0    # 检测最终失败的“惩罚成本”（较大，促使提高成功率）

# 拟合模型参数（来自 fit）
beta = fit.params  # pandas Series（含 const, GA_num, 孕妇BMI 等）
# 如果你的模型列名不同，需要对应调整 exog 名称顺序
# 计算随机效应方差与残差方差
sigma_b2 = fit.cov_re.iloc[0,0] if fit.cov_re.shape[0] > 0 else 0.0
sigma_eps2 = fit.scale

# 用于概率计算的总方差（若未观测到个体 bi，可把 bi 的不确定性也一起考虑）
sigma_total = np.sqrt(sigma_b2 + sigma_eps2)

# -----------------------------
# 概率函数：给定 GA、BMI（及可选协变量），计算 P(Y >= threshold)
# 集成随机效应（bi）不确定性：假设 bi ~ N(0, sigma_b2)
# 若要对个体已估随机效应使用单点bi, 可传入 bi_est 参数
# -----------------------------
def success_prob_ga(bga, bmi, extra_cov=None, bi_est=None):
    """
    bga: 检测时点（周数，float）
    bmi: 孕妇 BMI 值（float）
    extra_cov: dict，包含其他协变量名->值（例如年龄、身高等），若模型没有这些系数可忽略
    bi_est: 若已知个体随机效应估计值，则传入，函数将只以 eps 的方差计算概率
    返回：P(Y >= threshold)
    """
    # 构造线性预测 mu = beta0 + beta_GA * GA + beta_BMI * BMI + sum(beta_other * cov)
    mu = 0.0
    # 常数项
    if 'const' in beta.index:
        mu += beta['const']
    # GA 列名称可能为 'GA_num' 或 'GA'，优先尝试 'GA_num'
    if 'GA_num' in beta.index:
        mu += beta['GA_num'] * bga
    elif 'GA' in beta.index:
        mu += beta['GA'] * bga
    else:
        raise ValueError("模型中未找到 GA 的系数名称，请确认 fit.params 中 GA 列名。")
    # BMI 列
    if '孕妇BMI' in beta.index:
        mu += beta['孕妇BMI'] * bmi
    elif 'BMI' in beta.index:
        mu += beta['BMI'] * bmi
    # 其他协变量（如果模型里存在）
    if extra_cov is not None:
        for k,v in extra_cov.items():
            if k in beta.index:
                mu += beta[k] * v
    # 若给定 bi_est，则只用残差 eps 的不确定性
    if bi_est is not None:
        mu += bi_est
        sigma = np.sqrt(sigma_eps2)
        z = (threshold - mu) / sigma
        return 1.0 - norm.cdf(z)
    # 否则同时对 bi 和 eps 积分（相当于 mu ~ N(mu_fixed, sigma_b2)）
    sigma = np.sqrt(sigma_b2 + sigma_eps2)
    z = (threshold - mu) / sigma
    return 1.0 - norm.cdf(z)

# -----------------------------
# 分组：依据 bmi_cuts 生成组区间与标签
# -----------------------------
k = len(bmi_cuts) - 1
group_labels = []
for i in range(k):
    group_labels.append(f"Group{i+1}")

# 以组为单位做优化（也可以选择对每个个体做优化并汇总）
results = []
GA_grid = np.arange(GA_min_allowed, GA_max_allowed + 1e-6, grid_step)

for i in range(k):
    lower = bmi_cuts[i]
    upper = bmi_cuts[i+1]
    mask = (df['孕妇BMI'] >= lower) & (df['孕妇BMI'] < upper)
    df_grp = df.loc[mask].copy()
    n_grp = len(df_grp)
    if n_grp == 0:
        # 空组跳过
        results.append({
            'BMI组': group_labels[i],
            'BMI下限': lower, 'BMI上限': upper,
            '样本量': 0,
            'best_strategy': None
        })
        continue

    # 我们对组内个体的期望概率采用对 BMI 组内所有个体平均（即对每个候选 GA 计算组平均成功概率）
    # 两种策略比较：单次测试（single）与 两阶段测试（initial + retest）
    best = {'cost': np.inf, 'strategy': None}

    # 预compute BMI and optional extra covariates for each individual in group
    bmi_vals = df_grp['孕妇BMI'].values
    # 如果你想把年龄、身高等也纳入，请确保 fit.params 中有对应系数并将这些列包含进 extra_covs
    extra_cov_names = [c for c in ['年龄','身高','体重'] if c in df_grp.columns and c in beta.index]

    # 单次测试枚举：选择 GA 使组内平均期望成本最小（一次检测的期望成本 = c_test + c_fail * P(failure)）
    for ga in GA_grid:
        # 组内每个人的成功概率（综合 bi 与 eps）
        probs = np.array([success_prob_ga(ga, b, extra_cov={k: row[k] for k in extra_cov_names} if extra_cov_names else None)
                          for b, row in zip(bmi_vals, df_grp.to_dict('records'))])
        # 单次测试：若成功则成本为 c_test；若失败则成本为 c_test + c_fail (也可以把补救成本考虑进去)
        # 期望成本 = c_test + P(failure) * c_fail
        p_success_grp = probs.mean()
        p_failure_grp = 1.0 - p_success_grp
        expected_cost_single = c_test + p_failure_grp * c_fail

        if expected_cost_single < best['cost']:
            best['cost'] = expected_cost_single
            best['strategy'] = {
                'type': 'single',
                'initial_GA': ga,
                'retest_after': None,
                'p_success': p_success_grp,
                'expected_cost': expected_cost_single
            }

    # 两阶段策略枚举（初测 ga0, 若失败则在 ga0 + d 重测一次，d 来自 retest_delay_options）
    for ga0 in GA_grid:
        for d in retest_delay_options:
            ga1 = ga0 + d
            if ga1 > GA_max_allowed:
                continue
            # 计算每个个体在 ga0 成功概率 p0，失败概率 q0 = 1-p0
            p0 = np.array([success_prob_ga(ga0, b, extra_cov={k: row[k] for k in extra_cov_names} if extra_cov_names else None)
                           for b,row in zip(bmi_vals, df_grp.to_dict('records'))])
            p1 = np.array([success_prob_ga(ga1, b, extra_cov={k: row[k] for k in extra_cov_names} if extra_cov_names else None)
                           for b,row in zip(bmi_vals, df_grp.to_dict('records'))])
            # 对个体来说，最终成功概率 = p0 + (1-p0)*p1
            p_success_each = p0 + (1-p0)*p1
            p_success_grp = p_success_each.mean()
            p_retest_grp = (1 - p0).mean()   # 平均需要重测的比例
            # 期望成本 = c_test（初测） + p_retest*c_retest + (1 - p_success)*c_fail
            expected_cost_two = c_test + p_retest_grp * c_retest + (1 - p_success_grp) * c_fail

            if expected_cost_two < best['cost']:
                best['cost'] = expected_cost_two
                best['strategy'] = {
                    'type': 'two_stage',
                    'initial_GA': ga0,
                    'retest_after': d,
                    'retest_GA': ga1,
                    'p_success': p_success_grp,
                    'p_retest': p_retest_grp,
                    'expected_cost': expected_cost_two
                }

    # 记录结果
    rec = {
        'BMI组': group_labels[i],
        'BMI下限': lower, 'BMI上限': upper,
        '样本量': n_grp,
        'best_strategy': best['strategy']
    }
    results.append(rec)

# -----------------------------
# 将结果整理为 DataFrame 并导出 CSV
# -----------------------------
rows = []
for r in results:
    strat = r['best_strategy']
    if strat is None:
        rows.append({
            'BMI组': r['BMI组'],
            'BMI下限': r['BMI下限'],
            'BMI上限': r['BMI上限'],
            '样本量': r['样本量'],
            '策略类型': None,
            '初测周(周)': None,
            '重测间隔(周)': None,
            '重测周(周)': None,
            '组平均成功率': None,
            '组平均重测率': None,
            '期望成本': None
        })
        continue
    rows.append({
        'BMI组': r['BMI组'],
        'BMI下限': r['BMI下限'],
        'BMI上限': r['BMI上限'],
        '样本量': r['样本量'],
        '策略类型': strat['type'],
        '初测周(周)': strat.get('initial_GA'),
        '重测间隔(周)': strat.get('retest_after'),
        '重测周(周)': strat.get('retest_GA'),
        '组平均成功率': strat.get('p_success'),
        '组平均重测率': strat.get('p_retest'),
        '期望成本': strat.get('expected_cost')
    })

df_strategy = pd.DataFrame(rows)
df_strategy.to_csv("BMI_group_strategy.csv", index=False, encoding='utf-8-sig')
print("BMI 组检测策略已导出到 BMI_group_strategy.csv")
df_strategy


NameError: name 'fit' is not defined

In [4]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
from scipy.optimize import minimize

# ===============================
# 1. 导入数据
# ===============================
df = pd.read_excel("male.xlsx")

# ===============================
# 2. 将孕周转成数值 GA_num
# ===============================
def ga_to_float(ga_val):
    try:
        if pd.isna(ga_val):
            return np.nan
        # 已经是数字
        if isinstance(ga_val, (int, float)):
            return float(ga_val)
        # 字符串包含 w/d
        if isinstance(ga_val, str) and 'w' in ga_val:
            w, d = ga_val.split('w')
            w = w.strip()
            d = d.strip().replace('+','').replace('d','0')
            return int(w) + int(d)/7
        # 其他字符串尝试直接转浮点数
        return float(ga_val)
    except:
        return np.nan

df['GA_num'] = df['检测孕周'].apply(ga_to_float)

# 删除缺失关键列的数据
df = df.dropna(subset=['GA_num','孕妇BMI','Y染色体浓度'])

print("关键列数据检查：")
print(df[['GA_num','孕妇BMI','Y染色体浓度']].head())

# ===============================
# 3. 拟合混合效应模型
# ===============================
endog = df['Y染色体浓度']
exog = sm.add_constant(df[['GA_num','孕妇BMI']])
groups = df['孕妇代码']

model = MixedLM(endog, exog, groups=groups)
fit = model.fit()
print(fit.summary())

# ===============================
# 4. 计算个体达标最小GA (GAmin)
# ===============================
threshold = 0.15  # 调整后，使GAmin落在合理孕周范围
beta = fit.params.values
sigma_b = np.sqrt(fit.cov_re.iloc[0,0])
sigma_eps = np.sqrt(fit.scale)

np.random.seed(42)
unique_groups = df['孕妇代码'].unique()
bi_dict = {gid: np.random.normal(0,sigma_b) for gid in unique_groups}

GAmin_list = []
for idx, row in df.iterrows():
    bi = bi_dict[row['孕妇代码']]
    GAmin = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
    GAmin = np.clip(GAmin, 10, 25)  # 左右截断10~25周
    GAmin_list.append(GAmin)
df['GAmin'] = GAmin_list

# ===============================
# 5. 动态规划优化BMI分组
# ===============================
bmi_sorted = np.sort(df['孕妇BMI'].values)
k = 5  # 分组数

def bmi_variance_cut(cuts):
    cuts = [bmi_sorted.min()] + list(cuts) + [bmi_sorted.max()]
    total_var = 0
    for i in range(k):
        mask = (df['孕妇BMI'] >= cuts[i]) & (df['孕妇BMI'] < cuts[i+1])
        if mask.sum() > 0:
            total_var += np.var(df.loc[mask,'GAmin'])
    return total_var

x0 = np.percentile(bmi_sorted, [20,40,60,80])
res = minimize(bmi_variance_cut, x0, bounds=[(bmi_sorted.min(),bmi_sorted.max())]*4)
bmi_cuts = [bmi_sorted.min()] + list(res.x) + [bmi_sorted.max()]
print("BMI分组界点:", bmi_cuts)

# ===============================
# 6. 每组最佳检测时点
# ===============================
best_GA = []
for i in range(k):
    mask = (df['孕妇BMI'] >= bmi_cuts[i]) & (df['孕妇BMI'] < bmi_cuts[i+1])
    GA_group = df.loc[mask,'GAmin']
    best_GA.append(GA_group.mean() if len(GA_group) > 0 else np.nan)
print("各组最佳检测时点:", best_GA)

# ===============================
# 7. 蒙特卡洛模拟量化测量误差
# ===============================
n_mc = 1000
sigma_e = 0.002  # 测量误差0.2%
mc_results = []

for _ in range(n_mc):
    Y_obs = df['Y染色体浓度'] + np.random.normal(0,sigma_e,len(df))
    GAmin_mc = []
    for idx, row in df.iterrows():
        bi = bi_dict[row['孕妇代码']]
        GAmin_i = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
        GAmin_i = np.clip(GAmin_i, 10, 25)
        GAmin_mc.append(GAmin_i)
    mc_results.append(GAmin_mc)

mc_results = np.array(mc_results)
ci_lower = np.percentile(mc_results,2.5,axis=0)
ci_upper = np.percentile(mc_results,97.5,axis=0)
df['GAmin_CI'] = list(zip(ci_lower, ci_upper))

print(df[['孕妇代码','孕妇BMI','GAmin','GAmin_CI']])
print("第二问完成，下面进入第三问...")

# fit 已经拟合好
# df 已经处理好
# bmi_cuts 已经优化好

# -----------------------------
# 第三问增强版（含 GAmin bootstrap CI + 警告修复）
# 直接接在你第二问脚本后面运行
# -----------------------------
import numpy as np
import pandas as pd
from scipy.stats import norm
from tqdm import tqdm
import collections

# -----------------------------
# 参数（可调整）
# -----------------------------
threshold = 0.04         # 题目判定阈值 4%
GA_min_allowed = 10.0
GA_max_allowed = 25.0
grid_step = 0.1
retest_delay_options = [1.0, 2.0]   # 两种重测延迟可选
c_test = 1.0
c_retest = 1.5
c_fail = 50.0

# -----------------------------
# 若需重拟合可在此开启（保持你原来的 refit_flag 逻辑）
# -----------------------------
# refit_flag = False
# if refit_flag:
#     exog = sm.add_constant(df[fixed_effects_vars])
#     model = MixedLM(df['Y染色体浓度'], exog, groups=df['孕妇代码'])
#     fit = model.fit()
#     print(fit.summary())

# -----------------------------
# 提取模型信息
# -----------------------------
beta = fit.params  # pandas Series
cov_beta = fit.cov_params()  # covariance matrix DataFrame
# 随机效应协方差（若模型中存在）
sigma_b2 = fit.cov_re.iloc[0,0] if getattr(fit,'cov_re', None) is not None and fit.cov_re.shape[0] > 0 else 0.0
sigma_eps2 = fit.scale

# 尝试获取个体随机效应估计
random_effects = {}
try:
    random_effects = fit.random_effects
except Exception:
    random_effects = {}

# -----------------------------
# 1) 为每个个体计算点估 GAmin（用 fit.params）并做 bootstrap 来估计 GAmin 的 CI
#    GAmin = (threshold - beta0 - beta_BMI * BMI - bi) / beta_GA
#    Bootstrap 方法：对 beta 采多次多元正态抽样；若没有 bi_est，再对 bi 采样 N(0,sigma_b2)
# -----------------------------
B_boot = 500  # bootstrap 次数（可调整）
beta_names = beta.index.tolist()
beta_vec = beta.values
# cov matrix as numpy array (ordering matches beta.index)
cov_beta_mat = cov_beta.loc[beta_names, beta_names].values

# 确定 beta 中 GA 与 BMI 的系数名（兼容不同命名）
if 'GA_num' in beta.index:
    ga_name = 'GA_num'
elif 'GA' in beta.index:
    ga_name = 'GA'
else:
    raise ValueError("模型中未找到 GA 的系数名称（GA_num 或 GA）。")

if '孕妇BMI' in beta.index:
    bmi_name = '孕妇BMI'
elif 'BMI' in beta.index:
    bmi_name = 'BMI'
else:
    raise ValueError("模型中未找到 BMI 的系数名称（'孕妇BMI' 或 'BMI'）。")

# 为每个个体计算点估与 bootstrap 分位数
GAmin_point = []
GAmin_ci_lower = []
GAmin_ci_upper = []

# 预抽样 betas
rng = np.random.default_rng(12345)
try:
    betas_samples = rng.multivariate_normal(mean=beta_vec, cov=cov_beta_mat, size=B_boot)
except Exception:
    # 若协方差矩阵不可逆/数值问题，退化为每个系数独立正态（用方差 diag）
    diag_var = np.diag(cov_beta_mat)
    betas_samples = np.vstack([rng.normal(loc=beta_vec, scale=np.sqrt(np.abs(diag_var))) for _ in range(B_boot)])

# 为每个样本计算 GAmin 抽样分布
for idx, row in df.iterrows():
    bmi_val = row.get('孕妇BMI', np.nan)
    # 尝试获取个体随机效应估计（优先 use fit.random_effects）
    gid = row['孕妇代码']
    bi_est_fixed = None
    if gid in random_effects:
        re = random_effects[gid]
        try:
            # re 可能是 Series/array/dict
            if hasattr(re, 'iloc'):
                bi_est_fixed = float(re.iloc[0])
            elif isinstance(re, (list, np.ndarray)):
                bi_est_fixed = float(re[0])
            elif isinstance(re, dict):
                bi_est_fixed = float(list(re.values())[0])
            else:
                bi_est_fixed = float(re)
        except Exception:
            bi_est_fixed = None

    # 点估（用 fit.params）
    beta0 = beta.get('const', 0.0)
    beta_ga = beta.get(ga_name, 0.0)
    beta_bmi = beta.get(bmi_name, 0.0)
    # 若 beta_ga 非零
    if beta_ga == 0:
        GA_point = np.nan
    else:
        bi_use = bi_est_fixed if bi_est_fixed is not None else 0.0
        GA_point = (threshold - beta0 - beta_bmi * bmi_val - bi_use) / beta_ga
        GA_point = np.clip(GA_point, GA_min_allowed, GA_max_allowed)
    GAmin_point.append(GA_point)

    # bootstrap 抽样
    GA_samples = []
    for b in range(B_boot):
        beta_s = betas_samples[b]
        # 构造字典名->值，注意顺序与 beta_names 匹配
        # 直接索引对应元素
        # 找索引位置
        try:
            beta0_s = beta_s[beta_names.index('const')] if 'const' in beta_names else 0.0
        except:
            beta0_s = 0.0
        beta_ga_s = beta_s[beta_names.index(ga_name)]
        beta_bmi_s = beta_s[beta_names.index(bmi_name)]
        # bi 抽样或固定
        if bi_est_fixed is not None:
            bi_s = bi_est_fixed
        else:
            if sigma_b2 > 0:
                bi_s = rng.normal(0, np.sqrt(sigma_b2))
            else:
                bi_s = 0.0
        # 若 beta_ga_s 过小或为0，跳过该次样本
        if np.isclose(beta_ga_s, 0.0):
            continue
        GA_s = (threshold - beta0_s - beta_bmi_s * bmi_val - bi_s) / beta_ga_s
        GA_s = np.clip(GA_s, GA_min_allowed, GA_max_allowed)
        GA_samples.append(GA_s)
    if len(GA_samples) == 0:
        GAmin_ci_lower.append(GA_point)
        GAmin_ci_upper.append(GA_point)
    else:
        GAmin_ci_lower.append(np.percentile(GA_samples, 2.5))
        GAmin_ci_upper.append(np.percentile(GA_samples, 97.5))

# 将结果写回 df
df['GAmin'] = GAmin_point
df['GAmin_CI_lower'] = GAmin_ci_lower
df['GAmin_CI_upper'] = GAmin_ci_upper

# -----------------------------
# 原第三问组级策略评估（使用上面已计算的每个个体 GAmin CI 作为不确定性参考）
# -----------------------------
k = len(bmi_cuts) - 1
group_labels = [f"Group{i+1}" for i in range(k)]
GA_grid = np.arange(GA_min_allowed, GA_max_allowed + 1e-8, grid_step)

results = []

# 用于蒙特卡洛评估测量误差对最终策略性能的影响
n_mc = 500   # 可根据计算资源增减
sigma_measure = 0.002  # 测序测量误差 (示例 0.2%)

for i in range(k):
    lower = bmi_cuts[i]
    upper = bmi_cuts[i+1]
    mask = (df['孕妇BMI'] >= lower) & (df['孕妇BMI'] < upper)
    df_grp = df.loc[mask].copy()
    n_grp = len(df_grp)
    if n_grp == 0:
        results.append({'BMI组': group_labels[i],'BMI下限':lower,'BMI上限':upper,'样本量':0,'best_strategy':None})
        continue

    # 逐个体评估：对每个个体选择最小期望成本的策略（在给定网格下）
    per_indiv_best = []
    for idx, row in df_grp.iterrows():
        gid = row['孕妇代码']
        bi_est = None
        if gid in random_effects:
            re = random_effects[gid]
            try:
                if hasattr(re, 'iloc'):
                    bi_est = float(re.iloc[0])
                elif isinstance(re, (list, np.ndarray)):
                    bi_est = float(re[0])
                elif isinstance(re, dict):
                    bi_est = float(list(re.values())[0])
                else:
                    bi_est = float(re)
            except:
                bi_est = None

        best_i = {'cost': np.inf, 'strategy': None}

        # 单次测试枚举
        for ga in GA_grid:
            p_success = success_prob_ga(ga, row, bi_est=bi_est)
            expected_cost = c_test + (1 - p_success) * c_fail
            if expected_cost < best_i['cost']:
                best_i['cost'] = expected_cost
                best_i['strategy'] = {'type':'single','initial_GA':ga,'retest_after':None,'p_success':p_success,'expected_cost':expected_cost}

        # 两阶段测试枚举
        for ga0 in GA_grid:
            for d in retest_delay_options:
                ga1 = ga0 + d
                if ga1 > GA_max_allowed:
                    continue
                p0 = success_prob_ga(ga0, row, bi_est=bi_est)
                p1 = success_prob_ga(ga1, row, bi_est=bi_est)
                p_success = p0 + (1 - p0) * p1
                p_retest = (1 - p0)
                expected_cost = c_test + p_retest * c_retest + (1 - p_success) * c_fail
                if expected_cost < best_i['cost']:
                    best_i['cost'] = expected_cost
                    best_i['strategy'] = {'type':'two_stage','initial_GA':ga0,'retest_after':d,'retest_GA':ga1,'p_success':p_success,'p_retest':p_retest,'expected_cost':expected_cost}

        per_indiv_best.append(best_i['strategy'])

    # 将组内个体策略汇总为组级指标
    p_success_list = [s['p_success'] for s in per_indiv_best]
    p_retest_list = [s.get('p_retest', 0.0) if s['type']=='two_stage' else 0.0 for s in per_indiv_best]
    expected_cost_list = [s['expected_cost'] for s in per_indiv_best]

    group_summary = {
        'BMI组': group_labels[i],
        'BMI下限': lower,
        'BMI上限': upper,
        '样本量': n_grp,
        '组平均成功率': np.mean(p_success_list),
        '组平均重测率': np.mean(p_retest_list),
        '组平均期望成本': np.mean(expected_cost_list)
    }

    # 最常见策略类型
    types = [s['type'] for s in per_indiv_best]
    most_common_type = collections.Counter(types).most_common(1)[0][0]
    group_summary['策略类型_个体决策_多数'] = most_common_type

    # MC 稳健性（阈值扰动法）
    mc_p_success = []
    mc_expected_cost = []
    for mc in range(n_mc):
        p_success_mc = []
        expected_cost_mc = []
        thresh_perturbed = threshold + np.random.normal(0, sigma_measure)
        for s,row in zip(per_indiv_best, df_grp.to_dict('records')):
            # 尝试取 bi_est
            gid = row['孕妇代码']
            bi_est = None
            if gid in random_effects:
                re = random_effects[gid]
                try:
                    if hasattr(re, 'iloc'):
                        bi_est = float(re.iloc[0])
                    elif isinstance(re, (list, np.ndarray)):
                        bi_est = float(re[0])
                    elif isinstance(re, dict):
                        bi_est = float(list(re.values())[0])
                    else:
                        bi_est = float(re)
                except:
                    bi_est = None

            # 内部使用一个简化的 success 函数（阈值可变）
            def success_with_thresh_local(bga, row, bi_est=None, thr=thresh_perturbed):
                mu = 0.0
                if 'const' in beta.index:
                    mu += beta['const']
                for name in beta.index:
                    if name == 'const':
                        continue
                    if name in ['GA_num','GA'] and 'GA_num' in row:
                        mu += beta[name] * bga
                    elif name in ['孕妇BMI','BMI'] and '孕妇BMI' in row:
                        mu += beta[name] * row['孕妇BMI']
                    elif name in row:
                        val = row[name]
                        if pd.isna(val):
                            val = 0.0
                        mu += beta[name] * val
                if bi_est is not None:
                    mu += bi_est
                    sigma = np.sqrt(sigma_eps2)
                    z = (thr - mu) / sigma
                    return 1.0 - norm.cdf(z)
                sigma = np.sqrt(sigma_b2 + sigma_eps2)
                z = (thr - mu) / sigma
                return 1.0 - norm.cdf(z)

            if s['type'] == 'single':
                p_s = success_with_thresh_local(s['initial_GA'], row, bi_est=bi_est)
                expected_cost = c_test + (1 - p_s) * c_fail
            else:
                p0 = success_with_thresh_local(s['initial_GA'], row, bi_est=bi_est)
                p1 = success_with_thresh_local(s['retest_GA'], row, bi_est=bi_est)
                p_s = p0 + (1 - p0) * p1
                p_retest = 1 - p0
                expected_cost = c_test + p_retest * c_retest + (1 - p_s) * c_fail

            p_success_mc.append(p_s)
            expected_cost_mc.append(expected_cost)
        mc_p_success.append(np.mean(p_success_mc))
        mc_expected_cost.append(np.mean(expected_cost_mc))

    group_summary.update({
        '组平均成功率_MC_mean': np.mean(mc_p_success),
        '组平均成功率_MC_2.5%': np.percentile(mc_p_success, 2.5),
        '组平均成功率_MC_97.5%': np.percentile(mc_p_success, 97.5),
        '组平均期望成本_MC_mean': np.mean(mc_expected_cost),
        '组平均期望成本_MC_2.5%': np.percentile(mc_expected_cost, 2.5),
        '组平均期望成本_MC_97.5%': np.percentile(mc_expected_cost, 97.5),
    })

    # 代表策略（最常见的组合）
    rep_strategy = collections.Counter([ (s['type'], round(s.get('initial_GA',0),1), s.get('retest_after')) for s in per_indiv_best]).most_common(1)[0][0]
    group_summary['代表策略'] = rep_strategy

    results.append(group_summary)

# -----------------------------
# 导出 CSV（组级汇总）
# -----------------------------
df_group_strategy = pd.DataFrame(results)
df_group_strategy.to_csv("BMI_group_strategy.csv", index=False, encoding='utf-8-sig')

# 可选：把更新后的个体级 GAmin + CI 也导出，便于检查
df[['孕妇代码','孕妇BMI','GAmin','GAmin_CI_lower','GAmin_CI_upper']].to_csv("GAmin_individual_with_CI.csv", index=False, encoding='utf-8-sig')

print("第三问增强版完成：BMI_group_strategy.csv 与 GAmin_individual_with_CI.csv 已生成（含 GAmin bootstrap CI）")


关键列数据检查：
      GA_num      孕妇BMI    Y染色体浓度
0  11.857143  28.125000  0.025936
1  15.857143  28.515625  0.034887
2  20.142857  28.515625  0.066171
3  22.857143  28.906250  0.061192
4  13.857143  33.331832  0.059230




         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Y染色体浓度   
No. Observations: 1081    Method:             REML     
No. Groups:       267     Scale:              0.0003   
Min. group size:  1       Log-Likelihood:     2511.7149
Max. group size:  8       Converged:          Yes      
Mean group size:  4.0                                  
-------------------------------------------------------
             Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-------------------------------------------------------
const         0.070    0.016  4.310 0.000  0.038  0.101
GA_num        0.003    0.000 19.422 0.000  0.003  0.003
孕妇BMI        -0.001    0.001 -2.649 0.008 -0.002 -0.000
Group Var     0.001    0.005                           

BMI分组界点: [np.float64(20.703125), np.float64(29.95555387442907), np.float64(31.16414152214211), np.float64(32.65305904300824), np.float64(34.4496096936085), np.float64(46.875)]
各组最佳检测时点: [np.float64(24.871081917242837), np.fl

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
from scipy.optimize import minimize

# ===============================
# 1. 导入数据
# ===============================
df = pd.read_excel("male.xlsx")

# ===============================
# 2. 将孕周转成数值 GA_num
# ===============================
def ga_to_float(ga_val):
    try:
        if pd.isna(ga_val):
            return np.nan
        # 已经是数字
        if isinstance(ga_val, (int, float)):
            return float(ga_val)
        # 字符串包含 w/d
        if isinstance(ga_val, str) and 'w' in ga_val:
            w, d = ga_val.split('w')
            w = w.strip()
            d = d.strip().replace('+','').replace('d','0')
            return int(w) + int(d)/7
        # 其他字符串尝试直接转浮点数
        return float(ga_val)
    except:
        return np.nan

df['GA_num'] = df['检测孕周'].apply(ga_to_float)

# 删除缺失关键列的数据
df = df.dropna(subset=['GA_num','孕妇BMI','Y染色体浓度'])

print("关键列数据检查：")
print(df[['GA_num','孕妇BMI','Y染色体浓度']].head())

# ===============================
# 3. 拟合混合效应模型
# ===============================
endog = df['Y染色体浓度']
exog = sm.add_constant(df[['GA_num','孕妇BMI']])
groups = df['孕妇代码']

model = MixedLM(endog, exog, groups=groups)
fit = model.fit()
print(fit.summary())

# ===============================
# 4. 计算个体达标最小GA (GAmin)
# ===============================
threshold = 0.15  # 调整后，使GAmin落在合理孕周范围
beta = fit.params.values
sigma_b = np.sqrt(fit.cov_re.iloc[0,0])
sigma_eps = np.sqrt(fit.scale)

np.random.seed(42)
unique_groups = df['孕妇代码'].unique()
bi_dict = {gid: np.random.normal(0,sigma_b) for gid in unique_groups}

GAmin_list = []
for idx, row in df.iterrows():
    bi = bi_dict[row['孕妇代码']]
    GAmin = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
    GAmin = np.clip(GAmin, 10, 25)  # 左右截断10~25周
    GAmin_list.append(GAmin)
df['GAmin'] = GAmin_list

# ===============================
# 5. 动态规划优化BMI分组
# ===============================
bmi_sorted = np.sort(df['孕妇BMI'].values)
k = 5  # 分组数

def bmi_variance_cut(cuts):
    cuts = [bmi_sorted.min()] + list(cuts) + [bmi_sorted.max()]
    total_var = 0
    for i in range(k):
        mask = (df['孕妇BMI'] >= cuts[i]) & (df['孕妇BMI'] < cuts[i+1])
        if mask.sum() > 0:
            total_var += np.var(df.loc[mask,'GAmin'])
    return total_var

x0 = np.percentile(bmi_sorted, [20,40,60,80])
res = minimize(bmi_variance_cut, x0, bounds=[(bmi_sorted.min(),bmi_sorted.max())]*4)
bmi_cuts = [bmi_sorted.min()] + list(res.x) + [bmi_sorted.max()]
print("BMI分组界点:", bmi_cuts)

# ===============================
# 6. 每组最佳检测时点
# ===============================
best_GA = []
for i in range(k):
    mask = (df['孕妇BMI'] >= bmi_cuts[i]) & (df['孕妇BMI'] < bmi_cuts[i+1])
    GA_group = df.loc[mask,'GAmin']
    best_GA.append(GA_group.mean() if len(GA_group) > 0 else np.nan)
print("各组最佳检测时点:", best_GA)

# ===============================
# 7. 蒙特卡洛模拟量化测量误差
# ===============================
n_mc = 1000
sigma_e = 0.002  # 测量误差0.2%
mc_results = []

for _ in range(n_mc):
    Y_obs = df['Y染色体浓度'] + np.random.normal(0,sigma_e,len(df))
    GAmin_mc = []
    for idx, row in df.iterrows():
        bi = bi_dict[row['孕妇代码']]
        GAmin_i = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
        GAmin_i = np.clip(GAmin_i, 10, 25)
        GAmin_mc.append(GAmin_i)
    mc_results.append(GAmin_mc)

mc_results = np.array(mc_results)
ci_lower = np.percentile(mc_results,2.5,axis=0)
ci_upper = np.percentile(mc_results,97.5,axis=0)
df['GAmin_CI'] = list(zip(ci_lower, ci_upper))

print(df[['孕妇代码','孕妇BMI','GAmin','GAmin_CI']])
# ===============================
# 第三问增强版：个体化策略 + 多协变量 + 测量误差蒙特卡洛
# 直接接在你第二问脚本后面运行
# ===============================
import numpy as np
import pandas as pd
from scipy.stats import norm
from tqdm import tqdm

# -----------------------------
# 参数（可调整）
# -----------------------------
threshold = 0.04         # 题目判定阈值 4%
GA_min_allowed = 10.0
GA_max_allowed = 25.0
grid_step = 0.1
retest_delay_options = [1.0, 2.0]   # 两种重测延迟可选
c_test = 1.0
c_retest = 1.5
c_fail = 50.0

# -----------------------------
# 选取要作为固定效应的协变量（请确保 df 中有这些列）
# 若你已经只拟合了 GA_num 与 孕妇BMI，可把列表改短，不需要重新拟合也能使用下游逻辑
# 下面示例包含年龄、身高、体重、IVF（用列名 '孕妇年龄','孕妇身高','孕妇体重','IVF妊娠方式'）
fixed_effects_vars = ['GA_num', '孕妇BMI']
# 如果数据中存在年龄/身高/体重等，建议加入：
for col in ['孕妇年龄','孕妇身高','孕妇体重','IVF妊娠方式','总读段数','GC含量']:
    if col in df.columns and col not in fixed_effects_vars:
        fixed_effects_vars.append(col)

# -----------------------------
# 若需要，重新拟合模型以包含额外协变量（可选）
# 如果你已经用更完整的 exog 拟合了 fit，则跳过下面拟合块
refit_flag = False  # 若想重新拟合并纳入上面 fixed_effects_vars，请设为 True
if refit_flag:
    exog = sm.add_constant(df[fixed_effects_vars])
    model = MixedLM(df['Y染色体浓度'], exog, groups=df['孕妇代码'])
    fit = model.fit()
    print(fit.summary())

# -----------------------------
# 从 fit 中提取参数与不确定性
# -----------------------------
beta = fit.params  # pandas Series
# 随机效应协方差（若模型中存在）
sigma_b2 = fit.cov_re.iloc[0,0] if getattr(fit,'cov_re', None) is not None and fit.cov_re.shape[0] > 0 else 0.0
sigma_eps2 = fit.scale
sigma_total = np.sqrt(sigma_b2 + sigma_eps2)

# 尝试提取个体随机效应估计（若 fit 支持）
random_effects = {}
try:
    # fit.random_effects 返回 dict: {group: array([...])} 或 {group: Series}
    random_effects = fit.random_effects
except Exception:
    random_effects = {}

# -----------------------------
# 成功概率函数：可接受单个个体的 bi_est（若可用），或对 bi 做边际化
# -----------------------------
def success_prob_ga(bga, row, bi_est=None):
    """
    bga: float (weeks)
    row: pd.Series, 包含该个体的协变量（BMI、年龄等）
    bi_est: 若存在个体随机效应估计，传入以使用更精确的预测
    返回：P(Y >= threshold)
    """
    mu = 0.0
    # 常数项
    if 'const' in beta.index:
        mu += beta['const']
    # 对每个 fixed effect 乘系数
    for name in beta.index:
        if name == 'const':
            continue
        # GA 字段
        if name in ['GA_num','GA'] and 'GA_num' in row.index:
            mu += beta[name] * bga
        # BMI
        elif name in ['孕妇BMI','BMI'] and '孕妇BMI' in row.index:
            mu += beta[name] * row['孕妇BMI']
        # 其他协变量，优先从 row 中取值
        elif name in row.index:
            val = row[name]
            if pd.isna(val):
                val = 0.0
            mu += beta[name] * val
        # 如果系数名不在 row 中并且不是 GA/BMI，跳过

    if bi_est is not None:
        mu += bi_est
        sigma = np.sqrt(sigma_eps2)
        z = (threshold - mu) / sigma
        return 1.0 - norm.cdf(z)

    # 边际化 bi ~ N(0, sigma_b2)
    sigma = np.sqrt(sigma_b2 + sigma_eps2)
    z = (threshold - mu) / sigma
    return 1.0 - norm.cdf(z)

# -----------------------------
# 准备 BMI 分组与 GA 网格
# -----------------------------
k = len(bmi_cuts) - 1
group_labels = [f"Group{i+1}" for i in range(k)]
GA_grid = np.arange(GA_min_allowed, GA_max_allowed + 1e-8, grid_step)

# -----------------------------
# 对每个 BMI 组：对组中每个个体在网格上评估最优策略（单次或两阶段）
# 然后对组内个体期望指标取平均（更保守/更个性化的做法）
# -----------------------------
results = []

# 用于蒙特卡洛评估测量误差对最终策略性能的影响
n_mc = 500   # 可根据计算资源增减
sigma_measure = 0.002  # 测序测量误差 (示例 0.2%)

for i in range(k):
    lower = bmi_cuts[i]
    upper = bmi_cuts[i+1]
    mask = (df['孕妇BMI'] >= lower) & (df['孕妇BMI'] < upper)
    df_grp = df.loc[mask].copy()
    n_grp = len(df_grp)
    if n_grp == 0:
        results.append({'BMI组': group_labels[i],'BMI下限':lower,'BMI上限':upper,'样本量':0,'best_strategy':None})
        continue

    # 逐个体评估：对每个个体选择最小期望成本的策略（在给定网格下）
    per_indiv_best = []  # 存放每个个体的最优策略和指标
    for idx, row in df_grp.iterrows():
        gid = row['孕妇代码']
        bi_est = None
        if gid in random_effects:
            # random_effects[gid] 可能是 dict/Series/array；若是 array 取第0个元素
            re = random_effects[gid]
            if isinstance(re, (list, np.ndarray)):
                bi_est = float(re[0])
            elif isinstance(re, dict) or hasattr(re,'to_dict'):
                # 取第一个值（如果是 dict 的情形）
                try:
                    bi_est = float(list(re.values())[0])
                except:
                    bi_est = None
            else:
                try:
                    bi_est = float(re)
                except:
                    bi_est = None

        best_i = {'cost': np.inf, 'strategy': None}

        # 单次测试枚举
        for ga in GA_grid:
            p_success = success_prob_ga(ga, row, bi_est=bi_est)
            expected_cost = c_test + (1 - p_success) * c_fail
            if expected_cost < best_i['cost']:
                best_i['cost'] = expected_cost
                best_i['strategy'] = {'type':'single','initial_GA':ga,'retest_after':None,'p_success':p_success,'expected_cost':expected_cost}

        # 两阶段测试枚举
        for ga0 in GA_grid:
            for d in retest_delay_options:
                ga1 = ga0 + d
                if ga1 > GA_max_allowed:
                    continue
                p0 = success_prob_ga(ga0, row, bi_est=bi_est)
                p1 = success_prob_ga(ga1, row, bi_est=bi_est)
                p_success = p0 + (1 - p0) * p1
                p_retest = (1 - p0)
                expected_cost = c_test + p_retest * c_retest + (1 - p_success) * c_fail
                if expected_cost < best_i['cost']:
                    best_i['cost'] = expected_cost
                    best_i['strategy'] = {'type':'two_stage','initial_GA':ga0,'retest_after':d,'retest_GA':ga1,'p_success':p_success,'p_retest':p_retest,'expected_cost':expected_cost}

        per_indiv_best.append(best_i['strategy'])

    # 将组内个体策略汇总为组级指标（平均成功率、平均重测率、平均期望成本）
    p_success_list = [s['p_success'] for s in per_indiv_best]
    p_retest_list = [s.get('p_retest', 0.0) if s['type']=='two_stage' else 0.0 for s in per_indiv_best]
    expected_cost_list = [s['expected_cost'] for s in per_indiv_best]

    group_summary = {
        'BMI组': group_labels[i],
        'BMI下限': lower,
        'BMI上限': upper,
        '样本量': n_grp,
        '策略类型_个体决策统计_最常见': None,   # 填入最常见策略类型便于报告
        '组平均成功率': np.mean(p_success_list),
        '组平均重测率': np.mean(p_retest_list),
        '组平均期望成本': np.mean(expected_cost_list)
    }

    # 最常见的策略类型
    types = [s['type'] for s in per_indiv_best]
    import collections
    most_common_type = collections.Counter(types).most_common(1)[0][0]
    group_summary['策略类型_个体决策_多数'] = most_common_type

    # 计算测量误差下策略稳健性（蒙特卡洛）
    # 固定每个个体采用上面得出的最优初测/重测时点，模拟测量误差并重新估计成功率与成本
    mc_p_success = []
    mc_expected_cost = []
    for mc in range(n_mc):
        p_success_mc = []
        expected_cost_mc = []
        for s,row in zip(per_indiv_best, df_grp.to_dict('records')):
            # 模拟测量误差：在预测 Y 上加噪（我们在概率层面近似：用成功概率函数，但把阈值上下浮动）
            # 这里简单做法：阈值扰动法 (更严格可以对 Y_obs 做完整仿真)
            thresh_perturbed = threshold + np.random.normal(0, sigma_measure)
            # 使用临时替代函数（只改阈值）
            def success_with_thresh(bga, row, bi_est=None, thr=thresh_perturbed):
                mu = 0.0
                if 'const' in beta.index:
                    mu += beta['const']
                for name in beta.index:
                    if name == 'const':
                        continue
                    if name in ['GA_num','GA'] and 'GA_num' in row:
                        mu += beta[name] * bga
                    elif name in ['孕妇BMI','BMI'] and '孕妇BMI' in row:
                        mu += beta[name] * row['孕妇BMI']
                    elif name in row:
                        val = row[name]
                        if pd.isna(val):
                            val = 0.0
                        mu += beta[name] * val
                if bi_est is not None:
                    mu += bi_est
                    sigma = np.sqrt(sigma_eps2)
                    z = (thr - mu) / sigma
                    return 1.0 - norm.cdf(z)
                sigma = np.sqrt(sigma_b2 + sigma_eps2)
                z = (thr - mu) / sigma
                return 1.0 - norm.cdf(z)

            gid = row['孕妇代码']
            bi_est = None
            if gid in random_effects:
                re = random_effects[gid]
                try:
                    bi_est = float(re[0]) if hasattr(re,'__len__') else float(re)
                except:
                    bi_est = None

            if s['type'] == 'single':
                p_s = success_with_thresh(s['initial_GA'], row, bi_est=bi_est)
                expected_cost = c_test + (1 - p_s) * c_fail
            else:
                p0 = success_with_thresh(s['initial_GA'], row, bi_est=bi_est)
                p1 = success_with_thresh(s['retest_GA'], row, bi_est=bi_est)
                p_s = p0 + (1 - p0) * p1
                p_retest = 1 - p0
                expected_cost = c_test + p_retest * c_retest + (1 - p_s) * c_fail
            p_success_mc.append(p_s)
            expected_cost_mc.append(expected_cost)
        mc_p_success.append(np.mean(p_success_mc))
        mc_expected_cost.append(np.mean(expected_cost_mc))
    # 记录 MC 结果的均值和 95% CI
    group_summary.update({
        '组平均成功率_MC_mean': np.mean(mc_p_success),
        '组平均成功率_MC_2.5%': np.percentile(mc_p_success,2.5),
        '组平均成功率_MC_97.5%': np.percentile(mc_p_success,97.5),
        '组平均期望成本_MC_mean': np.mean(mc_expected_cost),
        '组平均期望成本_MC_2.5%': np.percentile(mc_expected_cost,2.5),
        '组平均期望成本_MC_97.5%': np.percentile(mc_expected_cost,97.5),
    })

    # 存最常见的个体策略作为“代表策略”供报告（便于导表）
    rep_strategy = collections.Counter([ (s['type'], round(s.get('initial_GA',0),1), s.get('retest_after')) for s in per_indiv_best]).most_common(1)[0][0]
    group_summary['代表策略'] = rep_strategy

    results.append(group_summary)

# -----------------------------
# 导出 CSV（组级汇总）
# -----------------------------
df_group_strategy = pd.DataFrame(results)
df_group_strategy.to_csv("BMI_group_strategy.csv", index=False, encoding='utf-8-sig')
print("第三问增强版完成：BMI_group_strategy.csv 已生成（组级汇总与测量误差MC结果）")
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------------
# 留一组交叉验证 + GAmin 蒙特卡洛
# -----------------------------
groups = df['孕妇代码'].unique()
pred_ga_list = []
true_ga_list = []

for gid in tqdm(groups):
    train_df = df[df['孕妇代码'] != gid]
    test_df = df[df['孕妇代码'] == gid]

    exog = sm.add_constant(train_df[fixed_effects_vars])
    model = MixedLM(train_df['Y染色体浓度'], exog, groups=train_df['孕妇代码'])
    fit = model.fit(reml=True)

    exog_test = sm.add_constant(test_df[fixed_effects_vars])
    y_pred = fit.predict(exog_test)

    threshold = 0.04
    for i, row in test_df.iterrows():
        # 简单法：预测浓度达到 threshold 的孕周作为 GAmin
        if y_pred[i - test_df.index[0]] >= threshold:
            pred_ga_list.append(row['GA_num'])
            true_ga_list.append(row['GA_num'])
        else:
            pred_ga_list.append(np.nan)
            true_ga_list.append(row['GA_num'])

pred_ga_array = np.array(pred_ga_list)
pred_ga_array = pred_ga_array[~np.isnan(pred_ga_array)]

print(f"GAmin 平均预测: {np.mean(pred_ga_array):.2f} 周")
print(f"GAmin 预测标准差: {np.std(pred_ga_array):.2f} 周")

# -----------------------------
# Science 风格可视化
# -----------------------------
plt.style.use('classic')  # 简洁黑白基调
plt.figure(figsize=(5,4))

sns.histplot(pred_ga_array, bins=15, color='black', alpha=0.8, edgecolor='white')
plt.axvline(np.mean(pred_ga_array), color='gray', linestyle='--', linewidth=1.2, label='平均预测')
plt.xlabel("预测 GAmin (周)", fontsize=10)
plt.ylabel("孕妇数", fontsize=10)
plt.title("GAmin 蒙特卡洛分布 (留一组交叉验证)", fontsize=11)
plt.xticks(fontsize=9)
plt.yticks(fontsize=9)
plt.legend(fontsize=9)
sns.despine(trim=True)  # 去掉上边框右边框

plt.tight_layout()
plt.show()




In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------------
# 留一组交叉验证 + GAmin 蒙特卡洛
# -----------------------------
groups = df['孕妇代码'].unique()
pred_ga_list = []
true_ga_list = []

for gid in tqdm(groups):
    train_df = df[df['孕妇代码'] != gid]
    test_df = df[df['孕妇代码'] == gid]

    exog = sm.add_constant(train_df[fixed_effects_vars])
    model = MixedLM(train_df['Y染色体浓度'], exog, groups=train_df['孕妇代码'])
    fit = model.fit(reml=True)

    exog_test = sm.add_constant(test_df[fixed_effects_vars])
    y_pred = fit.predict(exog_test)

    threshold = 0.04
    for i, row in test_df.iterrows():
        # 简单法：预测浓度达到 threshold 的孕周作为 GAmin
        if y_pred[i - test_df.index[0]] >= threshold:
            pred_ga_list.append(row['GA_num'])
            true_ga_list.append(row['GA_num'])
        else:
            pred_ga_list.append(np.nan)
            true_ga_list.append(row['GA_num'])

pred_ga_array = np.array(pred_ga_list)
pred_ga_array = pred_ga_array[~np.isnan(pred_ga_array)]

print(f"GAmin 平均预测: {np.mean(pred_ga_array):.2f} 周")
print(f"GAmin 预测标准差: {np.std(pred_ga_array):.2f} 周")

# -----------------------------
# Science 风格可视化
# -----------------------------
plt.style.use('classic')  # 简洁黑白基调
plt.figure(figsize=(5,4))

sns.histplot(pred_ga_array, bins=15, color='black', alpha=0.8, edgecolor='white')
plt.axvline(np.mean(pred_ga_array), color='gray', linestyle='--', linewidth=1.2, label='平均预测')
plt.xlabel("预测 GAmin (周)", fontsize=10)
plt.ylabel("孕妇数", fontsize=10)
plt.title("GAmin 蒙特卡洛分布 (留一组交叉验证)", fontsize=11)
plt.xticks(fontsize=9)
plt.yticks(fontsize=9)
plt.legend(fontsize=9)
sns.despine(trim=True)  # 去掉上边框右边框

plt.tight_layout()
plt.show()
