# In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
from scipy.optimize import minimize
from scipy.stats import norm
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import collections

# ===============================
# 1. Load data & preprocess
# ===============================
# The workbook path is relative, so it is resolved against the current
# working directory at run time.
input_workbook = "male.xlsx"
df = pd.read_excel(input_workbook)

def ga_to_float(ga_val):
    """Convert a gestational-age value to a float number of weeks.

    Accepted inputs:
      * numbers (returned as ``float``);
      * strings of the form ``"<weeks>w"``, ``"<weeks>w+<days>"`` or
        ``"<weeks>w+<days>d"`` (case-insensitive, surrounding whitespace
        ignored), converted to ``weeks + days / 7``;
      * any other string that ``float()`` can parse.

    Missing or unparseable values yield ``np.nan`` instead of raising, so
    the function is safe to use with ``Series.apply``.
    """
    if isinstance(ga_val, str):
        text = ga_val.strip().lower()
        if 'w' in text:
            weeks_part, _, days_part = text.partition('w')
            # Drop the decorations around the day count. The trailing "d"
            # is removed (not turned into a digit), so "12w+3d" is 3 days.
            days_part = days_part.replace('+', '').replace('d', '').strip()
            try:
                weeks = int(weeks_part.strip())
                # A bare "13w" has no day component: treat it as 0 days.
                days = int(days_part) if days_part else 0
            except ValueError:
                return np.nan
            return weeks + days / 7

    try:
        if pd.isna(ga_val):
            return np.nan
        return float(ga_val)
    except (TypeError, ValueError):
        # Non-scalar or non-numeric input: report it as missing.
        return np.nan

# Numeric gestational age (in weeks) derived from the raw '检测孕周' column.
df['GA_num'] = df['检测孕周'].apply(ga_to_float)

# Every column that feeds the model must be free of missing values.
# NOTE(review): these names must match the workbook headers exactly. The
# recorded run of this cell ended in a KeyError for '孕妇年龄', '孕妇身高',
# 'IVF妊娠方式' and '总读段数' — confirm the real header names in male.xlsx.
fixed_effects_vars = ['GA_num', '孕妇BMI', '孕妇年龄', '孕妇身高', 'IVF妊娠方式', '总读段数', 'GC含量']
all_model_cols = ['Y染色体浓度'] + fixed_effects_vars

# Fail fast with an actionable message: report which required columns are
# absent and which headers the workbook actually provides.
missing_cols = [col for col in all_model_cols if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"Required model columns not found in the data: {missing_cols}. "
        f"Available columns: {list(df.columns)}"
    )

# Drop rows with a missing value in any model column, in a single pass.
df = df.dropna(subset=all_model_cols)

print("关键列数据检查：")
print(df[all_model_cols].head())
print("处理缺失值后，数据集大小：", len(df))

# ===============================
# 3. Fit the mixed-effects model
# ===============================
# Grouping variable: one group per value of '孕妇代码'.
groups = df['孕妇代码']
# Response and fixed-effects design matrix (covariates plus a constant column).
endog = df['Y染色体浓度']
exog = sm.add_constant(df[fixed_effects_vars])

model = MixedLM(endog=endog, exog=exog, groups=groups)
fit = model.fit()
print(fit.summary())

# ... (以下代码保持不变，因为它在数据和模型正确的前提下是有效的)
# ===============================
# 4. 计算个体达标最小GA (GAmin)
# ... (保持不变)
# ===============================
# 5. 动态规划优化BMI分组
# ... (保持不变)
# ===============================
# 6. 每组最佳检测时点
# ... (保持不变)
# ===============================
# 7. 蒙特卡洛模拟量化测量误差
# ... (保持不变)
# ===============================
# 第三问增强版：个体化策略 + 多协变量 + 测量误差蒙特卡洛
# ... (保持不变)
# ===============================

# -----------------------------
# Leave-one-group-out cross-validation + GAmin summary (partially modified)
# -----------------------------
# For each '孕妇代码' value: refit the mixed model on all other groups,
# predict 'Y染色体浓度' for the held-out rows, and record the GA of every
# held-out row whose prediction reaches the threshold.
# NOTE(review): values are collected per row, so one group can contribute
# several GA values; if a per-woman minimum is intended, confirm and reduce
# per group.
groups = df['孕妇代码'].unique()
pred_ga_list = []
true_ga_list = []

# Cut-off applied to the predicted 'Y染色体浓度'; loop-invariant, so it is
# defined once here.
threshold = 0.04

for gid in tqdm(groups):
    is_held_out = df['孕妇代码'] == gid
    train_df = df[~is_held_out]
    test_df = df[is_held_out]

    # Skip folds where either side of the split is empty.
    if train_df.empty or test_df.empty:
        continue

    exog_train = sm.add_constant(train_df[fixed_effects_vars])
    model = MixedLM(train_df['Y染色体浓度'], exog_train, groups=train_df['孕妇代码'])
    fit = model.fit(reml=True)

    # has_constant='add' is required here: within a single group several
    # covariates can be constant and non-zero, and the default
    # has_constant='skip' would then omit the intercept column, leaving the
    # test design matrix with fewer columns than the fitted parameters.
    exog_test = sm.add_constant(test_df[fixed_effects_vars], has_constant='add')
    y_pred = np.asarray(fit.predict(exog_test), dtype=float)

    ga_values = test_df['GA_num'].to_numpy(dtype=float)
    # Rows below the threshold are recorded as NaN and filtered out later.
    pred_ga_list.extend(np.where(y_pred >= threshold, ga_values, np.nan).tolist())
    true_ga_list.extend(ga_values.tolist())

pred_ga_array = np.array(pred_ga_list)
pred_ga_array = pred_ga_array[~np.isnan(pred_ga_array)]

print(f"GAmin 平均预测: {np.mean(pred_ga_array):.2f} 周")
print(f"GAmin 预测标准差: {np.std(pred_ga_array):.2f} 周")

# -----------------------------
# "Science"-style visualisation
# -----------------------------
# Histogram of the collected GA values with a dashed line at their mean.
plt.style.use('classic')
fig, ax = plt.subplots(figsize=(5, 4))

mean_pred_ga = np.mean(pred_ga_array)
sns.histplot(
    pred_ga_array,
    bins=15,
    color='black',
    alpha=0.8,
    edgecolor='white',
    ax=ax,
)
ax.axvline(mean_pred_ga, color='gray', linestyle='--', linewidth=1.2, label='平均预测')

ax.set_xlabel("预测 GAmin (周)", fontsize=10)
ax.set_ylabel("孕妇数", fontsize=10)
ax.set_title("GAmin 蒙特卡洛分布 (留一组交叉验证)", fontsize=11)
plt.xticks(fontsize=9)
plt.yticks(fontsize=9)
ax.legend(fontsize=9)
sns.despine(trim=True)

fig.tight_layout()
plt.show()

# Output of the cell above — KeyError: ['孕妇年龄', '孕妇身高', 'IVF妊娠方式', '总读段数']