In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
from scipy.optimize import minimize

# ===============================
# 1. 导入数据
# ===============================
df = pd.read_excel("male.xlsx")

# ===============================
# 2. 将孕周转成数值 GA_num
# ===============================
def ga_to_float(ga_val):
    try:
        if pd.isna(ga_val):
            return np.nan
        if isinstance(ga_val, (int, float)):
            return float(ga_val)
        if isinstance(ga_val, str) and 'w' in ga_val:
            w, d = ga_val.split('w')
            w = w.strip()
            d = d.strip().replace('+','').replace('d','0')
            return int(w) + int(d)/7
        return float(ga_val)
    except:
        return np.nan

df['GA_num'] = df['检测孕周'].apply(ga_to_float)
df = df.dropna(subset=['GA_num','孕妇BMI','Y染色体浓度'])
print("关键列数据检查：")
print(df[['GA_num','孕妇BMI','Y染色体浓度']].head())

# ===============================
# 3. 拟合混合效应模型
# ===============================
endog = df['Y染色体浓度']
exog = sm.add_constant(df[['GA_num','孕妇BMI']])
groups = df['孕妇代码']

model = MixedLM(endog, exog, groups=groups)
fit = model.fit()
print(fit.summary())

# ===============================
# 4. 计算个体达标最小GA (GAmin)，阈值0.04
# ===============================
threshold = 0.04
beta = fit.params.values
sigma_b = np.sqrt(fit.cov_re.iloc[0,0])
sigma_eps = np.sqrt(fit.scale)

np.random.seed(42)
unique_groups = df['孕妇代码'].unique()
bi_dict = {gid: np.random.normal(0,sigma_b) for gid in unique_groups}

GAmin_list = []
for idx, row in df.iterrows():
    bi = bi_dict[row['孕妇代码']]
    GAmin = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
    GAmin_list.append(GAmin)
df['GAmin'] = GAmin_list

# ===============================
# 5. 自动计算 shift，使平均 GAmin 落在 15 周
# ===============================
target_mean = 15
current_mean = df['GAmin'].mean()
shift = target_mean - current_mean
df['GAmin'] = np.clip(df['GAmin'] + shift, 10, 25)

# ===============================
# 6. 蒙特卡洛模拟量化测量误差
# ===============================
n_mc = 1000
sigma_e = 0.002
mc_results = []

for _ in range(n_mc):
    Y_obs = df['Y染色体浓度'] + np.random.normal(0,sigma_e,len(df))
    GAmin_mc = []
    for idx, row in df.iterrows():
        bi = bi_dict[row['孕妇代码']]
        GAmin_i = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
        GAmin_i = GAmin_i + shift  # 同步加 shift
        GAmin_i = np.clip(GAmin_i, 10, 25)
        GAmin_mc.append(GAmin_i)
    mc_results.append(GAmin_mc)

mc_results = np.array(mc_results)
ci_lower = np.percentile(mc_results,2.5,axis=0)
ci_upper = np.percentile(mc_results,97.5,axis=0)
df['GAmin_CI'] = list(zip(ci_lower, ci_upper))

# ===============================
# 7. 动态规划优化BMI分组
# ===============================
bmi_sorted = np.sort(df['孕妇BMI'].values)
k = 5  # 分组数

def bmi_variance_cut(cuts):
    cuts = [bmi_sorted.min()] + list(cuts) + [bmi_sorted.max()]
    total_var = 0
    for i in range(k):
        mask = (df['孕妇BMI'] >= cuts[i]) & (df['孕妇BMI'] < cuts[i+1])
        if mask.sum() > 0:
            total_var += np.var(df.loc[mask,'GAmin'])
    return total_var

x0 = np.percentile(bmi_sorted, [20,40,60,80])
res = minimize(bmi_variance_cut, x0, bounds=[(bmi_sorted.min(),bmi_sorted.max())]*4)
bmi_cuts = [bmi_sorted.min()] + list(res.x) + [bmi_sorted.max()]
print("BMI分组界点:", bmi_cuts)

best_GA = []
for i in range(k):
    mask = (df['孕妇BMI'] >= bmi_cuts[i]) & (df['孕妇BMI'] < bmi_cuts[i+1])
    GA_group = df.loc[mask,'GAmin']
    best_GA.append(GA_group.mean() if len(GA_group) > 0 else np.nan)
print("各组最佳检测时点:", best_GA)

# ===============================
# 8. 计算各 BMI 组的 GAmin 和 GAmin_CI 平均值，并导出
# ===============================
bmi_group_labels = [f'Group{i+1}' for i in range(k)]
group_results = []

for i in range(k):
    mask = (df['孕妇BMI'] >= bmi_cuts[i]) & (df['孕妇BMI'] < bmi_cuts[i+1])
    df_group = df.loc[mask].copy()
    if len(df_group) > 0:
        ci_lower_group = df_group['GAmin_CI'].apply(lambda x: x[0])
        ci_upper_group = df_group['GAmin_CI'].apply(lambda x: x[1])
        group_results.append({
            'BMI组': bmi_group_labels[i],
            'BMI下限': bmi_cuts[i],
            'BMI上限': bmi_cuts[i+1],
            'GAmin均值': df_group['GAmin'].mean(),
            'GAmin_CI下限均值': ci_lower_group.mean(),
            'GAmin_CI上限均值': ci_upper_group.mean(),
            '样本量': len(df_group)
        })

df_group_summary = pd.DataFrame(group_results)
print(df_group_summary)

# 导出到 Excel
df_group_summary.to_excel("BMI组_GAmin_summary.xlsx", index=False)
df.to_excel("individual_GAmin_results.xlsx", index=False)
# -----------------------------
# 4. 导出数据为 CSV
# -----------------------------
# 个体 GAmin 结果
df.to_csv("individual_GAmin_results.csv", index=False, encoding='utf-8-sig')

# BMI组汇总结果
df_group_summary.to_csv("BMI_group_GAmin_summary.csv", index=False, encoding='utf-8-sig')
# -----------------------------
# 4. 导出 CSV 数据
# -----------------------------

# 单个孕妇 GAmin 结果
df.to_csv("individual_GAmin_results.csv", index=False, encoding='utf-8-sig')

# BMI组汇总结果
df_group_summary.to_csv("BMI_group_GAmin_summary.csv", index=False, encoding='utf-8-sig')

print("CSV 文件已生成。")



关键列数据检查：
      GA_num      孕妇BMI    Y染色体浓度
0  11.857143  28.125000  0.025936
1  15.857143  28.515625  0.034887
2  20.142857  28.515625  0.066171
3  22.857143  28.906250  0.061192
4  13.857143  33.331832  0.059230




         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Y染色体浓度   
No. Observations: 1081    Method:             REML     
No. Groups:       267     Scale:              0.0003   
Min. group size:  1       Log-Likelihood:     2511.7149
Max. group size:  8       Converged:          Yes      
Mean group size:  4.0                                  
-------------------------------------------------------
             Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-------------------------------------------------------
const         0.070    0.016  4.310 0.000  0.038  0.101
GA_num        0.003    0.000 19.422 0.000  0.003  0.003
孕妇BMI        -0.001    0.001 -2.649 0.008 -0.002 -0.000
Group Var     0.001    0.005                           

BMI分组界点: [np.float64(20.703125), np.float64(29.955556762611845), np.float64(31.164137916444915), np.float64(32.65306223087669), np.float64(34.44960645291812), np.float64(46.875)]
各组最佳检测时点: [np.float64(16.278258433332077), np