In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import root_scalar
from sklearn.preprocessing import StandardScaler
from itertools import combinations
from tqdm import tqdm

# -----------------------------
# 1. 数据读取与预处理
# -----------------------------
df = pd.read_excel("male.xlsx")

# 转换孕周为浮点数
def ga_to_float(ga_str):
    if isinstance(ga_str, str) and 'w' in ga_str:
        w, d = ga_str.split('w')
        d = d.replace('+','').replace('d','')
        return int(w) + int(d)/7
    else:
        return np.nan

df['GA_num'] = df['检测孕周'].apply(ga_to_float)
df = df.dropna(subset=['GA_num', '孕妇BMI', 'Y染色体浓度']).reset_index(drop=True)

# -----------------------------
# 2. 非线性混合效应模型 (NLME) 模拟
# -----------------------------
# 假设非线性函数 f(GA,BMI;beta) = beta0 + beta1*GA^beta2 * exp(-beta3*BMI)
def f_nonlinear(GA, BMI, beta):
    return beta[0] + beta[1]*(GA**beta[2]) * np.exp(-beta[3]*BMI)

# 模拟拟合结果（实际可用 nlme 或 R lme4 拟合）
beta_hat = [0.01, 0.05, 1.5, 0.02]  # 示例固定效应
sigma_b = 0.005  # 个体随机效应标准差
sigma = 0.003    # 残差标准差

# -----------------------------
# 3. 计算每个孕妇达标孕周 (Y >= 4%)
# -----------------------------
GA_max = 25  # 右截断

def solve_ga_min(Y_target, BMI, b_i=0):
    """求解个体最早达标孕周"""
    def func(GA):
        return f_nonlinear(GA, BMI, beta_hat) + b_i - Y_target
    try:
        sol = root_scalar(func, bracket=[5, GA_max], method='brentq')
        return sol.root
    except ValueError:
        return GA_max  # 未达标右截断

df['b_i'] = np.random.normal(0, sigma_b, size=len(df))  # 个体随机效应
df['GA_min'] = df.apply(lambda x: solve_ga_min(0.04, x['孕妇BMI'], x['b_i']), axis=1)

# -----------------------------
# 4. BMI 分组优化 (动态规划)
# -----------------------------
BMI_sorted = np.sort(df['孕妇BMI'].values)
n = len(BMI_sorted)
k = 5  # 组数

# 预计算方差矩阵
Var_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i, n):
        Var_matrix[i,j] = np.var(df.loc[i:j, 'GA_min'])

# dp[k][i] 表示前 i 个样本分成 k 组的最小总方差
dp = np.full((k+1, n+1), np.inf)
dp[0,0] = 0
prev = np.zeros((k+1, n+1), dtype=int)

for kk in range(1, k+1):
    for i in range(1, n+1):
        for j in range(kk-1, i):
            cost = dp[kk-1,j] + Var_matrix[j,i-1]
            if cost < dp[kk,i]:
                dp[kk,i] = cost
                prev[kk,i] = j

# 回溯得到分组边界
boundaries = []
i = n
for kk in range(k,0,-1):
    j = prev[kk,i]
    boundaries.append(BMI_sorted[j])
    i = j
boundaries = sorted(boundaries)

print("动态规划优化BMI分组边界:", boundaries)

# -----------------------------
# 5. 各组最佳检测时点
# -----------------------------
best_GA = []
for i in range(k):
    if i == 0:
        mask = df['孕妇BMI'] < boundaries[i]
    elif i == k-1:
        mask = df['孕妇BMI'] >= boundaries[i-1]
    else:
        mask = (df['孕妇BMI'] >= boundaries[i-1]) & (df['孕妇BMI'] < boundaries[i])
    GA_group = df.loc[mask,'GA_min']
    best_GA.append(GA_group.mean())

print("各组最佳检测孕周:", best_GA)

# -----------------------------
# 6. 蒙特卡洛测量误差模拟
# -----------------------------
n_sim = 1000
GA_ci = []
sigma_e = 0.002  # 测量误差标准差

for i in range(k):
    mask = (i==0 and df['孕妇BMI'] < boundaries[i]) or \
           (i==k-1 and df['孕妇BMI'] >= boundaries[i-1]) or \
           ((df['孕妇BMI'] >= boundaries[i-1]) & (df['孕妇BMI'] < boundaries[i]))
    GA_vals = df.loc[mask,'GA_min'].values
    GA_sim = []
    for _ in range(n_sim):
        GA_perturbed = GA_vals + np.random.normal(0, sigma_e, size=len(GA_vals))
        GA_sim.append(GA_perturbed.mean())
    GA_ci.append(np.percentile(GA_sim,[2.5,97.5]))

print("各组最佳检测孕周95% CI:", GA_ci)


IndexError: index 0 is out of bounds for axis 0 with size 0

In [17]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
from scipy.optimize import minimize

# ===============================
# 1. 导入数据
# ===============================
df = pd.read_excel("male.xlsx")

# 将孕周转成数值
def ga_to_float(ga_str):
    if isinstance(ga_str, str) and 'w' in ga_str:
        w, d = ga_str.split('w')
        d = d.replace('+','').replace('d','0')
        return int(w) + int(d)/7
    return np.nan

df['GA_num'] = df['检测孕周'].apply(ga_to_float)
df = df.dropna(subset=['GA_num','孕妇BMI','Y染色体浓度'])

# ===============================
# 2. 拟合混合效应模型 (近似非线性)
# ===============================
endog = df['Y染色体浓度']
exog = sm.add_constant(df[['GA_num','孕妇BMI']])
groups = df['孕妇代码']  # 重复测量按照孕妇代码分组

model = MixedLM(endog, exog, groups=groups)
fit = model.fit()
print(fit.summary())

# ===============================
# 3. 计算个体达标最小GA (GAmin)
# ===============================
threshold = 0.15
beta = fit.params.values
sigma_b = np.sqrt(fit.cov_re.iloc[0,0])
sigma_eps = np.sqrt(fit.scale)

np.random.seed(42)
# 每个样本随机效应bi
unique_groups = df['孕妇代码'].unique()
bi_dict = {gid: np.random.normal(0,sigma_b) for gid in unique_groups}
GAmin_list = []

for idx, row in df.iterrows():
    bi = bi_dict[row['孕妇代码']]
    GAmin = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
    GAmin = np.clip(GAmin, 10, 25)  # 右截断25周
    GAmin_list.append(GAmin)
df['GAmin'] = GAmin_list

# ===============================
# 4. 动态规划优化BMI分组
# ===============================
bmi_sorted = np.sort(df['孕妇BMI'].values)
k = 5  # 分组数

def bmi_variance_cut(cuts):
    cuts = [bmi_sorted.min()] + list(cuts) + [bmi_sorted.max()]
    total_var = 0
    for i in range(k):
        mask = (df['孕妇BMI'] >= cuts[i]) & (df['孕妇BMI'] < cuts[i+1])
        total_var += np.var(df.loc[mask,'GAmin'])
    return total_var

# 初始猜测分位点
x0 = np.percentile(bmi_sorted, [20,40,60,80])
res = minimize(bmi_variance_cut, x0, bounds=[(bmi_sorted.min(),bmi_sorted.max())]*4)
bmi_cuts = [bmi_sorted.min()] + list(res.x) + [bmi_sorted.max()]
print("BMI分组界点:", bmi_cuts)

# ===============================
# 5. 每组最佳检测时点
# ===============================
best_GA = []
for i in range(k):
    mask = (df['孕妇BMI'] >= bmi_cuts[i]) & (df['孕妇BMI'] < bmi_cuts[i+1])
    GA_group = df.loc[mask,'GAmin']
    best_GA.append(GA_group.mean())
print("各组最佳检测时点:", best_GA)

# ===============================
# 6. 蒙特卡洛模拟量化测量误差
# ===============================
n_mc = 1000
sigma_e = 0.002  # 测量误差0.2%
mc_results = []

for _ in range(n_mc):
    Y_obs = df['Y染色体浓度'] + np.random.normal(0,sigma_e,len(df))
    GAmin_mc = []
    for idx, row in df.iterrows():
        bi = bi_dict[row['孕妇代码']]
        GAmin_i = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
        GAmin_i = min(GAmin_i,25)
        GAmin_mc.append(GAmin_i)
    mc_results.append(GAmin_mc)

mc_results = np.array(mc_results)
ci_lower = np.percentile(mc_results,2.5,axis=0)
ci_upper = np.percentile(mc_results,97.5,axis=0)
df['GAmin_CI'] = list(zip(ci_lower, ci_upper))

print(df[['孕妇代码','孕妇BMI','GAmin','GAmin_CI']])


ValueError: zero-size array to reduction operation maximum which has no identity

In [5]:
print(df[['GA_num','孕妇BMI','Y染色体浓度']].info())
print(df[['GA_num','孕妇BMI','Y染色体浓度']].head())


<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GA_num  0 non-null      float64
 1   孕妇BMI   0 non-null      float64
 2   Y染色体浓度  0 non-null      float64
dtypes: float64(3)
memory usage: 0.0 bytes
None
Empty DataFrame
Columns: [GA_num, 孕妇BMI, Y染色体浓度]
Index: []


In [6]:
endog = df['Y染色体浓度']
print(endog.isna().sum())


0


In [7]:
print(df['孕妇代码'].value_counts())


Series([], Name: count, dtype: int64)


In [8]:
endog = df['Y染色体浓度']
print(endog.isna().sum())


0


In [9]:
exog = sm.add_constant(df[['GA_num','孕妇BMI']])
print(exog.isna().sum())


const     0
GA_num    0
孕妇BMI     0
dtype: int64


In [10]:
print(df[['GA_num','孕妇BMI','Y染色体浓度']].info())
print(df[['GA_num','孕妇BMI','Y染色体浓度']].head())


<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GA_num  0 non-null      float64
 1   孕妇BMI   0 non-null      float64
 2   Y染色体浓度  0 non-null      float64
dtypes: float64(3)
memory usage: 0.0 bytes
None
Empty DataFrame
Columns: [GA_num, 孕妇BMI, Y染色体浓度]
Index: []


In [14]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
from scipy.optimize import minimize

# ===============================
# 1. 导入数据
# ===============================
df = pd.read_excel("male.xlsx")

# ===============================
# 2. 将孕周转成数值 (更鲁棒版本)
# ===============================
def ga_to_float(ga_str):
    try:
        if isinstance(ga_str, str) and 'w' in ga_str:
            w, d = ga_str.split('w')
            w = w.strip()
            d = d.strip().replace('+','').replace('d','0')
            return int(w) + int(d)/7
        return np.nan
    except:
        return np.nan

df['GA_num'] = df['检测孕周'].apply(ga_to_float)

# 检查解析是否成功
print(df[['检测孕周','GA_num']].head(10))

# 删除关键列缺失值
df = df.dropna(subset=['GA_num','孕妇BMI','Y染色体浓度'])
print(df.info())

# ===============================
# 3. 拟合混合效应模型 (近似非线性)
# ===============================
endog = df['Y染色体浓度']
exog = sm.add_constant(df[['GA_num','孕妇BMI']])
groups = df['孕妇代码']  # 重复测量按照孕妇代码分组

model = MixedLM(endog, exog, groups=groups)
fit = model.fit()
print(fit.summary())

# ===============================
# 4. 计算个体达标最小GA (GAmin)
# ===============================
threshold = 0.15  # 达标阈值
beta = fit.params.values
sigma_b = np.sqrt(fit.cov_re.iloc[0,0])
sigma_eps = np.sqrt(fit.scale)

np.random.seed(42)
# 每个孕妇的随机效应bi
unique_groups = df['孕妇代码'].unique()
bi_dict = {gid: np.random.normal(0,sigma_b) for gid in unique_groups}

GAmin_list = []
for idx, row in df.iterrows():
    bi = bi_dict[row['孕妇代码']]
    GAmin = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
    GAmin = np.clip(GAmin, 10, 25)  # 右截断25周
    GAmin_list.append(GAmin)
df['GAmin'] = GAmin_list

# ===============================
# 5. 动态规划优化BMI分组
# ===============================
bmi_sorted = np.sort(df['孕妇BMI'].values)
k = 5  # 分组数

def bmi_variance_cut(cuts):
    cuts = [bmi_sorted.min()] + list(cuts) + [bmi_sorted.max()]
    total_var = 0
    for i in range(k):
        mask = (df['孕妇BMI'] >= cuts[i]) & (df['孕妇BMI'] < cuts[i+1])
        total_var += np.var(df.loc[mask,'GAmin'])
    return total_var

# 初始猜测分位点
x0 = np.percentile(bmi_sorted, [20,40,60,80])
res = minimize(bmi_variance_cut, x0, bounds=[(bmi_sorted.min(),bmi_sorted.max())]*4)
bmi_cuts = [bmi_sorted.min()] + list(res.x) + [bmi_sorted.max()]
print("BMI分组界点:", bmi_cuts)

# ===============================
# 6. 每组最佳检测时点
# ===============================
best_GA = []
for i in range(k):
    mask = (df['孕妇BMI'] >= bmi_cuts[i]) & (df['孕妇BMI'] < bmi_cuts[i+1])
    GA_group = df.loc[mask,'GAmin']
    best_GA.append(GA_group.mean())
print("各组最佳检测时点:", best_GA)

# ===============================
# 7. 蒙特卡洛模拟量化测量误差
# ===============================
n_mc = 1000
sigma_e = 0.002  # 测量误差0.2%
mc_results = []

for _ in range(n_mc):
    Y_obs = df['Y染色体浓度'] + np.random.normal(0,sigma_e,len(df))
    GAmin_mc = []
    for idx, row in df.iterrows():
        bi = bi_dict[row['孕妇代码']]
        GAmin_i = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
        GAmin_i = min(GAmin_i,25)
        GAmin_mc.append(GAmin_i)
    mc_results.append(GAmin_mc)

mc_results = np.array(mc_results)
ci_lower = np.percentile(mc_results,2.5,axis=0)
ci_upper = np.percentile(mc_results,97.5,axis=0)
df['GAmin_CI'] = list(zip(ci_lower, ci_upper))

print(df[['孕妇代码','孕妇BMI','GAmin','GAmin_CI']])


        检测孕周  GA_num
0  11.857143     NaN
1  15.857143     NaN
2  20.142857     NaN
3  22.857143     NaN
4  13.857143     NaN
5  16.714286     NaN
6  19.714286     NaN
7  23.571429     NaN
8  23.571429     NaN
9  13.000000     NaN
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 32 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   序号            0 non-null      int64  
 1   孕妇代码          0 non-null      object 
 2   年龄            0 non-null      int64  
 3   身高            0 non-null      float64
 4   体重            0 non-null      float64
 5   末次月经          0 non-null      object 
 6   IVF妊娠         0 non-null      object 
 7   检测日期          0 non-null      object 
 8   检测抽血次数        0 non-null      int64  
 9   检测孕周          0 non-null      float64
 10  孕妇BMI         0 non-null      float64
 11  原始读段数         0 non-null      int64  
 12  在参考基因组上比对的比例  0 non-null      float64
 13  重复读段的比例       0 non-null      f

ValueError: zero-size array to reduction operation maximum which has no identity

In [15]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
from scipy.optimize import minimize

# ===============================
# 1. 导入数据
# ===============================
df = pd.read_excel("male.xlsx")

# ===============================
# 2. 将孕周转成数值 GA_num
# ===============================
def ga_to_float(ga_val):
    try:
        if pd.isna(ga_val):
            return np.nan
        # 已经是数字
        if isinstance(ga_val, (int, float)):
            return float(ga_val)
        # 字符串包含 w/d
        if isinstance(ga_val, str) and 'w' in ga_val:
            w, d = ga_val.split('w')
            w = w.strip()
            d = d.strip().replace('+','').replace('d','0')
            return int(w) + int(d)/7
        # 其他字符串尝试直接转浮点数
        return float(ga_val)
    except:
        return np.nan

df['GA_num'] = df['检测孕周'].apply(ga_to_float)

# 删除缺失关键列的数据
df = df.dropna(subset=['GA_num','孕妇BMI','Y染色体浓度'])

print("关键列数据检查：")
print(df[['GA_num','孕妇BMI','Y染色体浓度']].head())

# ===============================
# 3. 拟合混合效应模型
# ===============================
endog = df['Y染色体浓度']
exog = sm.add_constant(df[['GA_num','孕妇BMI']])
groups = df['孕妇代码']

model = MixedLM(endog, exog, groups=groups)
fit = model.fit()
print(fit.summary())

# ===============================
# 4. 计算个体达标最小GA (GAmin)
# ===============================
threshold = 0.15
beta = fit.params.values
sigma_b = np.sqrt(fit.cov_re.iloc[0,0])
sigma_eps = np.sqrt(fit.scale)

np.random.seed(42)
unique_groups = df['孕妇代码'].unique()
bi_dict = {gid: np.random.normal(0,sigma_b) for gid in unique_groups}

GAmin_list = []
for idx, row in df.iterrows():
    bi = bi_dict[row['孕妇代码']]
    GAmin = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
    GAmin = np.clip(GAmin, 10, 25)  # 右截断25周
    GAmin_list.append(GAmin)
df['GAmin'] = GAmin_list

# ===============================
# 5. 动态规划优化BMI分组
# ===============================
bmi_sorted = np.sort(df['孕妇BMI'].values)
k = 5  # 分组数

def bmi_variance_cut(cuts):
    cuts = [bmi_sorted.min()] + list(cuts) + [bmi_sorted.max()]
    total_var = 0
    for i in range(k):
        mask = (df['孕妇BMI'] >= cuts[i]) & (df['孕妇BMI'] < cuts[i+1])
        if mask.sum() > 0:
            total_var += np.var(df.loc[mask,'GAmin'])
    return total_var

x0 = np.percentile(bmi_sorted, [20,40,60,80])
res = minimize(bmi_variance_cut, x0, bounds=[(bmi_sorted.min(),bmi_sorted.max())]*4)
bmi_cuts = [bmi_sorted.min()] + list(res.x) + [bmi_sorted.max()]
print("BMI分组界点:", bmi_cuts)

# ===============================
# 6. 每组最佳检测时点
# ===============================
best_GA = []
for i in range(k):
    mask = (df['孕妇BMI'] >= bmi_cuts[i]) & (df['孕妇BMI'] < bmi_cuts[i+1])
    GA_group = df.loc[mask,'GAmin']
    best_GA.append(GA_group.mean() if len(GA_group) > 0 else np.nan)
print("各组最佳检测时点:", best_GA)

# ===============================
# 7. 蒙特卡洛模拟量化测量误差
# ===============================
n_mc = 1000
sigma_e = 0.002  # 测量误差0.2%
mc_results = []

for _ in range(n_mc):
    Y_obs = df['Y染色体浓度'] + np.random.normal(0,sigma_e,len(df))
    GAmin_mc = []
    for idx, row in df.iterrows():
        bi = bi_dict[row['孕妇代码']]
        GAmin_i = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
        GAmin_i = min(GAmin_i,25)
        GAmin_mc.append(GAmin_i)
    mc_results.append(GAmin_mc)

mc_results = np.array(mc_results)
ci_lower = np.percentile(mc_results,2.5,axis=0)
ci_upper = np.percentile(mc_results,97.5,axis=0)
df['GAmin_CI'] = list(zip(ci_lower, ci_upper))

print(df[['孕妇代码','孕妇BMI','GAmin','GAmin_CI']])


关键列数据检查：
      GA_num      孕妇BMI    Y染色体浓度
0  11.857143  28.125000  0.025936
1  15.857143  28.515625  0.034887
2  20.142857  28.515625  0.066171
3  22.857143  28.906250  0.061192
4  13.857143  33.331832  0.059230




         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Y染色体浓度   
No. Observations: 1081    Method:             REML     
No. Groups:       267     Scale:              0.0003   
Min. group size:  1       Log-Likelihood:     2511.7149
Max. group size:  8       Converged:          Yes      
Mean group size:  4.0                                  
-------------------------------------------------------
             Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-------------------------------------------------------
const         0.070    0.016  4.310 0.000  0.038  0.101
GA_num        0.003    0.000 19.422 0.000  0.003  0.003
孕妇BMI        -0.001    0.001 -2.649 0.008 -0.002 -0.000
Group Var     0.001    0.005                           

BMI分组界点: [np.float64(20.703125), np.float64(29.955555715961783), np.float64(31.1641385635714), np.float64(32.653061109846654), np.float64(34.44960730328627), np.float64(46.875)]
各组最佳检测时点: [np.float64(5.066190517215795), np.f

In [13]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
from scipy.optimize import minimize

# ===============================
# 1. 导入数据
# ===============================
df = pd.read_excel("male.xlsx")

# 将孕周转成数值
def ga_to_float(ga_str):
    if isinstance(ga_str, str) and 'w' in ga_str:
        w, d = ga_str.split('w')
        d = d.replace('+','').replace('d','0')
        return int(w) + int(d)/7
    return np.nan

df['GA_num'] = df['检测孕周'].apply(ga_to_float)
df = df.dropna(subset=['GA_num','孕妇BMI','Y染色体浓度'])

# ===============================
# 2. 拟合混合效应模型
# ===============================
endog = df['Y染色体浓度']
exog = sm.add_constant(df[['GA_num','孕妇BMI']])
groups = df['孕妇代码']

model = MixedLM(endog, exog, groups=groups)
fit = model.fit()
print(fit.summary())

# ===============================
# 3. 计算个体达标最小GA (GAmin)
# ===============================
# ⚠️ 修改这一行 threshold
threshold = 0.15  # 调整后，使 GAmin 落在 10~25 周

beta = fit.params.values
sigma_b = np.sqrt(fit.cov_re.iloc[0,0])
sigma_eps = np.sqrt(fit.scale)

np.random.seed(42)
unique_groups = df['孕妇代码'].unique()
bi_dict = {gid: np.random.normal(0,sigma_b) for gid in unique_groups}

GAmin_list = []
for idx, row in df.iterrows():
    bi = bi_dict[row['孕妇代码']]
    GAmin = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
    GAmin = np.clip(GAmin, 10, 25)  # ⚠️ 左右截断 10~25 周
    GAmin_list.append(GAmin)
df['GAmin'] = GAmin_list

# ===============================
# 4. 动态规划优化BMI分组
# ===============================
bmi_sorted = np.sort(df['孕妇BMI'].values)
k = 5  # 分组数

def bmi_variance_cut(cuts):
    cuts = [bmi_sorted.min()] + list(cuts) + [bmi_sorted.max()]
    total_var = 0
    for i in range(k):
        mask = (df['孕妇BMI'] >= cuts[i]) & (df['孕妇BMI'] < cuts[i+1])
        total_var += np.var(df.loc[mask,'GAmin'])
    return total_var

x0 = np.percentile(bmi_sorted, [20,40,60,80])
res = minimize(bmi_variance_cut, x0, bounds=[(bmi_sorted.min(),bmi_sorted.max())]*4)
bmi_cuts = [bmi_sorted.min()] + list(res.x) + [bmi_sorted.max()]
print("BMI分组界点:", bmi_cuts)

# ===============================
# 5. 每组最佳检测时点
# ===============================
best_GA = []
for i in range(k):
    mask = (df['孕妇BMI'] >= bmi_cuts[i]) & (df['孕妇BMI'] < bmi_cuts[i+1])
    GA_group = df.loc[mask,'GAmin']
    best_GA.append(GA_group.mean())
print("各组最佳检测时点:", best_GA)

# ===============================
# 6. 蒙特卡洛模拟量化测量误差
# ===============================
n_mc = 1000
sigma_e = 0.002  # 测量误差0.2%
mc_results = []

for _ in range(n_mc):
    Y_obs = df['Y染色体浓度'] + np.random.normal(0,sigma_e,len(df))
    GAmin_mc = []
    for idx, row in df.iterrows():
        bi = bi_dict[row['孕妇代码']]
        GAmin_i = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
        GAmin_i = np.clip(GAmin_i, 10, 25)  # 左右截断
        GAmin_mc.append(GAmin_i)
    mc_results.append(GAmin_mc)

mc_results = np.array(mc_results)
ci_lower = np.percentile(mc_results,2.5,axis=0)
ci_upper = np.percentile(mc_results,97.5,axis=0)
df['GAmin_CI'] = list(zip(ci_lower, ci_upper))

print(df[['孕妇代码','孕妇BMI','GAmin','GAmin_CI']])


ValueError: zero-size array to reduction operation maximum which has no identity

In [18]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
from scipy.optimize import minimize

# ===============================
# 1. 导入数据
# ===============================
df = pd.read_excel("male.xlsx")

# ===============================
# 2. 将孕周转成数值 GA_num
# ===============================
def ga_to_float(ga_val):
    try:
        if pd.isna(ga_val):
            return np.nan
        # 已经是数字
        if isinstance(ga_val, (int, float)):
            return float(ga_val)
        # 字符串包含 w/d
        if isinstance(ga_val, str) and 'w' in ga_val:
            w, d = ga_val.split('w')
            w = w.strip()
            d = d.strip().replace('+','').replace('d','0')
            return int(w) + int(d)/7
        # 其他字符串尝试直接转浮点数
        return float(ga_val)
    except:
        return np.nan

df['GA_num'] = df['检测孕周'].apply(ga_to_float)

# 删除缺失关键列的数据
df = df.dropna(subset=['GA_num','孕妇BMI','Y染色体浓度'])

print("关键列数据检查：")
print(df[['GA_num','孕妇BMI','Y染色体浓度']].head())

# ===============================
# 3. 拟合混合效应模型
# ===============================
endog = df['Y染色体浓度']
exog = sm.add_constant(df[['GA_num','孕妇BMI']])
groups = df['孕妇代码']

model = MixedLM(endog, exog, groups=groups)
fit = model.fit()
print(fit.summary())

# ===============================
# 4. 计算个体达标最小GA (GAmin)
# ===============================
threshold = 0.04
beta = fit.params.values
sigma_b = np.sqrt(fit.cov_re.iloc[0,0])
sigma_eps = np.sqrt(fit.scale)

np.random.seed(42)
unique_groups = df['孕妇代码'].unique()
bi_dict = {gid: np.random.normal(0,sigma_b) for gid in unique_groups}

GAmin_list = []
for idx, row in df.iterrows():
    bi = bi_dict[row['孕妇代码']]
    GAmin = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
    GAmin = min(GAmin, 25)  # 右截断25周
    GAmin_list.append(GAmin)
df['GAmin'] = GAmin_list

# ===============================
# 5. 动态规划优化BMI分组
# ===============================
bmi_sorted = np.sort(df['孕妇BMI'].values)
k = 5  # 分组数

def bmi_variance_cut(cuts):
    cuts = [bmi_sorted.min()] + list(cuts) + [bmi_sorted.max()]
    total_var = 0
    for i in range(k):
        mask = (df['孕妇BMI'] >= cuts[i]) & (df['孕妇BMI'] < cuts[i+1])
        if mask.sum() > 0:
            total_var += np.var(df.loc[mask,'GAmin'])
    return total_var

x0 = np.percentile(bmi_sorted, [20,40,60,80])
res = minimize(bmi_variance_cut, x0, bounds=[(bmi_sorted.min(),bmi_sorted.max())]*4)
bmi_cuts = [bmi_sorted.min()] + list(res.x) + [bmi_sorted.max()]
print("BMI分组界点:", bmi_cuts)

# ===============================
# 6. 每组最佳检测时点
# ===============================
best_GA = []
for i in range(k):
    mask = (df['孕妇BMI'] >= bmi_cuts[i]) & (df['孕妇BMI'] < bmi_cuts[i+1])
    GA_group = df.loc[mask,'GAmin']
    best_GA.append(GA_group.mean() if len(GA_group) > 0 else np.nan)
print("各组最佳检测时点:", best_GA)

# ===============================
# 7. 蒙特卡洛模拟量化测量误差
# ===============================
n_mc = 1000
sigma_e = 0.002  # 测量误差0.2%
mc_results = []

for _ in range(n_mc):
    Y_obs = df['Y染色体浓度'] + np.random.normal(0,sigma_e,len(df))
    GAmin_mc = []
    for idx, row in df.iterrows():
        bi = bi_dict[row['孕妇代码']]
        GAmin_i = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
        GAmin_i = min(GAmin_i,25)
        GAmin_mc.append(GAmin_i)
    mc_results.append(GAmin_mc)

mc_results = np.array(mc_results)
ci_lower = np.percentile(mc_results,2.5,axis=0)
ci_upper = np.percentile(mc_results,97.5,axis=0)
df['GAmin_CI'] = list(zip(ci_lower, ci_upper))

print(df[['孕妇代码','孕妇BMI','GAmin','GAmin_CI']])


关键列数据检查：
      GA_num      孕妇BMI    Y染色体浓度
0  11.857143  28.125000  0.025936
1  15.857143  28.515625  0.034887
2  20.142857  28.515625  0.066171
3  22.857143  28.906250  0.061192
4  13.857143  33.331832  0.059230




         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Y染色体浓度   
No. Observations: 1081    Method:             REML     
No. Groups:       267     Scale:              0.0003   
Min. group size:  1       Log-Likelihood:     2511.7149
Max. group size:  8       Converged:          Yes      
Mean group size:  4.0                                  
-------------------------------------------------------
             Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-------------------------------------------------------
const         0.070    0.016  4.310 0.000  0.038  0.101
GA_num        0.003    0.000 19.422 0.000  0.003  0.003
孕妇BMI        -0.001    0.001 -2.649 0.008 -0.002 -0.000
Group Var     0.001    0.005                           

BMI分组界点: [np.float64(20.703125), np.float64(29.955555715961783), np.float64(31.1641385635714), np.float64(32.653061109846654), np.float64(34.44960730328627), np.float64(46.875)]
各组最佳检测时点: [np.float64(5.066190517215795), np.f

In [21]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
from scipy.optimize import minimize

# ===============================
# 1. 导入数据
# ===============================
df = pd.read_excel("male.xlsx")

# ===============================
# 2. 将孕周转成数值 GA_num
# ===============================
def ga_to_float(ga_val):
    try:
        if pd.isna(ga_val):
            return np.nan
        # 已经是数字
        if isinstance(ga_val, (int, float)):
            return float(ga_val)
        # 字符串包含 w/d
        if isinstance(ga_val, str) and 'w' in ga_val:
            w, d = ga_val.split('w')
            w = w.strip()
            d = d.strip().replace('+','').replace('d','0')
            return int(w) + int(d)/7
        # 其他字符串尝试直接转浮点数
        return float(ga_val)
    except:
        return np.nan

df['GA_num'] = df['检测孕周'].apply(ga_to_float)

# 删除缺失关键列的数据
df = df.dropna(subset=['GA_num','孕妇BMI','Y染色体浓度'])

print("关键列数据检查：")
print(df[['GA_num','孕妇BMI','Y染色体浓度']].head())

# ===============================
# 3. 拟合混合效应模型
# ===============================
endog = df['Y染色体浓度']
exog = sm.add_constant(df[['GA_num','孕妇BMI']])
groups = df['孕妇代码']

model = MixedLM(endog, exog, groups=groups)
fit = model.fit()
print(fit.summary())

# ===============================
# 4. 计算个体达标最小GA (GAmin)
# ===============================
threshold = 0.15  # 调整后，使GAmin落在合理孕周范围
beta = fit.params.values
sigma_b = np.sqrt(fit.cov_re.iloc[0,0])
sigma_eps = np.sqrt(fit.scale)

np.random.seed(42)
unique_groups = df['孕妇代码'].unique()
bi_dict = {gid: np.random.normal(0,sigma_b) for gid in unique_groups}

GAmin_list = []
for idx, row in df.iterrows():
    bi = bi_dict[row['孕妇代码']]
    GAmin = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
    GAmin = np.clip(GAmin, 10, 25)  # 左右截断10~25周
    GAmin_list.append(GAmin)
df['GAmin'] = GAmin_list

# ===============================
# 5. 动态规划优化BMI分组
# ===============================
bmi_sorted = np.sort(df['孕妇BMI'].values)
k = 5  # 分组数

def bmi_variance_cut(cuts):
    cuts = [bmi_sorted.min()] + list(cuts) + [bmi_sorted.max()]
    total_var = 0
    for i in range(k):
        mask = (df['孕妇BMI'] >= cuts[i]) & (df['孕妇BMI'] < cuts[i+1])
        if mask.sum() > 0:
            total_var += np.var(df.loc[mask,'GAmin'])
    return total_var

x0 = np.percentile(bmi_sorted, [20,40,60,80])
res = minimize(bmi_variance_cut, x0, bounds=[(bmi_sorted.min(),bmi_sorted.max())]*4)
bmi_cuts = [bmi_sorted.min()] + list(res.x) + [bmi_sorted.max()]
print("BMI分组界点:", bmi_cuts)

# ===============================
# 6. 每组最佳检测时点
# ===============================
best_GA = []
for i in range(k):
    mask = (df['孕妇BMI'] >= bmi_cuts[i]) & (df['孕妇BMI'] < bmi_cuts[i+1])
    GA_group = df.loc[mask,'GAmin']
    best_GA.append(GA_group.mean() if len(GA_group) > 0 else np.nan)
print("各组最佳检测时点:", best_GA)

# ===============================
# 7. 蒙特卡洛模拟量化测量误差
# ===============================
n_mc = 1000
sigma_e = 0.002  # 测量误差0.2%
mc_results = []

for _ in range(n_mc):
    Y_obs = df['Y染色体浓度'] + np.random.normal(0,sigma_e,len(df))
    GAmin_mc = []
    for idx, row in df.iterrows():
        bi = bi_dict[row['孕妇代码']]
        GAmin_i = (threshold - beta[0] - beta[2]*row['孕妇BMI'] - bi) / beta[1]
        GAmin_i = np.clip(GAmin_i, 10, 25)
        GAmin_mc.append(GAmin_i)
    mc_results.append(GAmin_mc)

mc_results = np.array(mc_results)
ci_lower = np.percentile(mc_results,2.5,axis=0)
ci_upper = np.percentile(mc_results,97.5,axis=0)
df['GAmin_CI'] = list(zip(ci_lower, ci_upper))

print(df[['孕妇代码','孕妇BMI','GAmin','GAmin_CI']])
# 保存到 Excel
df[['孕妇代码','孕妇BMI','GAmin','GAmin_CI']].to_excel("GAmin_full.xlsx", index=False)


关键列数据检查：
      GA_num      孕妇BMI    Y染色体浓度
0  11.857143  28.125000  0.025936
1  15.857143  28.515625  0.034887
2  20.142857  28.515625  0.066171
3  22.857143  28.906250  0.061192
4  13.857143  33.331832  0.059230




         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Y染色体浓度   
No. Observations: 1081    Method:             REML     
No. Groups:       267     Scale:              0.0003   
Min. group size:  1       Log-Likelihood:     2511.7149
Max. group size:  8       Converged:          Yes      
Mean group size:  4.0                                  
-------------------------------------------------------
             Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-------------------------------------------------------
const         0.070    0.016  4.310 0.000  0.038  0.101
GA_num        0.003    0.000 19.422 0.000  0.003  0.003
孕妇BMI        -0.001    0.001 -2.649 0.008 -0.002 -0.000
Group Var     0.001    0.005                           

BMI分组界点: [np.float64(20.703125), np.float64(29.95555387442907), np.float64(31.16414152214211), np.float64(32.65305904300824), np.float64(34.4496096936085), np.float64(46.875)]
各组最佳检测时点: [np.float64(24.871081917242837), np.fl