In [8]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# 1. 彻底检查和处理数据问题
print("数据基本信息:")
print(df[['孕妇BMI', 'Y染色体浓度']].describe())

# 检查是否有无穷大或异常值
print("\n检查无穷值:")
print("BMI无穷值:", np.isinf(df['孕妇BMI']).sum())
print("Y浓度无穷值:", np.isinf(df['Y染色体浓度']).sum())

print("\n检查NaN值:")
print("BMI NaN值:", df['孕妇BMI'].isnull().sum())
print("Y浓度 NaN值:", df['Y染色体浓度'].isnull().sum())

# 处理任何可能的异常值
df_clean = df.copy()
df_clean['孕妇BMI'] = df_clean['孕妇BMI'].replace([np.inf, -np.inf], np.nan)
df_clean['Y染色体浓度'] = df_clean['Y染色体浓度'].replace([np.inf, -np.inf], np.nan)

# 删除包含NaN的行
df_clean = df_clean.dropna(subset=['孕妇BMI', 'Y染色体浓度'])
print(f"\n清洗后数据量: {len(df_clean)}/{len(df)}")

# 2. 线性回归分析
X = df_clean[['孕妇BMI']]
y = df_clean['Y染色体浓度']

model = LinearRegression()
model.fit(X, y)

a = model.coef_[0]
c = model.intercept_
print(f"回归方程: Y = {a:.6f} * BMI + {c:.6f}")

# 3. 使用分位数进行BMI分组
k = 5
bmi_cutoffs = np.quantile(df_clean['孕妇BMI'], [0.2, 0.4, 0.6, 0.8])
print("BMI分组界点:", bmi_cutoffs)

# 4. 创建分组
df_clean['BMI_group'] = pd.cut(df_clean['孕妇BMI'], 
                              bins=[0] + list(bmi_cutoffs) + [np.inf], 
                              right=False)

# 5. 计算每组的最佳检测时点
def calc_GAmin(Y_target, bmi):
    """计算达到目标Y浓度所需的最小GA"""
    return (Y_target - c - a * bmi) / a

def best_GA_for_group(group_df, Y_target=0.04, coverage=0.95):
    sigma_b = 0.01
    n_sim = 1000
    GA_samples = []
    for _, row in group_df.iterrows():
        bmi = row['孕妇BMI']
        sims = calc_GAmin(Y_target, bmi) + np.random.normal(0, sigma_b, n_sim)
        GA = np.percentile(sims, (1 - coverage) * 100)
        GA_samples.append(GA)
    return np.mean(GA_samples), np.percentile(GA_samples, 2.5), np.percentile(GA_samples, 97.5)

# 6. 计算并输出结果
result_list = []
for name, group in df_clean.groupby('BMI_group'):
    GA_mean, GA_low, GA_high = best_GA_for_group(group)
    result_list.append({
        'BMI_group': str(name),
        'sample_size': len(group),
        'best_GA': GA_mean,
        'CI_low': GA_low,
        'CI_high': GA_high
    })

result_df = pd.DataFrame(result_list)
print("\n最终结果:")
print(result_df)

# 7. 保存结果
result_df.to_excel('NIPT_BMI_group_result.xlsx', index=False)
print("\n结果已保存到 'NIPT_BMI_group_result.xlsx'")

数据基本信息:
             孕妇BMI       Y染色体浓度
count  1082.000000  1082.000000
mean     32.288791     0.077187
std       2.972432     0.033518
min      20.703125     0.010004
25%      30.208806     0.051381
50%      31.811598     0.075066
75%      33.926237     0.098937
max      46.875000     0.234218

检查无穷值:
BMI无穷值: 0
Y浓度无穷值: 0

检查NaN值:
BMI NaN值: 0
Y浓度 NaN值: 0

清洗后数据量: 1082/1082
回归方程: Y = -0.001706 * BMI + 0.132276
BMI分组界点: [29.95648448 31.16443416 32.67554956 34.44531691]


  for name, group in df_clean.groupby('BMI_group'):



最终结果:
          BMI_group  sample_size    best_GA     CI_low    CI_high
0     [0.0, 29.956)          217  25.219908  24.182003  27.294812
1  [29.956, 31.164)          216  23.558623  22.958786  24.079955
2  [31.164, 32.676)          216  22.215133  21.471846  22.867783
3  [32.676, 34.445)          216  20.570890  19.731047  21.351637
4     [34.445, inf)          217  17.339131  10.319843  19.556189

结果已保存到 'NIPT_BMI_group_result.xlsx'


In [2]:
print("缺失值统计:")
print(df[['孕妇BMI', 'Y染色体浓度']].isnull().sum())

# 方法1：删除含有缺失值的行（如果数据量足够大）
df_clean = df.dropna(subset=['孕妇BMI', 'Y染色体浓度'])


缺失值统计:
孕妇BMI     0
Y染色体浓度    0
dtype: int64


In [9]:
import pandas as pd
import numpy as np
from scipy.optimize import root_scalar
from scipy.stats import norm

# -------------------------------
# 1. 读取数据
# -------------------------------
df = pd.read_excel('male.xlsx')

# 确认列名一致
# df.columns

# -------------------------------
# 2. 计算Y染色体达标时间
# 模型: Yij = f(GAij, BMIi, β) + bi + ϵij
# 我们用线性近似 f(GA,BMI)=a*GA+b*BMI+c
# -------------------------------
# 线性近似拟合 Y ~ GA + BMI
from sklearn.linear_model import LinearRegression

X = df[['检测孕周', '孕妇BMI']]
y = df['Y染色体浓度']

model = LinearRegression()
model.fit(X, y)

a, b = model.coef_
c = model.intercept_

# 定义个体达标函数
def calc_GAmin(Y_target, BMIi):
    # Y = a*GA + b*BMI + c => GA = (Y - b*BMI - c)/a
    return (Y_target - b*BMIi - c)/a

df['GA_min'] = df['孕妇BMI'].apply(lambda bmi: calc_GAmin(0.04, bmi))
# 对未达标设定右截断 25 周
df['GA_min'] = df['GA_min'].apply(lambda x: x if x <= 25 else 25)

# -------------------------------
# 3. BMI分组动态规划优化
# -------------------------------
def dp_bmi_group(df, k):
    # 排序BMI
    df_sorted = df.sort_values('孕妇BMI').reset_index(drop=True)
    BMI = df_sorted['孕妇BMI'].values
    GAmin = df_sorted['GA_min'].values
    n = len(BMI)
    
    # 预计算区间方差
    var_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            var_matrix[i,j] = np.var(GAmin[i:j+1])
    
    # DP初始化
    dp = np.full((k, n), np.inf)
    track = np.zeros((k, n), dtype=int)
    
    for i in range(n):
        dp[0,i] = var_matrix[0,i]
    
    for group in range(1, k):
        for i in range(group, n):
            for j in range(group-1, i):
                cost = dp[group-1,j] + var_matrix[j+1,i]
                if cost < dp[group,i]:
                    dp[group,i] = cost
                    track[group,i] = j
    
    # 回溯分组界点
    borders = []
    idx = n-1
    for group in reversed(range(1, k)):
        j = track[group, idx]
        borders.append(BMI[j])
        idx = j
    borders = sorted(borders)
    return borders

# 假设 k=5
k = 5
bmi_cutoffs = dp_bmi_group(df, k)
print("BMI分组界点:", bmi_cutoffs)

# -------------------------------
# 4. 计算每组最佳检测时点
# -------------------------------
df['BMI_group'] = pd.cut(df['孕妇BMI'], bins=[0]+bmi_cutoffs+[np.inf], right=False)

def best_GA_for_group(group_df, Y_target=0.04, coverage=0.95):
    # 模拟随机效应 bi ~ N(0, σb^2)
    # 简化假设 σb = 0.01
    sigma_b = 0.01
    n_sim = 1000
    GA_samples = []
    for _, row in group_df.iterrows():
        bmi = row['孕妇BMI']
        sims = calc_GAmin(Y_target, bmi) + np.random.normal(0, sigma_b, n_sim)
        # 取使得达标概率 >= coverage 的最小GA
        GA = np.percentile(sims, (1-coverage)*100)
        GA_samples.append(GA)
    return np.mean(GA_samples), np.percentile(GA_samples, 2.5), np.percentile(GA_samples, 97.5)

result_list = []
for name, group in df.groupby('BMI_group'):
    GA_mean, GA_low, GA_high = best_GA_for_group(group)
    result_list.append({
        'BMI_group': str(name),
        'sample_size': len(group),
        'best_GA': GA_mean,
        'CI_low': GA_low,
        'CI_high': GA_high
    })

result_df = pd.DataFrame(result_list)
print(result_df)

# -------------------------------
# 5. 保存结果
# -------------------------------
result_df.to_excel('NIPT_BMI_group_result.xlsx', index=False)


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# -------------------------------
# 1. 数据准备和回归分析
# -------------------------------
# 假设df已经包含数据
print("数据基本信息:")
print(df.info())
print("\n前5行数据:")
print(df.head())

# 检查是否有无穷大值
print("\n无穷值检查:")
print(np.isinf(df[['孕妇BMI', 'Y染色体浓度']]).sum())

# 确保数据类型正确
X = df[['孕妇BMI']].astype(float)  # 明确转换为float类型
y = df['Y染色体浓度'].astype(float)

# 再次检查是否有NaN或inf
print("处理后的缺失值统计:", X.isnull().sum().sum(), y.isnull().sum())
print("处理后的无穷值统计:", np.isinf(X).sum().sum(), np.isinf(y).sum())

# 如果有无穷值，处理它们
if np.isinf(X).sum().sum() > 0 or np.isinf(y).sum() > 0:
    # 替换无穷值为NaN然后填充
    X = X.replace([np.inf, -np.inf], np.nan)
    y = y.replace([np.inf, -np.inf], np.nan)
    # 使用中位数填充
    X = X.fillna(X.median())
    y = y.fillna(y.median())

model = LinearRegression()
model.fit(X, y)

# 修正这里：model.coef_ 是一个数组，不是两个值
a = model.coef_[0]  # 获取第一个系数
c = model.intercept_
print(f"回归方程: Y = {a:.6f} * BMI + {c:.6f}")

# -------------------------------
# 2. 定义计算最小GA的函数
# -------------------------------
def calc_GAmin(Y_target, bmi):
    """计算达到目标Y浓度所需的最小GA"""
    return (Y_target - c - a * bmi) / a  # 修正：使用a而不是b

# -------------------------------
# 3. 动态规划分组（保持原有代码）
# -------------------------------
def dp_bmi_group(df, k):
    n = len(df)
    BMI = df['孕妇BMI'].values
    
    # 计算方差矩阵
    var_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            var_matrix[i,j] = np.var(BMI[i:j+1])
    
    # DP初始化
    dp = np.full((k, n), np.inf)
    track = np.zeros((k, n), dtype=int)
    
    for i in range(n):
        dp[0,i] = var_matrix[0,i]
    
    for group in range(1, k):
        for i in range(group, n):
            for j in range(group-1, i):
                cost = dp[group-1,j] + var_matrix[j+1,i]
                if cost < dp[group,i]:
                    dp[group,i] = cost
                    track[group,i] = j
    
    # 回溯分组界点
    borders = []
    idx = n-1
    for group in reversed(range(1, k)):
        j = track[group, idx]
        borders.append(BMI[j])
        idx = j
    borders = sorted(borders)
    return borders

# 假设 k=5
k = 5
bmi_cutoffs = dp_bmi_group(df, k)
print("BMI分组界点:", bmi_cutoffs)

# -------------------------------
# 4. 计算每组最佳检测时点
# -------------------------------
df['BMI_group'] = pd.cut(df['孕妇BMI'], bins=[0]+bmi_cutoffs+[np.inf], right=False)

def best_GA_for_group(group_df, Y_target=0.04, coverage=0.95):
    # 模拟随机效应 bi ~ N(0, σb^2)
    # 简化假设 σb = 0.01
    sigma_b = 0.01
    n_sim = 1000
    GA_samples = []
    for _, row in group_df.iterrows():
        bmi = row['孕妇BMI']
        sims = calc_GAmin(Y_target, bmi) + np.random.normal(0, sigma_b, n_sim)
        # 取使得达标概率 >= coverage 的最小GA
        GA = np.percentile(sims, (1-coverage)*100)
        GA_samples.append(GA)
    return np.mean(GA_samples), np.percentile(GA_samples, 2.5), np.percentile(GA_samples, 97.5)

result_list = []
for name, group in df.groupby('BMI_group'):
    GA_mean, GA_low, GA_high = best_GA_for_group(group)
    result_list.append({
        'BMI_group': str(name),
        'sample_size': len(group),
        'best_GA': GA_mean,
        'CI_low': GA_low,
        'CI_high': GA_high
    })

result_df = pd.DataFrame(result_list)
print(result_df)

# -------------------------------
# 5. 保存结果
# -------------------------------
result_df.to_excel('NIPT_BMI_group_result.xlsx', index=False)

数据基本信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1082 entries, 0 to 1081
Data columns (total 31 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   序号            1082 non-null   int64  
 1   孕妇代码          1082 non-null   object 
 2   年龄            1082 non-null   int64  
 3   身高            1082 non-null   float64
 4   体重            1082 non-null   float64
 5   末次月经          1082 non-null   object 
 6   IVF妊娠         1082 non-null   object 
 7   检测日期          1082 non-null   object 
 8   检测抽血次数        1082 non-null   int64  
 9   检测孕周          1081 non-null   float64
 10  孕妇BMI         1082 non-null   float64
 11  原始读段数         1082 non-null   int64  
 12  在参考基因组上比对的比例  1082 non-null   float64
 13  重复读段的比例       1082 non-null   float64
 14  唯一比对的读段数      1082 non-null   int64  
 15  GC含量          1082 non-null   float64
 16  13号染色体的Z值     1082 non-null   float64
 17  18号染色体的Z值     1082 non-null   float64
 18  21号染色体的Z值     1082 n

ValueError: Bin edges must be unique: Index([0.0, 28.125, 28.515625, 28.515625, 28.90625, inf], dtype='float64').
You can drop duplicate edges by setting the 'duplicates' kwarg

In [5]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# -------------------------------
# 1. 数据准备和回归分析
# -------------------------------
# 假设df已经包含数据
print("数据基本信息:")
print(df.info())
print("\n前5行数据:")
print(df.head())

# 确保数据类型正确
X = df[['孕妇BMI']].astype(float)
y = df['Y染色体浓度'].astype(float)

model = LinearRegression()
model.fit(X, y)

a = model.coef_[0]
c = model.intercept_
print(f"回归方程: Y = {a:.6f} * BMI + {c:.6f}")

# -------------------------------
# 2. 定义计算最小GA的函数
# -------------------------------
def calc_GAmin(Y_target, bmi):
    """计算达到目标Y浓度所需的最小GA"""
    return (Y_target - c - a * bmi) / a

# -------------------------------
# 3. 动态规划分组
# -------------------------------
def dp_bmi_group(df, k):
    n = len(df)
    BMI = df['孕妇BMI'].values
    
    # 计算方差矩阵
    var_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            var_matrix[i,j] = np.var(BMI[i:j+1])
    
    # DP初始化
    dp = np.full((k, n), np.inf)
    track = np.zeros((k, n), dtype=int)
    
    for i in range(n):
        dp[0,i] = var_matrix[0,i]
    
    for group in range(1, k):
        for i in range(group, n):
            for j in range(group-1, i):
                cost = dp[group-1,j] + var_matrix[j+1,i]
                if cost < dp[group,i]:
                    dp[group,i] = cost
                    track[group,i] = j
    
    # 回溯分组界点
    borders = []
    idx = n-1
    for group in reversed(range(1, k)):
        j = track[group, idx]
        borders.append(BMI[j])
        idx = j
    borders = sorted(borders)
    return borders

# 假设 k=5
k = 5
try:
    bmi_cutoffs = dp_bmi_group(df, k)
    print("BMI分组界点:", bmi_cutoffs)
    
    # -------------------------------
    # 4. 计算每组最佳检测时点
    # -------------------------------
    df['BMI_group'] = pd.cut(df['孕妇BMI'], bins=[0]+bmi_cutoffs+[np.inf], right=False)
    
    def best_GA_for_group(group_df, Y_target=0.04, coverage=0.95):
        # 模拟随机效应 bi ~ N(0, σb^2)
        # 简化假设 σb = 0.01
        sigma_b = 0.01
        n_sim = 1000
        GA_samples = []
        for _, row in group_df.iterrows():
            bmi = row['孕妇BMI']
            sims = calc_GAmin(Y_target, bmi) + np.random.normal(0, sigma_b, n_sim)
            # 取使得达标概率 >= coverage 的最小GA
            GA = np.percentile(sims, (1-coverage)*100)
            GA_samples.append(GA)
        return np.mean(GA_samples), np.percentile(GA_samples, 2.5), np.percentile(GA_samples, 97.5)
    
    result_list = []
    for name, group in df.groupby('BMI_group'):
        GA_mean, GA_low, GA_high = best_GA_for_group(group)
        result_list.append({
            'BMI_group': str(name),
            'sample_size': len(group),
            'best_GA': GA_mean,
            'CI_low': GA_low,
            'CI_high': GA_high
        })
    
    result_df = pd.DataFrame(result_list)
    print("\n分组结果:")
    print(result_df)
    
    # -------------------------------
    # 5. 保存结果
    # -------------------------------
    result_df.to_excel('NIPT_BMI_group_result.xlsx', index=False)
    print("\n结果已保存到 'NIPT_BMI_group_result.xlsx'")
    
except Exception as e:
    print(f"运行过程中出现错误: {e}")
    print("尝试检查数据或调整参数")

数据基本信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1082 entries, 0 to 1081
Data columns (total 31 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   序号            1082 non-null   int64  
 1   孕妇代码          1082 non-null   object 
 2   年龄            1082 non-null   int64  
 3   身高            1082 non-null   float64
 4   体重            1082 non-null   float64
 5   末次月经          1082 non-null   object 
 6   IVF妊娠         1082 non-null   object 
 7   检测日期          1082 non-null   object 
 8   检测抽血次数        1082 non-null   int64  
 9   检测孕周          1081 non-null   float64
 10  孕妇BMI         1082 non-null   float64
 11  原始读段数         1082 non-null   int64  
 12  在参考基因组上比对的比例  1082 non-null   float64
 13  重复读段的比例       1082 non-null   float64
 14  唯一比对的读段数      1082 non-null   int64  
 15  GC含量          1082 non-null   float64
 16  13号染色体的Z值     1082 non-null   float64
 17  18号染色体的Z值     1082 non-null   float64
 18  21号染色体的Z值     1082 n

In [6]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import time

# -------------------------------
# 1. 数据准备和回归分析
# -------------------------------
print("开始数据分析...")
X = df[['孕妇BMI']].astype(float)
y = df['Y染色体浓度'].astype(float)

model = LinearRegression()
model.fit(X, y)

a = model.coef_[0]
c = model.intercept_
print(f"回归方程: Y = {a:.6f} * BMI + {c:.6f}")

# -------------------------------
# 2. 定义计算最小GA的函数
# -------------------------------
def calc_GAmin(Y_target, bmi):
    """计算达到目标Y浓度所需的最小GA"""
    return (Y_target - c - a * bmi) / a

# -------------------------------
# 3. 动态规划分组（优化版本）
# -------------------------------
def dp_bmi_group(df, k):
    print(f"开始动态规划分组，k={k}，数据量={len(df)}")
    start_time = time.time()
    
    n = len(df)
    BMI = df['孕妇BMI'].values
    
    # 计算方差矩阵（优化：只计算上三角部分）
    print("计算方差矩阵...")
    var_matrix = np.zeros((n, n))
    for i in range(n):
        if i % 100 == 0:  # 每100行显示进度
            print(f"计算方差矩阵进度: {i}/{n}")
        for j in range(i, n):
            var_matrix[i,j] = np.var(BMI[i:j+1])
    
    # DP初始化
    dp = np.full((k, n), np.inf)
    track = np.zeros((k, n), dtype=int)
    
    # 初始化第一组
    for i in range(n):
        dp[0,i] = var_matrix[0,i]
    
    # 动态规划
    print("进行动态规划计算...")
    for group in range(1, k):
        print(f"处理第 {group+1} 组...")
        for i in range(group, n):
            if i % 100 == 0:  # 每100个显示进度
                print(f"  进度: {i}/{n}")
            for j in range(group-1, i):
                cost = dp[group-1,j] + var_matrix[j+1,i]
                if cost < dp[group,i]:
                    dp[group,i] = cost
                    track[group,i] = j
    
    # 回溯分组界点
    borders = []
    idx = n-1
    for group in reversed(range(1, k)):
        j = track[group, idx]
        borders.append(BMI[j])
        idx = j
    
    borders = sorted(borders)
    end_time = time.time()
    print(f"动态规划完成，耗时: {end_time - start_time:.2f} 秒")
    return borders

# 尝试运行动态规划
try:
    k = 5
    bmi_cutoffs = dp_bmi_group(df, k)
    print("BMI分组界点:", bmi_cutoffs)
    
    # -------------------------------
    # 4. 计算每组最佳检测时点
    # -------------------------------
    print("开始计算最佳检测时点...")
    df['BMI_group'] = pd.cut(df['孕妇BMI'], bins=[0] + bmi_cutoffs + [np.inf], right=False)
    
    def best_GA_for_group(group_df, Y_target=0.04, coverage=0.95):
        sigma_b = 0.01
        n_sim = 1000
        GA_samples = []
        for _, row in group_df.iterrows():
            bmi = row['孕妇BMI']
            sims = calc_GAmin(Y_target, bmi) + np.random.normal(0, sigma_b, n_sim)
            GA = np.percentile(sims, (1 - coverage) * 100)
            GA_samples.append(GA)
        return np.mean(GA_samples), np.percentile(GA_samples, 2.5), np.percentile(GA_samples, 97.5)
    
    result_list = []
    for name, group in df.groupby('BMI_group'):
        print(f"处理分组: {name}")
        GA_mean, GA_low, GA_high = best_GA_for_group(group)
        result_list.append({
            'BMI_group': str(name),
            'sample_size': len(group),
            'best_GA': GA_mean,
            'CI_low': GA_low,
            'CI_high': GA_high
        })
    
    result_df = pd.DataFrame(result_list)
    print("\n分组结果:")
    print(result_df)
    
    # -------------------------------
    # 5. 保存结果
    # -------------------------------
    result_df.to_excel('NIPT_BMI_group_result.xlsx', index=False)
    print("\n结果已保存到 'NIPT_BMI_group_result.xlsx'")
    
except Exception as e:
    print(f"运行过程中出现错误: {e}")
    import traceback
    traceback.print_exc()
    
    # 如果动态规划太慢，尝试简化版本
    print("\n尝试使用简化分组方法...")
    try:
        # 使用分位数进行简单分组
        bmi_cutoffs = np.quantile(df['孕妇BMI'], [0.2, 0.4, 0.6, 0.8])
        print("使用分位数分组界点:", bmi_cutoffs)
        
        df['BMI_group'] = pd.cut(df['孕妇BMI'], bins=[0] + list(bmi_cutoffs) + [np.inf], right=False)
        
        result_list = []
        for name, group in df.groupby('BMI_group'):
            print(f"处理分组: {name}")
            GA_mean, GA_low, GA_high = best_GA_for_group(group)
            result_list.append({
                'BMI_group': str(name),
                'sample_size': len(group),
                'best_GA': GA_mean,
                'CI_low': GA_low,
                'CI_high': GA_high
            })
        
        result_df = pd.DataFrame(result_list)
        print("\n简化分组结果:")
        print(result_df)
        result_df.to_excel('NIPT_BMI_group_result_simple.xlsx', index=False)
        print("\n简化结果已保存到 'NIPT_BMI_group_result_simple.xlsx'")
        
    except Exception as e2:
        print(f"简化方法也失败: {e2}")

开始数据分析...
回归方程: Y = -0.001706 * BMI + 0.132276
开始动态规划分组，k=5，数据量=1082
计算方差矩阵...
计算方差矩阵进度: 0/1082
计算方差矩阵进度: 100/1082
计算方差矩阵进度: 200/1082
计算方差矩阵进度: 300/1082
计算方差矩阵进度: 400/1082
计算方差矩阵进度: 500/1082
计算方差矩阵进度: 600/1082
计算方差矩阵进度: 700/1082
计算方差矩阵进度: 800/1082
计算方差矩阵进度: 900/1082
计算方差矩阵进度: 1000/1082
进行动态规划计算...
处理第 2 组...
  进度: 100/1082
  进度: 200/1082
  进度: 300/1082
  进度: 400/1082
  进度: 500/1082
  进度: 600/1082
  进度: 700/1082
  进度: 800/1082
  进度: 900/1082
  进度: 1000/1082
处理第 3 组...
  进度: 100/1082
  进度: 200/1082
  进度: 300/1082
  进度: 400/1082
  进度: 500/1082
  进度: 600/1082
  进度: 700/1082
  进度: 800/1082
  进度: 900/1082
  进度: 1000/1082
处理第 4 组...
  进度: 100/1082
  进度: 200/1082
  进度: 300/1082
  进度: 400/1082
  进度: 500/1082
  进度: 600/1082
  进度: 700/1082
  进度: 800/1082
  进度: 900/1082
  进度: 1000/1082
处理第 5 组...
  进度: 100/1082
  进度: 200/1082
  进度: 300/1082
  进度: 400/1082
  进度: 500/1082
  进度: 600/1082
  进度: 700/1082
  进度: 800/1082
  进度: 900/1082
  进度: 1000/1082
动态规划完成，耗时: 35.71 秒
BMI分组界点: [np.float64(28.125), np.f

Traceback (most recent call last):
  File "C:\Users\Lenovo\AppData\Local\Temp\ipykernel_14640\3756536702.py", line 90, in <module>
    df['BMI_group'] = pd.cut(df['孕妇BMI'], bins=[0] + bmi_cutoffs + [np.inf], right=False)
                      ~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\AppData\Roaming\Python\Python313\site-packages\pandas\core\reshape\tile.py", line 257, in cut
    fac, bins = _bins_to_cuts(
                ~~~~~~~~~~~~~^
        x_idx,
        ^^^^^^
    ...<6 lines>...
        ordered=ordered,
        ^^^^^^^^^^^^^^^^
    )
    ^
  File "C:\Users\Lenovo\AppData\Roaming\Python\Python313\site-packages\pandas\core\reshape\tile.py", line 443, in _bins_to_cuts
    raise ValueError(
    ...<2 lines>...
    )
ValueError: Bin edges must be unique: Index([0.0, 28.125, 28.515625, 28.515625, 28.90625, inf], dtype='float64').
You can drop duplicate edges by setting the 'duplicates' kwarg
  for name, group in df.groupby('BMI_group

In [10]:
bmi_bins = [0] + sorted(set(bmi_cutoffs)) + [np.inf]
df['BMI_group'] = pd.cut(df['孕妇BMI'], bins=bmi_bins, right=False)


In [11]:
bmi_cutoffs_unique = sorted(list(set(bmi_cutoffs)))
df['BMI_group'] = pd.cut(df['孕妇BMI'],
                         bins=[0]+bmi_cutoffs_unique+[np.inf],
                         right=False)


In [12]:
import pandas as pd
import numpy as np
from scipy.optimize import root_scalar
from scipy.stats import norm

# -------------------------------
# 1. 读取数据
# -------------------------------
df = pd.read_excel('male.xlsx')

# 确认列名一致
# df.columns

# -------------------------------
# 2. 计算Y染色体达标时间
# 模型: Yij = f(GAij, BMIi, β) + bi + ϵij
# 我们用线性近似 f(GA,BMI)=a*GA+b*BMI+c
# -------------------------------
# 线性近似拟合 Y ~ GA + BMI
from sklearn.linear_model import LinearRegression

X = df[['检测孕周', '孕妇BMI']]
y = df['Y染色体浓度']

model = LinearRegression()
model.fit(X, y)

a, b = model.coef_
c = model.intercept_

# 定义个体达标函数
def calc_GAmin(Y_target, BMIi):
    # Y = a*GA + b*BMI + c => GA = (Y - b*BMI - c)/a
    return (Y_target - b*BMIi - c)/a

df['GA_min'] = df['孕妇BMI'].apply(lambda bmi: calc_GAmin(0.04, bmi))
# 对未达标设定右截断 25 周
df['GA_min'] = df['GA_min'].apply(lambda x: x if x <= 25 else 25)

# -------------------------------
# 3. BMI分组动态规划优化
# -------------------------------
def dp_bmi_group(df, k):
    # 排序BMI
    df_sorted = df.sort_values('孕妇BMI').reset_index(drop=True)
    BMI = df_sorted['孕妇BMI'].values
    GAmin = df_sorted['GA_min'].values
    n = len(BMI)
    
    # 预计算区间方差
    var_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            var_matrix[i,j] = np.var(GAmin[i:j+1])
    
    # DP初始化
    dp = np.full((k, n), np.inf)
    track = np.zeros((k, n), dtype=int)
    
    for i in range(n):
        dp[0,i] = var_matrix[0,i]
    
    for group in range(1, k):
        for i in range(group, n):
            for j in range(group-1, i):
                cost = dp[group-1,j] + var_matrix[j+1,i]
                if cost < dp[group,i]:
                    dp[group,i] = cost
                    track[group,i] = j
    
    # 回溯分组界点
    borders = []
    idx = n-1
    for group in reversed(range(1, k)):
        j = track[group, idx]
        borders.append(BMI[j])
        idx = j
    borders = sorted(borders)
    return borders

# 假设 k=5
k = 5
bmi_cutoffs = dp_bmi_group(df, k)
print("BMI分组界点:", bmi_cutoffs)

# -------------------------------
# 4. 计算每组最佳检测时点
# -------------------------------
df['BMI_group'] = pd.cut(df['孕妇BMI'], bins=[0]+bmi_cutoffs+[np.inf], right=False)

def best_GA_for_group(group_df, Y_target=0.04, coverage=0.95):
    # 模拟随机效应 bi ~ N(0, σb^2)
    # 简化假设 σb = 0.01
    sigma_b = 0.01
    n_sim = 1000
    GA_samples = []
    for _, row in group_df.iterrows():
        bmi = row['孕妇BMI']
        sims = calc_GAmin(Y_target, bmi) + np.random.normal(0, sigma_b, n_sim)
        # 取使得达标概率 >= coverage 的最小GA
        GA = np.percentile(sims, (1-coverage)*100)
        GA_samples.append(GA)
    return np.mean(GA_samples), np.percentile(GA_samples, 2.5), np.percentile(GA_samples, 97.5)

result_list = []
for name, group in df.groupby('BMI_group'):
    GA_mean, GA_low, GA_high = best_GA_for_group(group)
    result_list.append({
        'BMI_group': str(name),
        'sample_size': len(group),
        'best_GA': GA_mean,
        'CI_low': GA_low,
        'CI_high': GA_high
    })

result_df = pd.DataFrame(result_list)
print(result_df)

# -------------------------------
# 5. 保存结果
# -------------------------------
result_df.to_excel('NIPT_BMI_group_result.xlsx', index=False)


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values