In [1]:
df['检测孕周'] = df['检测孕周'].fillna(df['检测孕周'].mean())


NameError: name 'df' is not defined

In [2]:
import pandas as pd

# 读取 male.xlsx
df = pd.read_excel("male.xlsx")

# 填充检测孕周中的 NaN
df['检测孕周'] = df['检测孕周'].fillna(df['检测孕周'].mean())

# 查看缺失值情况
print(df[['检测孕周', '孕妇BMI', 'Y染色体浓度']].isna().sum())


检测孕周      0
孕妇BMI     0
Y染色体浓度    0
dtype: int64


In [3]:
import pandas as pd
import numpy as np
from scipy.optimize import root_scalar
from scipy.stats import norm

# -------------------------------
# 1. 读取数据
# -------------------------------
df = pd.read_excel('male.xlsx')

# 确认列名一致
# df.columns

# -------------------------------
# 2. 计算Y染色体达标时间
# 模型: Yij = f(GAij, BMIi, β) + bi + ϵij
# 我们用线性近似 f(GA,BMI)=a*GA+b*BMI+c
# -------------------------------
# 线性近似拟合 Y ~ GA + BMI
from sklearn.linear_model import LinearRegression

X = df[['检测孕周', '孕妇BMI']]
y = df['Y染色体浓度']

model = LinearRegression()
model.fit(X, y)

a, b = model.coef_
c = model.intercept_

# 定义个体达标函数
def calc_GAmin(Y_target, BMIi):
    # Y = a*GA + b*BMI + c => GA = (Y - b*BMI - c)/a
    return (Y_target - b*BMIi - c)/a

df['GA_min'] = df['孕妇BMI'].apply(lambda bmi: calc_GAmin(0.04, bmi))
# 对未达标设定右截断 25 周
df['GA_min'] = df['GA_min'].apply(lambda x: x if x <= 25 else 25)

# -------------------------------
# 3. BMI分组动态规划优化
# -------------------------------
def dp_bmi_group(df, k):
    # 排序BMI
    df_sorted = df.sort_values('孕妇BMI').reset_index(drop=True)
    BMI = df_sorted['孕妇BMI'].values
    GAmin = df_sorted['GA_min'].values
    n = len(BMI)
    
    # 预计算区间方差
    var_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            var_matrix[i,j] = np.var(GAmin[i:j+1])
    
    # DP初始化
    dp = np.full((k, n), np.inf)
    track = np.zeros((k, n), dtype=int)
    
    for i in range(n):
        dp[0,i] = var_matrix[0,i]
    
    for group in range(1, k):
        for i in range(group, n):
            for j in range(group-1, i):
                cost = dp[group-1,j] + var_matrix[j+1,i]
                if cost < dp[group,i]:
                    dp[group,i] = cost
                    track[group,i] = j
    
    # 回溯分组界点
    borders = []
    idx = n-1
    for group in reversed(range(1, k)):
        j = track[group, idx]
        borders.append(BMI[j])
        idx = j
    borders = sorted(borders)
    return borders

# 假设 k=5
k = 5
bmi_cutoffs = dp_bmi_group(df, k)
print("BMI分组界点:", bmi_cutoffs)

# -------------------------------
# 4. 计算每组最佳检测时点
# -------------------------------
df['BMI_group'] = pd.cut(df['孕妇BMI'], bins=[0]+bmi_cutoffs+[np.inf], right=False)

def best_GA_for_group(group_df, Y_target=0.04, coverage=0.95):
    # 模拟随机效应 bi ~ N(0, σb^2)
    # 简化假设 σb = 0.01
    sigma_b = 0.01
    n_sim = 1000
    GA_samples = []
    for _, row in group_df.iterrows():
        bmi = row['孕妇BMI']
        sims = calc_GAmin(Y_target, bmi) + np.random.normal(0, sigma_b, n_sim)
        # 取使得达标概率 >= coverage 的最小GA
        GA = np.percentile(sims, (1-coverage)*100)
        GA_samples.append(GA)
    return np.mean(GA_samples), np.percentile(GA_samples, 2.5), np.percentile(GA_samples, 97.5)

result_list = []
for name, group in df.groupby('BMI_group'):
    GA_mean, GA_low, GA_high = best_GA_for_group(group)
    result_list.append({
        'BMI_group': str(name),
        'sample_size': len(group),
        'best_GA': GA_mean,
        'CI_low': GA_low,
        'CI_high': GA_high
    })

result_df = pd.DataFrame(result_list)
print(result_df)

# -------------------------------
# 5. 保存结果
# -------------------------------
result_df.to_excel('NIPT_BMI_group_result.xlsx', index=False)


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [3]:
import pandas as pd
import numpy as np
from scipy.optimize import root_scalar
from scipy.stats import norm

# -------------------------------
# 1. 读取数据
# -------------------------------
df = pd.read_excel('male.xlsx')

# 确认列名一致
# df.columns

# -------------------------------
# 2. 计算Y染色体达标时间
# 模型: Yij = f(GAij, BMIi, β) + bi + ϵij
# 我们用线性近似 f(GA,BMI)=a*GA+b*BMI+c
# -------------------------------
# 线性近似拟合 Y ~ GA + BMI
from sklearn.linear_model import LinearRegression

X = df[['检测孕周', '孕妇BMI']]
y = df['Y染色体浓度']

model = LinearRegression()
model.fit(X, y)

a, b = model.coef_
c = model.intercept_

# 定义个体达标函数
def calc_GAmin(Y_target, BMIi):
    # Y = a*GA + b*BMI + c => GA = (Y - b*BMI - c)/a
    return (Y_target - b*BMIi - c)/a

df['GA_min'] = df['孕妇BMI'].apply(lambda bmi: calc_GAmin(0.04, bmi))
# 对未达标设定右截断 25 周
df['GA_min'] = df['GA_min'].apply(lambda x: x if x <= 25 else 25)

# -------------------------------
# 3. BMI分组动态规划优化
# -------------------------------
def dp_bmi_group(df, k):
    # 排序BMI
    df_sorted = df.sort_values('孕妇BMI').reset_index(drop=True)
    BMI = df_sorted['孕妇BMI'].values
    GAmin = df_sorted['GA_min'].values
    n = len(BMI)
    
    # 预计算区间方差
    var_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            var_matrix[i,j] = np.var(GAmin[i:j+1])
    
    # DP初始化
    dp = np.full((k, n), np.inf)
    track = np.zeros((k, n), dtype=int)
    
    for i in range(n):
        dp[0,i] = var_matrix[0,i]
    
    for group in range(1, k):
        for i in range(group, n):
            for j in range(group-1, i):
                cost = dp[group-1,j] + var_matrix[j+1,i]
                if cost < dp[group,i]:
                    dp[group,i] = cost
                    track[group,i] = j
    
    # 回溯分组界点
    borders = []
    idx = n-1
    for group in reversed(range(1, k)):
        j = track[group, idx]
        borders.append(BMI[j])
        idx = j
    borders = sorted(borders)
    return borders

# 假设 k=5
k = 5
bmi_cutoffs = dp_bmi_group(df, k)
print("BMI分组界点:", bmi_cutoffs)

# -------------------------------
# 4. 计算每组最佳检测时点
# -------------------------------
df['BMI_group'] = pd.cut(df['孕妇BMI'], bins=[0]+bmi_cutoffs+[np.inf], right=False)

def best_GA_for_group(group_df, Y_target=0.04, coverage=0.95):
    # 模拟随机效应 bi ~ N(0, σb^2)
    # 简化假设 σb = 0.01
    sigma_b = 0.01
    n_sim = 1000
    GA_samples = []
    for _, row in group_df.iterrows():
        bmi = row['孕妇BMI']
        sims = calc_GAmin(Y_target, bmi) + np.random.normal(0, sigma_b, n_sim)
        # 取使得达标概率 >= coverage 的最小GA
        GA = np.percentile(sims, (1-coverage)*100)
        GA_samples.append(GA)
    return np.mean(GA_samples), np.percentile(GA_samples, 2.5), np.percentile(GA_samples, 97.5)

result_list = []
for name, group in df.groupby('BMI_group'):
    GA_mean, GA_low, GA_high = best_GA_for_group(group)
    result_list.append({
        'BMI_group': str(name),
        'sample_size': len(group),
        'best_GA': GA_mean,
        'CI_low': GA_low,
        'CI_high': GA_high
    })

result_df = pd.DataFrame(result_list)
print(result_df)

# -------------------------------
# 5. 保存结果
# -------------------------------
result_df.to_excel('NIPT_BMI_group_result.xlsx', index=False)


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# 读取数据
df = pd.read_excel("male.xlsx")

# 查看关键列缺失情况
print(df[['检测孕周','孕妇BMI','Y染色体浓度']].isna().sum())

# 删除任意含 NaN 的行
df = df.dropna(subset=['检测孕周','孕妇BMI','Y染色体浓度'])

# 再次检查
print(df[['检测孕周','孕妇BMI','Y染色体浓度']].isna().sum())

# 构建X和y
X = df[['检测孕周','孕妇BMI']]
y = df['Y染色体浓度']

# 检查X中是否还有NaN
print(X.isna().sum())

# 回归拟合
model = LinearRegression()
model.fit(X, y)
a, b = model.coef_
c = model.intercept_
print("回归系数:", a, b, "截距:", c)


检测孕周      1
孕妇BMI     0
Y染色体浓度    0
dtype: int64
检测孕周      0
孕妇BMI     0
Y染色体浓度    0
dtype: int64
检测孕周     0
孕妇BMI    0
dtype: int64
回归系数: 0.0012542235529283952 -0.0019604060947279794 截距: 0.11937564854825122


In [None]:
检测孕周      1
孕妇BMI     0
Y染色体浓度    0
dtype: int64
检测孕周      0
孕妇BMI     0
Y染色体浓度    0
dtype: int64
检测孕周     0
孕妇BMI    0
dtype: int64
回归系数: 0.0012542235529283952 -0.0019604060947279794 截距: 0.11937564854825122


In [5]:
import pandas as pd
import numpy as np
from scipy.optimize import root_scalar
from scipy.stats import norm

# -------------------------------
# 1. 读取数据
# -------------------------------
df = pd.read_excel('male.xlsx')

# 确认列名一致
# df.columns

# -------------------------------
# 2. 计算Y染色体达标时间
# 模型: Yij = f(GAij, BMIi, β) + bi + ϵij
# 我们用线性近似 f(GA,BMI)=a*GA+b*BMI+c
# -------------------------------
# 线性近似拟合 Y ~ GA + BMI
from sklearn.linear_model import LinearRegression

X = df[['检测孕周', '孕妇BMI']]
y = df['Y染色体浓度']

model = LinearRegression()
model.fit(X, y)

a, b = model.coef_
c = model.intercept_

# 定义个体达标函数
def calc_GAmin(Y_target, BMIi):
    # Y = a*GA + b*BMI + c => GA = (Y - b*BMI - c)/a
    return (Y_target - b*BMIi - c)/a

df['GA_min'] = df['孕妇BMI'].apply(lambda bmi: calc_GAmin(0.04, bmi))
# 对未达标设定右截断 25 周
df['GA_min'] = df['GA_min'].apply(lambda x: x if x <= 25 else 25)

# -------------------------------
# 3. BMI分组动态规划优化
# -------------------------------
def dp_bmi_group(df, k):
    # 排序BMI
    df_sorted = df.sort_values('孕妇BMI').reset_index(drop=True)
    BMI = df_sorted['孕妇BMI'].values
    GAmin = df_sorted['GA_min'].values
    n = len(BMI)
    
    # 预计算区间方差
    var_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            var_matrix[i,j] = np.var(GAmin[i:j+1])
    
    # DP初始化
    dp = np.full((k, n), np.inf)
    track = np.zeros((k, n), dtype=int)
    
    for i in range(n):
        dp[0,i] = var_matrix[0,i]
    
    for group in range(1, k):
        for i in range(group, n):
            for j in range(group-1, i):
                cost = dp[group-1,j] + var_matrix[j+1,i]
                if cost < dp[group,i]:
                    dp[group,i] = cost
                    track[group,i] = j
    
    # 回溯分组界点
    borders = []
    idx = n-1
    for group in reversed(range(1, k)):
        j = track[group, idx]
        borders.append(BMI[j])
        idx = j
    borders = sorted(borders)
    return borders

# 假设 k=5
k = 5
bmi_cutoffs = dp_bmi_group(df, k)
print("BMI分组界点:", bmi_cutoffs)

# -------------------------------
# 4. 计算每组最佳检测时点
# -------------------------------
df['BMI_group'] = pd.cut(df['孕妇BMI'], bins=[0]+bmi_cutoffs+[np.inf], right=False)

def best_GA_for_group(group_df, Y_target=0.04, coverage=0.95):
    # 模拟随机效应 bi ~ N(0, σb^2)
    # 简化假设 σb = 0.01
    sigma_b = 0.01
    n_sim = 1000
    GA_samples = []
    for _, row in group_df.iterrows():
        bmi = row['孕妇BMI']
        sims = calc_GAmin(Y_target, bmi) + np.random.normal(0, sigma_b, n_sim)
        # 取使得达标概率 >= coverage 的最小GA
        GA = np.percentile(sims, (1-coverage)*100)
        GA_samples.append(GA)
    return np.mean(GA_samples), np.percentile(GA_samples, 2.5), np.percentile(GA_samples, 97.5)

result_list = []
for name, group in df.groupby('BMI_group'):
    GA_mean, GA_low, GA_high = best_GA_for_group(group)
    result_list.append({
        'BMI_group': str(name),
        'sample_size': len(group),
        'best_GA': GA_mean,
        'CI_low': GA_low,
        'CI_high': GA_high
    })

result_df = pd.DataFrame(result_list)
print(result_df)

# -------------------------------
# 5. 保存结果
# -------------------------------
result_df.to_excel('NIPT_BMI_group_result.xlsx', index=False)


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values