删掉负值，随机系数

In [None]:
import numpy as np  
import pandas as pd  
import time

def generate_data(n_samples, p, seed=1):
    """Generate a synthetic treatment/outcome dataset with a known effect.

    Args:
        n_samples: Number of rows to generate.
        p: Number of covariates; they are stored in columns ``X1`` .. ``X{p}``.
        seed: Seed passed to ``np.random.seed``. Defaults to 1, the value that
            was previously hard-coded, so two-argument calls yield the same
            data as before.

    Returns:
        A ``pandas.DataFrame`` holding the covariate columns followed by:

        * ``t``  - binary treatment indicator drawn with probability ``e``.
        * ``TE`` - per-row treatment effect (rounded to 2 decimals).
        * ``y``  - outcome, shifted by ``|min(y)|`` so no value is negative.
        * ``e``  - treatment probability (sigmoid of the noisy log-odds).
        * ``wt`` - ``t/e + (1-t)/(1-e)``, rounded to 2 decimals.
        * ``v``  - ``wt * y``, rounded to 2 decimals.
    """
    # NOTE: this seeds NumPy's *global* generator; the order of the random
    # draws below is part of the data definition and must not be rearranged.
    np.random.seed(seed)

    covariate_names = [f'X{i+1}' for i in range(p)]

    # Covariates: standard normal, rounded to two decimals.
    X = np.around(np.random.normal(0, 1, size=(n_samples, p)), 2)

    # Treatment assignment: logistic model with random non-negative
    # coefficients plus uniform noise on the log-odds.
    coefs_T = np.around(np.random.uniform(0, 0.5, size=p), 2)
    log_odds = np.dot(X, coefs_T) + np.random.uniform(-1, 1, size=n_samples)
    T_sigmoid = 1 / (1 + np.exp(-log_odds))
    # One Bernoulli draw per row; the loop variable is deliberately not
    # called ``p`` to avoid confusion with the covariate count.
    T = np.array([np.random.binomial(1, prob) for prob in T_sigmoid])

    # Treatment effect: only the positive part of each covariate contributes.
    coefs_TE = np.around(np.random.uniform(0, 2, size=p), 2)
    TE = np.dot(np.maximum(X, 0), coefs_TE) + np.random.uniform(-1, 1, size=n_samples)
    TE = np.around(TE, 2)

    # Outcome: effect applied to treated rows + linear covariate term + noise.
    coefs_Y = np.random.uniform(0, 1, size=p)
    Y = TE * T + np.dot(X, coefs_Y) + np.random.uniform(-1, 1, size=n_samples)
    Y = np.around(Y, 2)
    # Shift so that the smallest outcome is non-negative.
    Y = Y + np.abs(np.min(Y))

    e = T_sigmoid
    wt = np.around(T / e + (1 - T) / (1 - e), 2)

    # Build the frame with every covariate column (X1 included) as float.
    df = pd.DataFrame(X, columns=covariate_names, dtype=float)
    df['t'] = T
    df['TE'] = TE
    df['y'] = Y
    df['e'] = e
    df['wt'] = wt
    df['v'] = np.around(wt * Y, 2)

    return df

# 固定一个参数测试

In [None]:
# Dataset sizes for the sample-size sweep: 1000, 2000, ..., 25000.
n_samples_list = list(range(1000, 25001, 1000))
# Covariate counts for the dimensionality sweep: 10, 15, ..., 150.
p_list = list(range(10, 151, 5))

# Column-oriented accumulators for the measurements of each sweep.
results_n_samples = {key: [] for key in ("n_samples", "time")}
results_p = {key: [] for key in ("p", "time")}

In [None]:
import baseline

import os

# Make sure the output directory exists *before* the (long) benchmark runs:
# DataFrame.to_csv does not create missing parent directories, so without
# this the results could be lost at the very end.
os.makedirs("result", exist_ok=True)

# Sweep 1: keep p fixed and vary n_samples.
fixed_p = 20  # number of covariates held constant in this sweep
for n_samples in n_samples_list:
    df = generate_data(n_samples, fixed_p)
    df_ori = df.copy()

    covariate_columns = [f'X{i+1}' for i in range(fixed_p)]

    result = baseline.pymoo_opt(df_ori, covariate_columns, "t", 'y', cov_ratio = 0.03, length_limit = 4)
    # The optimizer reports its own run time under the "time" key
    # (presumably seconds - confirm against baseline.pymoo_opt).
    elapsed_time = result["time"]

    results_n_samples["n_samples"].append(n_samples)
    results_n_samples["time"].append(elapsed_time)

# Persist sweep 1 immediately so a failure in sweep 2 cannot discard it.
results_df_n_samples = pd.DataFrame(results_n_samples)
results_df_n_samples.to_csv("result/scalability_test_results_n_samples.csv", index=False)

# Sweep 2: keep n_samples fixed and vary p.
fixed_n_samples = 2000  # number of rows held constant in this sweep
for p in p_list:
    df = generate_data(fixed_n_samples, p)
    df_ori = df.copy()

    covariate_columns = [f'X{i+1}' for i in range(p)]

    result = baseline.pymoo_opt(df_ori, covariate_columns, "t", 'y', cov_ratio = 0.03, length_limit = 4)
    elapsed_time = result["time"]

    results_p["p"].append(p)
    results_p["time"].append(elapsed_time)

# Persist sweep 2.
results_df_p = pd.DataFrame(results_p)
results_df_p.to_csv("result/scalability_test_results_p.csv", index=False)

In [None]:
import matplotlib.pyplot as plt  
  
# Load the measurements back from the CSV files.
results_df_n_samples = pd.read_csv("result/scalability_test_results_n_samples.csv")
results_df_p = pd.read_csv("result/scalability_test_results_p.csv")

# Two stacked panels: sample-size sweep on top, covariate sweep below.
fig, ax = plt.subplots(2, 1, figsize=(10, 10))
samples_axis, covariates_axis = ax

# Panel 1: training time vs. dataset size.
samples_axis.plot(
    results_df_n_samples["n_samples"],
    results_df_n_samples["time"],
    marker='o',
)
samples_axis.set(
    xlabel='Number of Samples',
    ylabel='Training Time',
    title=f'Training Time vs Number of Samples (p={20})',
)

# Panel 2: training time vs. number of covariates.
covariates_axis.plot(
    results_df_p["p"],
    results_df_p["time"],
    marker='o',
)
covariates_axis.set(
    xlabel='Number of Covariates',
    ylabel='Training Time',
    title=f'Training Time vs Number of Covariates (n_samples={2000})',
)

plt.tight_layout()

# Write the figure to disk.
plt.savefig("scalability.svg")

# 多次测试

In [None]:
import numpy as np  
import pandas as pd  
import time

def generate_data(n_samples, p, seed=1):
    """Generate a synthetic treatment/outcome dataset for a given seed.

    Args:
        n_samples: Number of rows to generate.
        p: Number of covariates; they are stored in columns ``X1`` .. ``X{p}``.
        seed: Seed passed to ``np.random.seed``. Defaults to 1 so the
            function can also be called with just two arguments.

    Returns:
        A ``pandas.DataFrame`` holding the covariate columns followed by:

        * ``t``  - binary treatment indicator drawn with probability ``e``.
        * ``TE`` - per-row treatment effect (rounded to 2 decimals).
        * ``y``  - outcome, shifted by ``|min(y)|`` so no value is negative.
        * ``e``  - treatment probability (sigmoid of the noisy log-odds).
        * ``wt`` - ``t/e + (1-t)/(1-e)``, rounded to 2 decimals.
        * ``v``  - ``wt * y``, rounded to 2 decimals.
    """
    # NOTE: this seeds NumPy's *global* generator; the order of the random
    # draws below determines the data and must not be rearranged.
    np.random.seed(seed)

    covariate_names = [f'X{i+1}' for i in range(p)]

    # Covariates: standard normal, rounded to two decimals.
    X = np.around(np.random.normal(0, 1, size=(n_samples, p)), 2)

    # Treatment assignment: logistic model with random non-negative
    # coefficients plus uniform noise on the log-odds.
    coefs_T = np.around(np.random.uniform(0, 0.5, size=p), 2)
    log_odds = np.dot(X, coefs_T) + np.random.uniform(-1, 1, size=n_samples)
    T_sigmoid = 1 / (1 + np.exp(-log_odds))
    # One Bernoulli draw per row; the loop variable is deliberately not
    # called ``p`` to avoid confusion with the covariate count.
    T = np.array([np.random.binomial(1, prob) for prob in T_sigmoid])

    # Treatment effect: only the positive part of each covariate contributes.
    coefs_TE = np.around(np.random.uniform(0, 2, size=p), 2)
    TE = np.dot(np.maximum(X, 0), coefs_TE) + np.random.uniform(-1, 1, size=n_samples)
    TE = np.around(TE, 2)

    # Outcome: effect applied to treated rows + linear covariate term + noise.
    coefs_Y = np.random.uniform(0, 1, size=p)
    Y = TE * T + np.dot(X, coefs_Y) + np.random.uniform(-1, 1, size=n_samples)
    Y = np.around(Y, 2)
    # Shift so that the smallest outcome is non-negative.
    Y = Y + np.abs(np.min(Y))

    e = T_sigmoid
    wt = np.around(T / e + (1 - T) / (1 - e), 2)

    # Build the frame with every covariate column (X1 included) as float.
    df = pd.DataFrame(X, columns=covariate_names, dtype=float)
    df['t'] = T
    df['TE'] = TE
    df['y'] = Y
    df['e'] = e
    df['wt'] = wt
    df['v'] = np.around(wt * Y, 2)

    return df

# Small demo grids for the repeated-run experiment.
n_samples_list = list(range(1000, 3001, 1000))
p_list = list(range(10, 15, 2))

# Column-oriented accumulators: swept value, mean time, standard deviation
# and the raw per-run timings.
results_n_samples = {key: [] for key in ("n_samples", "time", "std", "all_times")}
results_p = {key: [] for key in ("p", "time", "std", "all_times")}

import sys
sys.path.append("../")
 

# Number of repetitions per grid point; mean and standard deviation of the
# run time are computed over these runs.
n_repeats = 5

# Sweep 1: keep p fixed and vary n_samples.
fixed_p = 20  # number of covariates held constant in this sweep
for n_samples in n_samples_list:
    times = []  # run time of every repetition at this grid point
    for i in range(n_repeats):
        # Use the repetition index as the seed so each run sees a different
        # (but reproducible) dataset; generate_data requires a seed here.
        df = generate_data(n_samples, fixed_p, i)
        df_ori = df.copy()

        covariate_columns = [f'X{j+1}' for j in range(fixed_p)]

        result = baseline.pymoo_opt(df_ori, covariate_columns, "t", 'y', cov_ratio = 0.03, length_limit = 4)
        # The optimizer reports its own run time under the "time" key.
        elapsed_time = result["time"]
        times.append(elapsed_time)

    avg_time = np.mean(times)  # mean run time over the repetitions
    std_time = np.std(times)  # population standard deviation (ddof=0)
    results_n_samples["n_samples"].append(n_samples)
    results_n_samples["time"].append(avg_time)
    results_n_samples["std"].append(std_time)
    results_n_samples["all_times"].append(times)

# Sweep 2: keep n_samples fixed and vary p.
fixed_n_samples = 2000  # number of rows held constant in this sweep
for p in p_list:
    times = []  # run time of every repetition at this grid point
    for i in range(n_repeats):
        # This sweep must use the fixed sample count and the *current* p,
        # both for the generated data and for the covariate column list.
        df = generate_data(fixed_n_samples, p, i)
        df_ori = df.copy()

        covariate_columns = [f'X{j+1}' for j in range(p)]

        result = baseline.pymoo_opt(df_ori, covariate_columns, "t", 'y', cov_ratio = 0.03, length_limit = 4)
        elapsed_time = result["time"]
        times.append(elapsed_time)

    avg_time = np.mean(times)  # mean run time over the repetitions
    std_time = np.std(times)  # population standard deviation (ddof=0)
    results_p["p"].append(p)
    results_p["time"].append(avg_time)
    results_p["std"].append(std_time)
    results_p["all_times"].append(times)

# Convert the accumulators to DataFrames and save them as CSV files.
results_df_n_samples = pd.DataFrame(results_n_samples)
results_df_n_samples.to_csv("scalability_test_results_n_samples_demo.csv", index=False)

results_df_p = pd.DataFrame(results_p)
results_df_p.to_csv("scalability_test_results_p_demo.csv", index=False)

In [None]:
# Two stacked panels: sample-size sweep on top, covariate sweep below.
fig, ax = plt.subplots(2, 1, figsize=(10, 10))
samples_axis, covariates_axis = ax

# Panel 1: mean training time vs. dataset size, error bars show the std column.
samples_axis.errorbar(
    results_df_n_samples["n_samples"],
    results_df_n_samples["time"],
    yerr=results_df_n_samples["std"],
    fmt='-o',
    capsize=5,
)
samples_axis.set(
    xlabel='Number of Samples',
    ylabel='Training Time',
    title=f'Training Time vs Number of Samples (p={fixed_p})',
)

# Panel 2: mean training time vs. number of covariates, with the same error bars.
covariates_axis.errorbar(
    results_df_p["p"],
    results_df_p["time"],
    yerr=results_df_p["std"],
    fmt='-o',
    capsize=5,
)
covariates_axis.set(
    xlabel='Number of Covariates',
    ylabel='Training Time',
    title=f'Training Time vs Number of Covariates (n_samples={fixed_n_samples})',
)

plt.tight_layout()

# Write the figure to disk.
plt.savefig("scalability_with_std.svg")