# 用配方限制的方法隨機生成配方
### PBTc 配方限制
![!image](/workspaces/BO_EXPERIMENTS/src/datasets/PBTc_formula.png)

In [1]:
import os
import pandas as pd
import numpy as np
import joblib
import random
from collections import defaultdict

In [2]:
# 對各個配方系列按照設定隨機生成數值
def generate_key_data(config, target_sum=100):
    """Randomly draw one value per key so that all values add up to ``target_sum``.

    Values are handled internally as integers in units of 0.1 (everything is
    multiplied by 10 and truncated with ``int``), so every returned value has
    at most one decimal place.

    Args:
        config: Mapping ``key -> {'min': number, 'max': number}`` giving the
            allowed range of each key. A minimum below 0.1 is raised to 0.1,
            so every key receives a strictly positive value. If the adjusted
            minimum of a key is above its maximum, the key simply stays at
            that minimum.
        target_sum: Required total of the returned values.

    Returns:
        dict mapping every key of ``config`` to a float; the values sum to
        ``target_sum`` (up to floating-point representation).

    Raises:
        ValueError: If the minimums alone already exceed ``target_sum``, or
            if the maximums do not leave enough room to reach ``target_sum``
            (this includes an empty ``config`` with a positive target).
    """
    multiplier = 10  # work in tenths so that one decimal place is exact
    target_int = int(target_sum * multiplier)

    # Scale the bounds; force every minimum to at least 0.1 (1 after scaling).
    adj_config = {
        k: {'min': max(1, int(v['min'] * multiplier)),
            'max': int(v['max'] * multiplier)}
        for k, v in config.items()
    }

    # Start every key at its minimum and distribute what is left.
    res_int = {k: v['min'] for k, v in adj_config.items()}
    remaining = target_int - sum(res_int.values())

    if remaining < 0:
        raise ValueError(f"設定的最小值總和已超過目標 {target_sum}")

    # Without this check the loop below would spin forever: once no key has
    # room left, `remaining` can never decrease.
    capacity = sum(max(0, v['max'] - v['min']) for v in adj_config.values())
    if remaining > capacity:
        raise ValueError(f"設定的最大值總和不足以達到目標 {target_sum}")

    keys = list(adj_config.keys())
    while remaining > 0:
        # Pick any key; keys that are already at their maximum are skipped.
        k = random.choice(keys)
        space = adj_config[k]['max'] - res_int[k]
        if space > 0:
            add_val = random.randint(1, min(remaining, space))
            res_int[k] += add_val
            remaining -= add_val

    # Convert back from tenths to floats, keyed like `config`.
    return {k: v / multiplier for k, v in res_int.items()}

# 將 Key 值分配給子欄位 (總和等於 Key 值, 允許為 0) ---
def generate_sub_col_data(key_data, cols):
    """Split each key's total across the columns that share its prefix.

    Columns are grouped by their first two characters. For every key in
    ``key_data`` that has at least one matching column, the key's value is
    divided at random (in steps of 0.1) among those columns; individual
    columns may receive 0. Keys without any matching column are skipped.

    Args:
        key_data: Mapping ``prefix -> total value`` for that prefix.
        cols: Iterable of column names; the first two characters of each
            name select the key it belongs to.

    Returns:
        dict mapping column name -> assigned value. Within one prefix the
        values add up to that prefix's total.
    """
    # Group the column names by their two-character prefix.
    groups = defaultdict(list)
    for name in cols:
        groups[name[:2]].append(name)

    allocation = {}

    for prefix, total_val in key_data.items():
        members = groups.get(prefix, [])
        if not members:
            continue

        # Work in tenths (x10) so the split stays exact to one decimal.
        total_int = int(round(total_val * 10))

        if len(members) == 1:
            # A lone column takes the whole amount unchanged.
            allocation[members[0]] = total_val
            continue

        # Random cut points between 0 and the total; the gaps between
        # consecutive cut points become the individual shares.
        inner = sorted(random.randint(0, total_int) for _ in members[1:])
        bounds = [0, *inner, total_int]
        for name, lower, upper in zip(members, bounds, bounds[1:]):
            allocation[name] = (upper - lower) / 10

    return allocation

# Allowed (min, max) range for each two-letter formula class. The entries are
# looked up by class prefix when the per-class configuration is built.
rd_config = {
    key: {'min': lower, 'max': upper}
    for key, lower, upper in (
        ('AA', 0, 1),
        ('AW', 0, 1),
        ('AX', 0, 10),
        ('CM', 0, 5),
        ('CP', 0, 1),
        ('FR', 0, 20),
        ('GF', 0, 51),
        ('MF', 0, 20),
        ('PR', 30, 90),
        ('SS', 0, 10),
    )
}

# Total that the values of one generated formula must add up to.
max_sum = 100

In [None]:
# Location of the CSV dataset
csv_path = '/workspaces/BO_EXPERIMENTS/src/datasets/LIMS_automl_20260105_105926_MV260final_bound_fullMerged_median_clean_10_sum100_no_rare_GF_20most_only.csv'

# Location of the saved LASSO model
model_info_path = '/workspaces/BO_EXPERIMENTS/src/results/20260204/PBTc/model/lasso.pkl'

# Where the generated data is stored; the output directory is created up
# front if it does not exist yet
generate_data_dir = '/workspaces/BO_EXPERIMENTS/src/datasets/mt_lasso_dataset/interactive_term/'
generate_data_path = os.path.join(generate_data_dir, 'data.pkl')
os.makedirs(generate_data_dir, exist_ok=True)

In [4]:
# Load the ground-truth dataset; its column names are what the random
# generation is based on.
csv_path = '/workspaces/BO_EXPERIMENTS/src/datasets/LIMS_automl_20260105_105926_MV260final_bound_fullMerged_median_clean_10_sum100_no_rare_GF_20most_only.csv'

# Drop rows with missing values and renumber the index from 0.
gt_data = pd.read_csv(csv_path).dropna().reset_index(drop=True)

# Property columns, and the subset of them used as targets.
prop_cols = ['MI', 'MV', 'SPGR', 'ASH', 'TS', 'TE', 'TM', 'FS', 'FM', 'IS']
target_cols = ['SPGR', 'TE']
prop_data = gt_data[prop_cols]
target_data = prop_data[target_cols]

# Every column that is not a property column is a formula column.
formula_cols = sorted(set(gt_data.columns) - set(prop_cols))

# Formula classes: the distinct two-character prefixes of the formula columns.
formula_class_ls = sorted({col[:2] for col in formula_cols})

# Range settings for exactly the classes present in the dataset.
formula_cfg = {cls_key: rd_config[cls_key] for cls_key in formula_class_ls}

In [5]:
# Load model_info from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
model_info = joblib.load(model_info_path)

In [6]:
# Try out the per-class value generation and print one line per class.
key_res = generate_key_data(formula_cfg, target_sum=max_sum)

for class_key, class_val in key_res.items():
    print(f"{class_key:<5} | {class_val:>8.1f}")

data_sum = sum(key_res.values())
print('sum', data_sum)

# Try out splitting those class values across the individual formula columns.
col_res = generate_sub_col_data(key_res, formula_cols)
print(col_res)
print('sum', sum(col_res.values()))

AA    |      1.0
AW    |      0.8
AX    |      0.1
CM    |      0.2
GF    |     20.1
MF    |      9.8
PR    |     64.4
SS    |      3.6
sum 100.0
{'AA004': 0.8, 'AA006': 0.2, 'AW001': 0.4, 'AW003': 0.2, 'AW004': 0.2, 'AW005': 0.0, 'AX020': 0.1, 'CM1002': 0.0, 'CM1007': 0.2, 'GF014': 2.1, 'GF016': 17.2, 'GF020': 0.8, 'MF001': 9.8, 'PR007': 17.2, 'PR009': 5.9, 'PR020': 5.4, 'PR022': 7.2, 'PR024': 28.7, 'SS004': 2.9, 'SS010': 0.7}
sum 100.0


In [7]:
# Generate 1000 random formulas: draw the class totals first, then split each
# total across that class's columns. One row per formula.
formula_samples = pd.DataFrame([
    generate_sub_col_data(
        generate_key_data(formula_cfg, target_sum=max_sum),
        formula_cols,
    )
    for _ in range(1000)
])

In [8]:
# Predict the target properties of the generated formulas with the loaded
# pipeline and label the result columns with the target names.
target_samples = pd.DataFrame(
    model_info['model_info']['PIPE'].predict(formula_samples),
    columns=target_cols,
)

In [9]:
# Summary statistics of the real target data
print(target_data.describe().round(3))
# Summary statistics of the targets predicted for the generated formulas
print(target_samples.describe().round(3))

          SPGR       TE
count  127.000  127.000
mean     1.491    2.804
std      0.085    0.470
min      1.352    1.364
25%      1.443    2.547
50%      1.454    2.751
75%      1.542    3.042
max      1.748    4.024
           SPGR        TE
count  1000.000  1000.000
mean      1.353     3.944
std       0.108     0.999
min       1.001     1.124
25%       1.291     3.249
50%       1.347     3.761
75%       1.410     4.560
max       1.763     7.842


In [None]:
# Save the generated data.
# 'initial_data' holds the generated formulas (X) and their predicted targets
# (Y) as plain arrays; 'oracle_model' stores the loaded model info alongside.
save_data = {
    'initial_data': {
        'X': np.array(formula_samples),
        'Y': np.array(target_samples),
    },
    # Disabled entry: x_star / y_star are not defined in the visible code.
    # 'ground_truth': {
    #     'X': x_star,
    #     'Y': y_star
    # },
    'oracle_model': model_info
}

# Write the bundle to generate_data_path.
joblib.dump(save_data, generate_data_path)