In [15]:
import sys
import os
import torch
import matplotlib.pyplot as plt
import math
import numpy as np
import pandas as pd

import xgboost as xgb
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split, GridSearchCV
from pprint import pprint
from typing import Optional
from contextlib import redirect_stdout
import joblib

In [2]:
# 讀取資料
datapath = '/workspaces/BO_EXPERIMENTS/src/datasets/LIMS_automl_20260105_105926_MV260final_bound_fullMerged_median_clean_10_sum100_no_rare_GF_20most_only.csv'
data = pd.read_csv(datapath)
data.dropna(inplace=True)
data.reset_index(inplace=True, drop=True)

# 物性欄位
prop_cols = ['MI', 'MV', 'SPGR', 'ASH', 'TS', 'TE', 'TM', 'FS', 'FM', 'IS']
prop_data = data[prop_cols]

# 監控欄位
X = data.drop(columns=prop_cols)

# 需要被建模的物性欄位
target_cols = ['SPGR', 'TE' ]
Y = data[target_cols]

# 區分訓練與測試集
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [3]:
# 先用 CV 找出最佳參數
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'n_estimators': [50, 60, 70, 80, 90, 100]
}
xgb_model = xgb.XGBRegressor(
    tree_method='hist', 
    device='cpu',               # 指定使用哪一張顯卡（通常是 0）
    predictor='gpu_predictor', # 預測也使用 GPU 加速
    multi_strategy='multi_output_tree'
)
grid_search = GridSearchCV(
    estimator=xgb_model, 
    param_grid=param_grid, 
    cv=3, 
    scoring='neg_mean_squared_error',
    verbose=1 # 顯示進度
)

grid_search.fit(X_train, y_train)
print(f"最佳參數: {grid_search.best_params_}")

Fitting 3 folds for each of 36 candidates, totalling 108 fits


Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.



最佳參數: {'max_depth': 3, 'n_estimators': 90}


Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [6]:
grid_search.best_params_

{'max_depth': 3, 'n_estimators': 90}

In [None]:
# 2. 配置模型
# 關鍵：tree_method 建議用 'hist' 或 'gpu_hist'，這對多任務處理效率更高
model = xgb.XGBRegressor(
    objective='reg:squarederror', 
    n_estimators=grid_search.best_params_['n_estimators'],
    learning_rate=0.1,
    max_depth=grid_search.best_params_['max_depth'],
    tree_method='hist',  # 或者 'gpu_hist' 如果你有 GPU
    multi_strategy='multi_output_tree' # 這是核心參數！
)

# 3. 訓練模型
model.fit(X_train, y_train)

# 4. 預測
preds_test = model.predict(X_test)
preds_train = model.predict(X_train)

# print(f"預測結果形狀: {preds_test.shape}") # 應為 (200, 3)

預測結果形狀: (26, 2)


In [9]:
train_mse = np.mean(np.square(y_train - preds_train), axis=0)
test_mse = np.mean(np.square(y_test - preds_test), axis=0)
r2 = 1 - train_mse

In [19]:
r2_list = r2_score(y_train, preds_train, multioutput='raw_values')
r2_list

array([0.96111727, 0.9843684 ], dtype=float32)

In [20]:
r2_test_list = r2_score(y_test, preds_test, multioutput='raw_values')
r2_test_list

array([0.8381574 , 0.78624386], dtype=float32)

In [29]:
print("Train RMSE:")
print(np.sqrt(train_mse).round(3))
print("Test RMSE:")
print(np.sqrt(test_mse).round(3))

Train RMSE:
SPGR    0.017
TE      0.061
dtype: float64
Test RMSE:
SPGR    0.026
TE      0.172
dtype: float64


In [27]:
y_test.describe().round(3)

Unnamed: 0,SPGR,TE
count,26.0,26.0
mean,1.463,2.895
std,0.065,0.379
min,1.358,2.346
25%,1.424,2.663
50%,1.45,2.78
75%,1.523,3.058
max,1.556,3.806
