In [1]:
import optuna
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.multioutput import MultiOutputClassifier

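# Prerequisite setup
# This assumes data loading and preprocessing have already been done, i.e. that
# X_train, y_train, X_val, y_val, X_test and y_test are defined before this point.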

# 定義目標函數
def objective(trial):
    """Optuna objective: train a multi-output gradient-boosting model and score it.

    Samples four hyperparameters from ``trial``, fits a
    ``HistGradientBoostingClassifier`` wrapped in ``MultiOutputClassifier`` on
    the module-level ``X_train``/``y_train``, and evaluates the fitted model
    on the module-level ``X_val``/``y_val``.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        The micro-averaged F1 score of the predictions on the validation split.

    Raises:
        NameError: If the train/validation splits are not defined as globals
            before the study runs.
    """
    # Hyperparameter search space; the dict literal keeps the sampling order
    # fixed (max_iter, learning_rate, max_leaf_nodes, min_samples_leaf).
    hyperparams = {
        "max_iter": trial.suggest_int("max_iter", 50, 300),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 10, 100),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
    }

    # Base estimator with a fixed seed, wrapped for multi-label targets.
    base_estimator = HistGradientBoostingClassifier(random_state=42, **hyperparams)
    wrapped = MultiOutputClassifier(base_estimator, n_jobs=-1)

    # Fit on the training split, then predict the validation split.
    wrapped.fit(X_train, y_train)
    predictions = wrapped.predict(X_val)

    # Micro-averaged F1 is the value the study maximizes.
    return f1_score(y_val, predictions, average='micro')

# Create the Optuna study and run the hyperparameter search.
# NOTE: X_train, y_train, X_val, y_val, X_test and y_test must already be
# defined at module level; otherwise every trial fails with NameError.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Report the best hyperparameters and the score they achieved.
best_params = study.best_params
print("Best hyperparameters:", best_params)
print("Best micro F1 score:", study.best_value)

# Refit a model with the best hyperparameters (same seed as during the search).
best_model = HistGradientBoostingClassifier(
    max_iter=best_params["max_iter"],
    learning_rate=best_params["learning_rate"],
    max_leaf_nodes=best_params["max_leaf_nodes"],
    min_samples_leaf=best_params["min_samples_leaf"],
    random_state=42,
)
best_multi_target_model = MultiOutputClassifier(best_model, n_jobs=-1)
best_multi_target_model.fit(X_train, y_train)

# Classification reports for the validation and test splits.
y_val_pred = best_multi_target_model.predict(X_val)
y_test_pred = best_multi_target_model.predict(X_test)

# zero_division=0 reports 0 (without a warning) for labels that have no
# predicted or true samples.
val_report = classification_report(y_val, y_val_pred, zero_division=0)
test_report = classification_report(y_test, y_test_pred, zero_division=0)

# Save both reports; the explicit encoding keeps the output file independent
# of the platform's default locale encoding.
output_path = "Optimized_HistGradientBoosting_Results.txt"
with open(output_path, "w", encoding="utf-8") as f:
    f.write("Validation Classification Report:\n")
    f.write(val_report)
    f.write("\n\nTest Classification Report:\n")
    f.write(test_report)

print(f"Results saved to {output_path}")


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-11-18 18:14:17,892] A new study created in memory with name: no-name-09adcfb3-baa6-4c02-9242-6434454dff79
[W 2024-11-18 18:14:17,900] Trial 0 failed with parameters: {'max_iter': 230, 'learning_rate': 0.20395761516023325, 'max_leaf_nodes': 20, 'min_samples_leaf': 16} because of the following error: NameError("name 'X_train' is not defined").
Traceback (most recent call last):
  File "C:\Users\USER\anaconda3\envs\pytorch\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_37600\3190880876.py", line 31, in objective
    multi_target_model.fit(X_train, y_train)
NameError: name 'X_train' is not defined
[W 2024-11-18 18:14:17,902] Trial 0 failed with value None.


NameError: name 'X_train' is not defined