In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

optunaで最適なパラメータを探す

In [4]:
import optuna
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# データの読み込み
file_path = '/content/drive/My Drive/hotel_service/data/train_0110_6.csv'
train = pd.read_csv(file_path, low_memory=False)

# 特徴量と目的変数の分割
X = train.drop(columns=['y']).copy()
y = train['y'].copy()

# ✅ カテゴリ変数のリスト作成とcategory型変換
categorical_features = list(X.select_dtypes(include='object').columns)
for col in categorical_features:
    X[col] = X[col].astype('category')

# ✅ OptunaのObjective関数（XGBoost用）
def objective(trial):
    # ハイパーパラメータの探索範囲を指定
    params = {
        'objective': 'reg:squarederror',  # 回帰タスク
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'eta': trial.suggest_loguniform('eta', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'random_state': 42
    }

    # ✅ クロスバリデーション設定
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []

    # ✅ クロスバリデーションループ
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # ✅ XGBoost用データセットの作成
        train_data = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
        val_data = xgb.DMatrix(X_val, label=y_val, enable_categorical=True)

        # ✅ モデルのトレーニング
        model = xgb.train(
            params,
            train_data,
            evals=[(val_data, "validation")],
            early_stopping_rounds=50,
            verbose_eval=False
        )

        # ✅ 予測とRMSE計算
        y_pred = model.predict(val_data)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)

    # ✅ 平均RMSEを返す
    return np.mean(rmse_scores)

# ✅ Optunaの最適化開始
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

# ✅ 最適なハイパーパラメータとRMSEの表示
print("\nBest Hyperparameters:", study.best_params)
print(f"Best RMSE: {study.best_value:.4f}")


[I 2025-01-11 05:19:59,832] A new study created in memory with name: no-name-8d4f1f19-9b6b-4867-8332-251557b3f529
  'eta': trial.suggest_loguniform('eta', 0.01, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 1.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 1.0),
Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

[I 2025-01-11 05:20:07,758] Trial 0 finished with value: 129.0539204434146 and parameters: {'eta': 0.06738136690585612, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.6247108889430784, 'colsample_bytree': 0.7343267979310146, 'lambda': 0.010721900974145928, 'alpha': 0.3014029107175026, 'n_estimators': 304}. Best is trial 0 with value: 129.0539


Best Hyperparameters: {'eta': 0.09803579704913938, 'max_depth': 11, 'min_child_weight': 9, 'subsample': 0.8423303174560884, 'colsample_bytree': 0.6050735043852118, 'lambda': 0.1615672823944763, 'alpha': 0.10375080105205817, 'n_estimators': 377}
Best RMSE: 121.0173
