- ageグループの切り方を[0, 5, 13, 18, 60, 100]に変えた
- 出費をtwoとthreeのみ使い、特徴量を最小限に

In [16]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import catboost as cb

import optuna
from sklearn.model_selection import StratifiedGroupKFold
import warnings

warnings.filterwarnings('ignore', category=UserWarning)

# Load datasets
train = pd.read_csv("../dataset/train.csv")
test = pd.read_csv("../dataset/test.csv")

def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """欠損値を補完せずに特徴量を生成する"""
    data = df.copy()

    # Cabin情報を分解（欠損はNaNのまま保持）
    if 'Cabin' in data.columns:
        cabin_split = data['Cabin'].str.split('/', expand=True)
        data['Deck'] = cabin_split[0]
        data['Num'] = pd.to_numeric(cabin_split[1], errors='coerce')  # NaNそのまま
        data['Side'] = cabin_split[2]
    else:
        data['Deck'] = np.nan
        data['Num'] = np.nan
        data['Side'] = np.nan

    # 支出関連特徴量（NaNはそのまま残す）
    expense_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    two_expense = ['FoodCourt', 'ShoppingMall']
    three_expense = ['RoomService', 'Spa', 'VRDeck']
    for col in expense_cols:
        if col not in data.columns:
            data[col] = np.nan

    # 総支出額（NaNのまま計算 → NaNを含む場合はNaN）
    data['TotalExpense'] = data[expense_cols].sum(axis=1, skipna=False)
    data['TwoExpense'] = data[two_expense].sum(axis=1, skipna=False)
    data['ThreeExpense'] = data[three_expense].sum(axis=1, skipna=False)
    data['HasExpense'] = data[expense_cols].gt(0).any(axis=1).astype(int)


    # 年齢グループ（NaNは'Unknown'にせず、NaNのまま）
    bins = [0, 5, 13, 18, 60, 100]
    labels = ['Child', 'Teen', 'Young', 'Adult', 'Senior']
    data['AgeGroup'] = pd.cut(data['Age'], bins=bins, labels=labels, right=False)

    # Name情報から家族サイズを生成（欠損はNaN）
    if 'Name' in data.columns:
        data['FamilyName'] = data['Name'].str.split().str[-1]
        family_counts = data['FamilyName'].value_counts()
        data['FamilySize'] = data['FamilyName'].map(family_counts)
        data['IsAlone'] = (data['FamilySize'] == 1).astype(int)
    else:
        data['FamilyName'] = np.nan
        data['FamilySize'] = np.nan
        data['IsAlone'] = np.nan

    # PassengerId情報（欠損時はNaN）
    if 'PassengerId' in data.columns:
        passenger_split = data['PassengerId'].str.split('_', expand=True)
        data['PassengerGroup'] = passenger_split[0]
        data['PassengerNum'] = pd.to_numeric(passenger_split[1], errors='coerce')
    else:
        data['PassengerGroup'] = np.nan
        data['PassengerNum'] = np.nan

    return data



# =========================================================
# 2. 特徴量生成
# =========================================================
train_features = create_features(train)
test_features = create_features(test)

feature_cols = [
    'HomePlanet', 'CryoSleep', 'Destination',  'VIP',
    'Side', 'Num',
    'TwoExpense', 'ThreeExpense', 'AgeGroup'
]
# 'Age', 'Deck', 'FamilySize', 'IsAlone', 'HasExpense','TotalExpense', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',

X_full = train_features[feature_cols].copy()
X_test_full = test_features[feature_cols].copy()
y_full = train['Transported']

# カテゴリ列をcategory型に変換（NaNも保持）
categorical_features = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Side', 'AgeGroup'
]

for col in categorical_features:
    X_full[col] = X_full[col].astype('category')
    X_test_full[col] = X_test_full[col].astype('category')

print(f"使用特徴量数: {len(feature_cols)}")
print(f"特徴量一覧: {feature_cols}")
print(f"データシェイプ: Train {X_full.shape}, Test {X_test_full.shape}")

使用特徴量数: 9
特徴量一覧: ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Side', 'Num', 'TwoExpense', 'ThreeExpense', 'AgeGroup']
データシェイプ: Train (8693, 9), Test (4277, 9)


## lightgbm max leaves4許容

In [5]:
def objective(trial: optuna.Trial) -> float:
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'n_estimators': 600,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 15, 50),
        'max_depth': trial.suggest_int('max_depth', 3, 4),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 1.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 1.0, log=True),
        'random_state': 42,
        'metric': 'None',  # metricはfitで与える
    }

    # --- 重要: 同じPassengerGroupの人を同じsplitに入れる ---
    # create_featuresで作った PassengerGroup列を使う
    groups = train_features['PassengerGroup'].astype(str)

    cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for fold_idx, (trn_idx, val_idx) in enumerate(cv.split(X_full, y_full, groups=groups), start=1):
        X_trn, X_val = X_full.iloc[trn_idx], X_full.iloc[val_idx]
        y_trn, y_val = y_full.iloc[trn_idx], y_full.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)
        callbacks = [
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0),
        ]

        model.fit(
            X_trn, y_trn,
            eval_set=[(X_val, y_val)],
            eval_metric='binary_logloss',  # LightGBM標準指標
            categorical_feature=categorical_features,
            callbacks=callbacks,
        )

        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        fold_scores.append(acc)

        trial.report(float(np.mean(fold_scores)), step=fold_idx)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(fold_scores))

# --- Optuna設定 ---
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=2)
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=40, show_progress_bar=True)

print("\n=== Optuna 結果 ===")
print(f"Best CV accuracy: {study.best_value:.5f}")
for k, v in study.best_params.items():
    print(f"{k}: {v}")

# --- 最終モデル学習 ---
best_params = study.best_params | {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'n_estimators': 600,
    'metric': 'None',
    'random_state': 42,
}
final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X_full, y_full, categorical_feature=categorical_features)
print("\n最終モデル学習完了")


[I 2025-10-20 00:13:24,735] A new study created in memory with name: no-name-fede4fcd-3e18-4b40-ab0f-d06837e022af
Best trial: 0. Best value: 0.807878:   2%|▎         | 1/40 [00:03<02:35,  3.98s/it]

[I 2025-10-20 00:13:28,727] Trial 0 finished with value: 0.80787830440427 and parameters: {'learning_rate': 0.02757359293934948, 'num_leaves': 49, 'max_depth': 4, 'min_data_in_leaf': 64, 'feature_fraction': 0.7468055921327309, 'bagging_fraction': 0.7467983561008608, 'bagging_freq': 1, 'lambda_l1': 0.39676050770529864, 'lambda_l2': 0.06358358856676251}. Best is trial 0 with value: 0.80787830440427.


Best trial: 1. Best value: 0.809038:   5%|▌         | 2/40 [00:06<01:50,  2.91s/it]

[I 2025-10-20 00:13:30,891] Trial 1 finished with value: 0.8090382785332931 and parameters: {'learning_rate': 0.06803900745073703, 'num_leaves': 15, 'max_depth': 4, 'min_data_in_leaf': 85, 'feature_fraction': 0.7637017332034828, 'bagging_fraction': 0.7545474901621302, 'bagging_freq': 1, 'lambda_l1': 0.008179499475211672, 'lambda_l2': 0.03752055855124281}. Best is trial 1 with value: 0.8090382785332931.


Best trial: 1. Best value: 0.809038:   8%|▊         | 3/40 [00:09<02:00,  3.25s/it]

[I 2025-10-20 00:13:34,540] Trial 2 finished with value: 0.8081097981734257 and parameters: {'learning_rate': 0.032211189352044464, 'num_leaves': 25, 'max_depth': 4, 'min_data_in_leaf': 22, 'feature_fraction': 0.7876433945605654, 'bagging_fraction': 0.8099085529881075, 'bagging_freq': 3, 'lambda_l1': 0.22673986523780396, 'lambda_l2': 0.003972110727381913}. Best is trial 1 with value: 0.8090382785332931.


Best trial: 1. Best value: 0.809038:  10%|█         | 4/40 [00:12<01:47,  2.99s/it]

[I 2025-10-20 00:13:37,145] Trial 3 finished with value: 0.8065192637540657 and parameters: {'learning_rate': 0.04025192252635066, 'num_leaves': 36, 'max_depth': 3, 'min_data_in_leaf': 65, 'feature_fraction': 0.7511572371061874, 'bagging_fraction': 0.7195154778955838, 'bagging_freq': 5, 'lambda_l1': 0.788671412999049, 'lambda_l2': 0.26619018884890555}. Best is trial 1 with value: 0.8090382785332931.


Best trial: 1. Best value: 0.809038:  12%|█▎        | 5/40 [00:17<02:06,  3.62s/it]

[I 2025-10-20 00:13:41,888] Trial 4 finished with value: 0.8082131604642179 and parameters: {'learning_rate': 0.022816739880816207, 'num_leaves': 18, 'max_depth': 4, 'min_data_in_leaf': 50, 'feature_fraction': 0.7366114704534336, 'bagging_fraction': 0.848553073033381, 'bagging_freq': 1, 'lambda_l1': 0.5345166110646816, 'lambda_l2': 0.005975027999960293}. Best is trial 1 with value: 0.8090382785332931.


Best trial: 5. Best value: 0.809236:  15%|█▌        | 6/40 [00:19<01:50,  3.26s/it]

[I 2025-10-20 00:13:44,432] Trial 5 finished with value: 0.8092363987886723 and parameters: {'learning_rate': 0.06014321882783979, 'num_leaves': 26, 'max_depth': 4, 'min_data_in_leaf': 59, 'feature_fraction': 0.7554563366576581, 'bagging_fraction': 0.9908753883293676, 'bagging_freq': 4, 'lambda_l1': 0.6584106160121611, 'lambda_l2': 0.4835952776465949}. Best is trial 5 with value: 0.8092363987886723.


Best trial: 5. Best value: 0.809236:  18%|█▊        | 7/40 [00:20<01:26,  2.61s/it]

[I 2025-10-20 00:13:45,702] Trial 6 pruned. 


Best trial: 5. Best value: 0.809236:  20%|██        | 8/40 [00:22<01:12,  2.28s/it]

[I 2025-10-20 00:13:47,269] Trial 7 pruned. 


Best trial: 5. Best value: 0.809236:  22%|██▎       | 9/40 [00:24<01:11,  2.30s/it]

[I 2025-10-20 00:13:49,600] Trial 8 pruned. 


Best trial: 5. Best value: 0.809236:  25%|██▌       | 10/40 [00:25<00:57,  1.92s/it]

[I 2025-10-20 00:13:50,689] Trial 9 pruned. 


Best trial: 5. Best value: 0.809236:  28%|██▊       | 11/40 [00:27<00:48,  1.69s/it]

[I 2025-10-20 00:13:51,844] Trial 10 pruned. 


Best trial: 5. Best value: 0.809236:  30%|███       | 12/40 [00:27<00:39,  1.43s/it]

[I 2025-10-20 00:13:52,680] Trial 11 pruned. 


Best trial: 5. Best value: 0.809236:  32%|███▎      | 13/40 [00:28<00:35,  1.32s/it]

[I 2025-10-20 00:13:53,737] Trial 12 pruned. 


Best trial: 5. Best value: 0.809236:  35%|███▌      | 14/40 [00:29<00:31,  1.22s/it]

[I 2025-10-20 00:13:54,728] Trial 13 pruned. 


Best trial: 5. Best value: 0.809236:  38%|███▊      | 15/40 [00:31<00:29,  1.18s/it]

[I 2025-10-20 00:13:55,806] Trial 14 pruned. 


Best trial: 5. Best value: 0.809236:  40%|████      | 16/40 [00:32<00:27,  1.15s/it]

[I 2025-10-20 00:13:56,909] Trial 15 pruned. 


Best trial: 5. Best value: 0.809236:  42%|████▎     | 17/40 [00:34<00:33,  1.47s/it]

[I 2025-10-20 00:13:59,116] Trial 16 pruned. 


Best trial: 5. Best value: 0.809236:  45%|████▌     | 18/40 [00:36<00:33,  1.53s/it]

[I 2025-10-20 00:14:00,797] Trial 17 pruned. 


Best trial: 5. Best value: 0.809236:  48%|████▊     | 19/40 [00:37<00:29,  1.40s/it]

[I 2025-10-20 00:14:01,883] Trial 18 pruned. 


Best trial: 5. Best value: 0.809236:  50%|█████     | 20/40 [00:38<00:28,  1.41s/it]

[I 2025-10-20 00:14:03,322] Trial 19 pruned. 


Best trial: 5. Best value: 0.809236:  52%|█████▎    | 21/40 [00:39<00:26,  1.40s/it]

[I 2025-10-20 00:14:04,686] Trial 20 pruned. 


Best trial: 21. Best value: 0.809495:  55%|█████▌    | 22/40 [00:44<00:42,  2.38s/it]

[I 2025-10-20 00:14:09,360] Trial 21 finished with value: 0.8094949387865047 and parameters: {'learning_rate': 0.019623740136861162, 'num_leaves': 18, 'max_depth': 4, 'min_data_in_leaf': 49, 'feature_fraction': 0.729788830072314, 'bagging_fraction': 0.8353759643826032, 'bagging_freq': 1, 'lambda_l1': 0.7631876796185896, 'lambda_l2': 0.006601806814693494}. Best is trial 21 with value: 0.8094949387865047.


Best trial: 21. Best value: 0.809495:  57%|█████▊    | 23/40 [00:46<00:39,  2.31s/it]

[I 2025-10-20 00:14:11,497] Trial 22 pruned. 


Best trial: 21. Best value: 0.809495:  60%|██████    | 24/40 [00:48<00:35,  2.23s/it]

[I 2025-10-20 00:14:13,553] Trial 23 finished with value: 0.8093809636900637 and parameters: {'learning_rate': 0.07155478477152658, 'num_leaves': 15, 'max_depth': 4, 'min_data_in_leaf': 42, 'feature_fraction': 0.7696954637466523, 'bagging_fraction': 0.8771576274499236, 'bagging_freq': 1, 'lambda_l1': 0.1162408433176781, 'lambda_l2': 0.00216939008477882}. Best is trial 21 with value: 0.8094949387865047.


Best trial: 21. Best value: 0.809495:  62%|██████▎   | 25/40 [00:50<00:32,  2.15s/it]

[I 2025-10-20 00:14:15,525] Trial 24 pruned. 


Best trial: 21. Best value: 0.809495:  65%|██████▌   | 26/40 [00:51<00:24,  1.77s/it]

[I 2025-10-20 00:14:16,403] Trial 25 pruned. 


Best trial: 21. Best value: 0.809495:  68%|██████▊   | 27/40 [00:52<00:21,  1.62s/it]

[I 2025-10-20 00:14:17,666] Trial 26 pruned. 


Best trial: 21. Best value: 0.809495:  70%|███████   | 28/40 [00:54<00:18,  1.53s/it]

[I 2025-10-20 00:14:18,987] Trial 27 pruned. 


Best trial: 21. Best value: 0.809495:  72%|███████▎  | 29/40 [00:58<00:27,  2.49s/it]

[I 2025-10-20 00:14:23,729] Trial 28 finished with value: 0.808485873022202 and parameters: {'learning_rate': 0.02047098272373616, 'num_leaves': 28, 'max_depth': 4, 'min_data_in_leaf': 48, 'feature_fraction': 0.8087273091842327, 'bagging_fraction': 0.8417850730406845, 'bagging_freq': 2, 'lambda_l1': 0.10929776949121697, 'lambda_l2': 0.007720432359501497}. Best is trial 21 with value: 0.8094949387865047.


Best trial: 21. Best value: 0.809495:  75%|███████▌  | 30/40 [01:00<00:21,  2.18s/it]

[I 2025-10-20 00:14:25,193] Trial 29 pruned. 


Best trial: 21. Best value: 0.809495:  78%|███████▊  | 31/40 [01:03<00:22,  2.47s/it]

[I 2025-10-20 00:14:28,340] Trial 30 pruned. 


Best trial: 21. Best value: 0.809495:  80%|████████  | 32/40 [01:04<00:16,  2.05s/it]

[I 2025-10-20 00:14:29,394] Trial 31 pruned. 


Best trial: 21. Best value: 0.809495:  82%|████████▎ | 33/40 [01:05<00:12,  1.75s/it]

[I 2025-10-20 00:14:30,460] Trial 32 pruned. 


Best trial: 33. Best value: 0.809873:  85%|████████▌ | 34/40 [01:08<00:12,  2.14s/it]

[I 2025-10-20 00:14:33,487] Trial 33 finished with value: 0.8098729148973168 and parameters: {'learning_rate': 0.03640470860180394, 'num_leaves': 24, 'max_depth': 4, 'min_data_in_leaf': 58, 'feature_fraction': 0.7531453327915494, 'bagging_fraction': 0.8011825624412329, 'bagging_freq': 1, 'lambda_l1': 0.9963418180924837, 'lambda_l2': 0.17245269987261233}. Best is trial 33 with value: 0.8098729148973168.


Best trial: 33. Best value: 0.809873:  88%|████████▊ | 35/40 [01:10<00:09,  1.94s/it]

[I 2025-10-20 00:14:34,978] Trial 34 pruned. 


Best trial: 33. Best value: 0.809873:  90%|█████████ | 36/40 [01:12<00:07,  1.97s/it]

[I 2025-10-20 00:14:37,020] Trial 35 pruned. 


Best trial: 33. Best value: 0.809873:  92%|█████████▎| 37/40 [01:16<00:07,  2.54s/it]

[I 2025-10-20 00:14:40,896] Trial 36 finished with value: 0.8084682225499524 and parameters: {'learning_rate': 0.03060513765670124, 'num_leaves': 34, 'max_depth': 4, 'min_data_in_leaf': 69, 'feature_fraction': 0.7575814276931923, 'bagging_fraction': 0.7782464390224983, 'bagging_freq': 1, 'lambda_l1': 0.9572744118680322, 'lambda_l2': 0.5354983235418009}. Best is trial 33 with value: 0.8098729148973168.


Best trial: 33. Best value: 0.809873:  95%|█████████▌| 38/40 [01:17<00:04,  2.14s/it]

[I 2025-10-20 00:14:42,104] Trial 37 pruned. 


Best trial: 33. Best value: 0.809873:  98%|█████████▊| 39/40 [01:18<00:01,  1.97s/it]

[I 2025-10-20 00:14:43,656] Trial 38 pruned. 


Best trial: 33. Best value: 0.809873: 100%|██████████| 40/40 [01:20<00:00,  2.01s/it]


[I 2025-10-20 00:14:44,982] Trial 39 pruned. 

=== Optuna 結果 ===
Best CV accuracy: 0.80987
learning_rate: 0.03640470860180394
num_leaves: 24
max_depth: 4
min_data_in_leaf: 58
feature_fraction: 0.7531453327915494
bagging_fraction: 0.8011825624412329
bagging_freq: 1
lambda_l1: 0.9963418180924837
lambda_l2: 0.17245269987261233

最終モデル学習完了


## lightgbm max leaves3

In [17]:
def objective(trial: optuna.Trial) -> float:
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'n_estimators': 600,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 15, 50),
        'max_depth': 3,
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 1.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 1.0, log=True),
        'random_state': 42,
        'metric': 'None',  # metricはfitで与える
    }

    # --- 重要: 同じPassengerGroupの人を同じsplitに入れる ---
    # create_featuresで作った PassengerGroup列を使う
    groups = train_features['PassengerGroup'].astype(str)

    cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for fold_idx, (trn_idx, val_idx) in enumerate(cv.split(X_full, y_full, groups=groups), start=1):
        X_trn, X_val = X_full.iloc[trn_idx], X_full.iloc[val_idx]
        y_trn, y_val = y_full.iloc[trn_idx], y_full.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)
        callbacks = [
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0),
        ]

        model.fit(
            X_trn, y_trn,
            eval_set=[(X_val, y_val)],
            eval_metric='binary_logloss',  # LightGBM標準指標
            categorical_feature=categorical_features,
            callbacks=callbacks,
        )

        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        fold_scores.append(acc)

        trial.report(float(np.mean(fold_scores)), step=fold_idx)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(fold_scores))

# --- Optuna設定 ---
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=2)
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=40, show_progress_bar=True)

print("\n=== Optuna 結果 ===")
print(f"Best CV accuracy: {study.best_value:.5f}")
for k, v in study.best_params.items():
    print(f"{k}: {v}")

# --- 最終モデル学習 ---
best_params = study.best_params | {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'n_estimators': 600,
    'metric': 'None',
    'random_state': 42,
}
final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X_full, y_full, categorical_feature=categorical_features)
print("\n最終モデル学習完了")


[I 2025-10-20 00:38:56,349] A new study created in memory with name: no-name-2181eb6b-5b2b-43eb-a058-7f4802638844
Best trial: 0. Best value: 0.807411:   2%|▎         | 1/40 [00:03<02:12,  3.39s/it]

[I 2025-10-20 00:38:59,736] Trial 0 finished with value: 0.8074111049237243 and parameters: {'learning_rate': 0.02757359293934948, 'num_leaves': 49, 'min_data_in_leaf': 76, 'feature_fraction': 0.8795975452591109, 'bagging_fraction': 0.7468055921327309, 'bagging_freq': 1, 'lambda_l1': 0.001493656855461764, 'lambda_l2': 0.39676050770529864}. Best is trial 0 with value: 0.8074111049237243.


Best trial: 1. Best value: 0.80901:   5%|▌         | 2/40 [00:05<01:45,  2.78s/it] 

[I 2025-10-20 00:39:02,088] Trial 1 finished with value: 0.8090099353175532 and parameters: {'learning_rate': 0.05092911283433821, 'num_leaves': 40, 'min_data_in_leaf': 11, 'feature_fraction': 0.9909729556485983, 'bagging_fraction': 0.9497327922401265, 'bagging_freq': 2, 'lambda_l1': 0.0035113563139704067, 'lambda_l2': 0.0035498788321965025}. Best is trial 1 with value: 0.8090099353175532.


Best trial: 1. Best value: 0.80901:   8%|▊         | 3/40 [00:09<01:53,  3.07s/it]

[I 2025-10-20 00:39:05,500] Trial 2 finished with value: 0.8078956471996698 and parameters: {'learning_rate': 0.02279379523765072, 'num_leaves': 33, 'min_data_in_leaf': 49, 'feature_fraction': 0.7873687420594125, 'bagging_fraction': 0.8835558684167139, 'bagging_freq': 1, 'lambda_l1': 0.007523742884534853, 'lambda_l2': 0.01256277350380703}. Best is trial 1 with value: 0.8090099353175532.


Best trial: 1. Best value: 0.80901:  10%|█         | 4/40 [00:12<01:52,  3.12s/it]

[I 2025-10-20 00:39:08,690] Trial 3 finished with value: 0.8074077200451159 and parameters: {'learning_rate': 0.03438586247938296, 'num_leaves': 43, 'min_data_in_leaf': 28, 'feature_fraction': 0.8542703315240835, 'bagging_fraction': 0.8777243706586128, 'bagging_freq': 1, 'lambda_l1': 0.06647135865318027, 'lambda_l2': 0.0032476735706274485}. Best is trial 1 with value: 0.8090099353175532.


Best trial: 1. Best value: 0.80901:  12%|█▎        | 5/40 [00:15<01:52,  3.20s/it]

[I 2025-10-20 00:39:12,052] Trial 4 finished with value: 0.8005018705648096 and parameters: {'learning_rate': 0.011926324174062874, 'num_leaves': 49, 'min_data_in_leaf': 97, 'feature_fraction': 0.9425192044349383, 'bagging_fraction': 0.7913841307520112, 'bagging_freq': 1, 'lambda_l1': 0.11290133559092666, 'lambda_l2': 0.020914981329035603}. Best is trial 1 with value: 0.8090099353175532.


Best trial: 1. Best value: 0.80901:  15%|█▌        | 6/40 [00:17<01:31,  2.69s/it]

[I 2025-10-20 00:39:13,733] Trial 5 pruned. 


Best trial: 1. Best value: 0.80901:  18%|█▊        | 7/40 [00:20<01:30,  2.74s/it]

[I 2025-10-20 00:39:16,593] Trial 6 finished with value: 0.8080035047860239 and parameters: {'learning_rate': 0.04395225692486303, 'num_leaves': 21, 'min_data_in_leaf': 98, 'feature_fraction': 0.9325398470083344, 'bagging_fraction': 0.9818496824692567, 'bagging_freq': 5, 'lambda_l1': 0.06218704727769078, 'lambda_l2': 0.5829384542994738}. Best is trial 1 with value: 0.8090099353175532.


Best trial: 1. Best value: 0.80901:  20%|██        | 8/40 [00:21<01:16,  2.39s/it]

[I 2025-10-20 00:39:18,231] Trial 7 pruned. 


Best trial: 1. Best value: 0.80901:  22%|██▎       | 9/40 [00:23<01:06,  2.14s/it]

[I 2025-10-20 00:39:19,809] Trial 8 pruned. 


Best trial: 1. Best value: 0.80901:  25%|██▌       | 10/40 [00:25<00:59,  1.97s/it]

[I 2025-10-20 00:39:21,396] Trial 9 pruned. 


Best trial: 1. Best value: 0.80901:  28%|██▊       | 11/40 [00:26<00:49,  1.72s/it]

[I 2025-10-20 00:39:22,542] Trial 10 pruned. 


Best trial: 1. Best value: 0.80901:  30%|███       | 12/40 [00:28<00:50,  1.79s/it]

[I 2025-10-20 00:39:24,513] Trial 11 finished with value: 0.8076541285498526 and parameters: {'learning_rate': 0.07169439620387569, 'num_leaves': 25, 'min_data_in_leaf': 100, 'feature_fraction': 0.983742915992114, 'bagging_fraction': 0.9909880403443551, 'bagging_freq': 3, 'lambda_l1': 0.02790544734622926, 'lambda_l2': 0.6805035004000006}. Best is trial 1 with value: 0.8090099353175532.


Best trial: 1. Best value: 0.80901:  32%|███▎      | 13/40 [00:30<00:50,  1.87s/it]

[I 2025-10-20 00:39:26,544] Trial 12 pruned. 


Best trial: 1. Best value: 0.80901:  35%|███▌      | 14/40 [00:31<00:46,  1.80s/it]

[I 2025-10-20 00:39:28,179] Trial 13 pruned. 


Best trial: 1. Best value: 0.80901:  38%|███▊      | 15/40 [00:32<00:37,  1.50s/it]

[I 2025-10-20 00:39:29,003] Trial 14 pruned. 


Best trial: 1. Best value: 0.80901:  40%|████      | 16/40 [00:33<00:31,  1.32s/it]

[I 2025-10-20 00:39:29,883] Trial 15 pruned. 


Best trial: 1. Best value: 0.80901:  42%|████▎     | 17/40 [00:34<00:29,  1.27s/it]

[I 2025-10-20 00:39:31,054] Trial 16 pruned. 


Best trial: 1. Best value: 0.80901:  45%|████▌     | 18/40 [00:37<00:34,  1.59s/it]

[I 2025-10-20 00:39:33,377] Trial 17 finished with value: 0.8083709429295274 and parameters: {'learning_rate': 0.0710355431203536, 'num_leaves': 30, 'min_data_in_leaf': 86, 'feature_fraction': 0.897156986871926, 'bagging_fraction': 0.837799411767097, 'bagging_freq': 3, 'lambda_l1': 0.07251815117092195, 'lambda_l2': 0.8779154492395035}. Best is trial 1 with value: 0.8090099353175532.


Best trial: 1. Best value: 0.80901:  48%|████▊     | 19/40 [00:37<00:29,  1.39s/it]

[I 2025-10-20 00:39:34,312] Trial 18 pruned. 


Best trial: 1. Best value: 0.80901:  50%|█████     | 20/40 [00:39<00:27,  1.36s/it]

[I 2025-10-20 00:39:35,595] Trial 19 pruned. 


Best trial: 1. Best value: 0.80901:  52%|█████▎    | 21/40 [00:39<00:22,  1.17s/it]

[I 2025-10-20 00:39:36,328] Trial 20 pruned. 


Best trial: 1. Best value: 0.80901:  55%|█████▌    | 22/40 [00:43<00:31,  1.74s/it]

[I 2025-10-20 00:39:39,393] Trial 21 finished with value: 0.8082346034700818 and parameters: {'learning_rate': 0.038203479778360916, 'num_leaves': 19, 'min_data_in_leaf': 89, 'feature_fraction': 0.925509769095235, 'bagging_fraction': 0.9689908769227432, 'bagging_freq': 4, 'lambda_l1': 0.07065324147423908, 'lambda_l2': 0.8674871453360146}. Best is trial 1 with value: 0.8090099353175532.


Best trial: 1. Best value: 0.80901:  57%|█████▊    | 23/40 [00:44<00:27,  1.62s/it]

[I 2025-10-20 00:39:40,732] Trial 22 pruned. 


Best trial: 1. Best value: 0.80901:  60%|██████    | 24/40 [00:45<00:23,  1.49s/it]

[I 2025-10-20 00:39:41,909] Trial 23 pruned. 


Best trial: 1. Best value: 0.80901:  62%|██████▎   | 25/40 [00:47<00:22,  1.48s/it]

[I 2025-10-20 00:39:43,366] Trial 24 pruned. 


Best trial: 1. Best value: 0.80901:  65%|██████▌   | 26/40 [00:49<00:23,  1.65s/it]

[I 2025-10-20 00:39:45,422] Trial 25 finished with value: 0.8088040041605016 and parameters: {'learning_rate': 0.06256740315920926, 'num_leaves': 41, 'min_data_in_leaf': 90, 'feature_fraction': 0.8243775269784812, 'bagging_fraction': 0.9540283855827443, 'bagging_freq': 3, 'lambda_l1': 0.020386370351855046, 'lambda_l2': 0.9959567566960239}. Best is trial 1 with value: 0.8090099353175532.


Best trial: 1. Best value: 0.80901:  68%|██████▊   | 27/40 [00:49<00:18,  1.42s/it]

[I 2025-10-20 00:39:46,301] Trial 26 pruned. 


Best trial: 1. Best value: 0.80901:  70%|███████   | 28/40 [00:50<00:15,  1.30s/it]

[I 2025-10-20 00:39:47,318] Trial 27 pruned. 


Best trial: 1. Best value: 0.80901:  72%|███████▎  | 29/40 [00:51<00:12,  1.16s/it]

[I 2025-10-20 00:39:48,150] Trial 28 pruned. 


Best trial: 1. Best value: 0.80901:  75%|███████▌  | 30/40 [00:52<00:10,  1.09s/it]

[I 2025-10-20 00:39:49,072] Trial 29 pruned. 


Best trial: 1. Best value: 0.80901:  78%|███████▊  | 31/40 [00:53<00:09,  1.03s/it]

[I 2025-10-20 00:39:49,966] Trial 30 pruned. 


Best trial: 1. Best value: 0.80901:  80%|████████  | 32/40 [00:54<00:08,  1.08s/it]

[I 2025-10-20 00:39:51,178] Trial 31 pruned. 


Best trial: 1. Best value: 0.80901:  82%|████████▎ | 33/40 [00:56<00:08,  1.18s/it]

[I 2025-10-20 00:39:52,565] Trial 32 pruned. 


Best trial: 33. Best value: 0.810174:  85%|████████▌ | 34/40 [00:59<00:10,  1.83s/it]

[I 2025-10-20 00:39:55,908] Trial 33 finished with value: 0.8101740609061441 and parameters: {'learning_rate': 0.028130654811880963, 'num_leaves': 31, 'min_data_in_leaf': 49, 'feature_fraction': 0.9615959375102509, 'bagging_fraction': 0.9535733043385527, 'bagging_freq': 4, 'lambda_l1': 0.34646004336713787, 'lambda_l2': 0.5988142347842055}. Best is trial 33 with value: 0.8101740609061441.


Best trial: 33. Best value: 0.810174:  88%|████████▊ | 35/40 [01:01<00:09,  1.92s/it]

[I 2025-10-20 00:39:58,049] Trial 34 pruned. 


Best trial: 33. Best value: 0.810174:  90%|█████████ | 36/40 [01:05<00:09,  2.37s/it]

[I 2025-10-20 00:40:01,466] Trial 35 finished with value: 0.8082487511215725 and parameters: {'learning_rate': 0.02778915769014676, 'num_leaves': 28, 'min_data_in_leaf': 52, 'feature_fraction': 0.9780016592136974, 'bagging_fraction': 0.9462623599949622, 'bagging_freq': 3, 'lambda_l1': 0.006682763025058571, 'lambda_l2': 0.1917456152625}. Best is trial 33 with value: 0.8101740609061441.


Best trial: 33. Best value: 0.810174:  92%|█████████▎| 37/40 [01:06<00:06,  2.19s/it]

[I 2025-10-20 00:40:03,244] Trial 36 pruned. 


Best trial: 33. Best value: 0.810174:  95%|█████████▌| 38/40 [01:08<00:03,  1.97s/it]

[I 2025-10-20 00:40:04,699] Trial 37 pruned. 


Best trial: 33. Best value: 0.810174:  98%|█████████▊| 39/40 [01:09<00:01,  1.85s/it]

[I 2025-10-20 00:40:06,263] Trial 38 pruned. 


Best trial: 33. Best value: 0.810174: 100%|██████████| 40/40 [01:11<00:00,  1.79s/it]


[I 2025-10-20 00:40:08,044] Trial 39 pruned. 

=== Optuna 結果 ===
Best CV accuracy: 0.81017
learning_rate: 0.028130654811880963
num_leaves: 31
min_data_in_leaf: 49
feature_fraction: 0.9615959375102509
bagging_fraction: 0.9535733043385527
bagging_freq: 4
lambda_l1: 0.34646004336713787
lambda_l2: 0.5988142347842055

最終モデル学習完了


In [18]:
preds = final_model.predict(X_test_full)


passenger_ids = test["PassengerId"]

# DataFrame にまとめる
submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Transported": preds
})

# CSV 出力（インデックスは不要）
submission.to_csv("submission_lightgbm.csv", index=False)

submission['Transported'].value_counts()

Transported
True     2142
False    2135
Name: count, dtype: int64

## catboost

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import accuracy_score
import catboost as cb
import optuna
import warnings

warnings.filterwarnings('ignore', category=UserWarning)

# =========================
# 1. データ読み込み
# =========================
train = pd.read_csv("../dataset/train.csv")
test = pd.read_csv("../dataset/test.csv")

def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """欠損値を補完せずに特徴量を生成する"""
    data = df.copy()

    # Cabin情報を分解（欠損はNaNのまま保持）
    if 'Cabin' in data.columns:
        cabin_split = data['Cabin'].str.split('/', expand=True)
        data['Deck'] = cabin_split[0]
        data['Num'] = pd.to_numeric(cabin_split[1], errors='coerce')  # NaNそのまま
        data['Side'] = cabin_split[2]
    else:
        data['Deck'] = np.nan
        data['Num'] = np.nan
        data['Side'] = np.nan

    # 支出関連特徴量（NaNはそのまま残す）
    expense_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    two_expense = ['FoodCourt', 'ShoppingMall']
    three_expense = ['RoomService', 'Spa', 'VRDeck']
    for col in expense_cols:
        if col not in data.columns:
            data[col] = np.nan

    data['TotalExpense'] = data[expense_cols].sum(axis=1, skipna=False)
    data['TwoExpense'] = data[two_expense].sum(axis=1, skipna=False)
    data['ThreeExpense'] = data[three_expense].sum(axis=1, skipna=False)
    data['HasExpense'] = data[expense_cols].gt(0).any(axis=1).astype(int)

    # 年齢グループ（NaNはNaNのまま）
    bins = [0, 5, 13, 18, 60, 100]
    labels = ['Child', 'Teen', 'Young', 'Adult', 'Senior']
    data['AgeGroup'] = pd.cut(data['Age'], bins=bins, labels=labels, right=False)

    # Name→家族サイズ
    if 'Name' in data.columns:
        data['FamilyName'] = data['Name'].str.split().str[-1]
        family_counts = data['FamilyName'].value_counts()
        data['FamilySize'] = data['FamilyName'].map(family_counts)
        data['IsAlone'] = (data['FamilySize'] == 1).astype(int)
    else:
        data['FamilyName'] = np.nan
        data['FamilySize'] = np.nan
        data['IsAlone'] = np.nan

    # PassengerId→グループ
    if 'PassengerId' in data.columns:
        passenger_split = data['PassengerId'].str.split('_', expand=True)
        data['PassengerGroup'] = passenger_split[0]
        data['PassengerNum'] = pd.to_numeric(passenger_split[1], errors='coerce')
    else:
        data['PassengerGroup'] = np.nan
        data['PassengerNum'] = np.nan

    return data

# =========================
# 2. 特徴量生成
# =========================
train_features = create_features(train)
test_features = create_features(test)

feature_cols = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Side', 'Num',
    'TwoExpense', 'ThreeExpense', 'AgeGroup'
]

X_full = train_features[feature_cols].copy()
X_test_full = test_features[feature_cols].copy()

y_full = train['Transported']

# CatBoostに渡すカテゴリ列（列名）
categorical_features = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Side', 'AgeGroup'
]

def fix_catboost_cats(df: pd.DataFrame, cat_cols: list[str]) -> pd.DataFrame:
    """CatBoost用にカテゴリ列のNaNを文字列に置換（NaN禁止のため）"""
    df = df.copy()
    for c in cat_cols:
        df[c] = df[c].astype('object')          # object化（bool/category混在OK）
        df[c] = df[c].where(df[c].notna(), 'NaN')  # NaN -> 'NaN'
    return df

X_full = fix_catboost_cats(X_full, categorical_features)
X_test_full = fix_catboost_cats(X_test_full, categorical_features)

# CatBoostは列インデックス指定
cat_features_idx = [feature_cols.index(c) for c in categorical_features]

print(f"使用特徴量数: {len(feature_cols)}")
print(f"特徴量一覧: {feature_cols}")
print(f"データシェイプ: Train {X_full.shape}, Test {X_test_full.shape}")

# =========================
# 3. Optuna 目的関数（CatBoost）
# =========================
def objective(trial: optuna.Trial) -> float:
    params = {
        'loss_function': 'Logloss',
        'eval_metric': 'Logloss',
        'iterations': 600,  # LGBのn_estimatorsに相当
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        'depth': 3,  # LGBのmax_depth=3に対応（固定）
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 1.0, log=True),
        'rsm': trial.suggest_float('rsm', 0.7, 1.0),  # 列サブサンプ（feature_fraction相当）
        'bootstrap_type': 'Bernoulli',
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),  # 行サブサンプ（bagging_fraction相当）

        # 追加（重要パラ）
        'random_strength': trial.suggest_float('random_strength', 0.0, 1.0),
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 12),

        'random_seed': 42,
        'verbose': False,
        'allow_writing_files': False,
    }

    # PassengerGroupでグループ分割（元コード踏襲）
    groups = train_features['PassengerGroup'].astype(str)
    cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

    fold_scores = []
    for fold_idx, (trn_idx, val_idx) in enumerate(cv.split(X_full, y_full, groups=groups), start=1):
        X_trn, X_val = X_full.iloc[trn_idx], X_full.iloc[val_idx]
        y_trn, y_val = y_full.iloc[trn_idx], y_full.iloc[val_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            X_trn, y_trn,
            eval_set=(X_val, y_val),
            cat_features=cat_features_idx,
            use_best_model=True,
            early_stopping_rounds=50,
            verbose=False,
        )

        # 予測（クラスラベル）
        y_pred = model.predict(X_val, prediction_type="Class")
        y_pred = np.array(y_pred).ravel().astype(int)

        acc = accuracy_score(y_val, y_pred)
        fold_scores.append(acc)

        trial.report(float(np.mean(fold_scores)), step=fold_idx)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(fold_scores))

# =========================
# 4. Optuna 実行
# =========================
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=2)
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=40, show_progress_bar=True)

print("\n=== Optuna 結果 ===")
print(f"Best CV accuracy: {study.best_value:.5f}")
for k, v in study.best_params.items():
    print(f"{k}: {v}")

# =========================
# 5. 最終モデル学習
# =========================
best_params = {
    **study.best_params,
    'loss_function': 'Logloss',
    'eval_metric': 'Logloss',
    'iterations': 600,
    'depth': 3,
    'bootstrap_type': 'Bernoulli',
    'random_seed': 42,
    'verbose': False,
    'allow_writing_files': False,
}
final_model = cb.CatBoostClassifier(**best_params)
final_model.fit(
    X_full, y_full,
    cat_features=cat_features_idx,
    verbose=False
)
print("\n最終モデル学習完了")


[I 2025-10-20 00:41:35,691] A new study created in memory with name: no-name-da836c16-ba01-4083-99ec-05eece4bf9e5


使用特徴量数: 9
特徴量一覧: ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Side', 'Num', 'TwoExpense', 'ThreeExpense', 'AgeGroup']
データシェイプ: Train (8693, 9), Test (4277, 9)


Best trial: 0. Best value: 0.802362:   2%|▎         | 1/40 [00:06<03:56,  6.07s/it]

[I 2025-10-20 00:41:41,760] Trial 0 finished with value: 0.8023615285731726 and parameters: {'learning_rate': 0.02757359293934948, 'l2_leaf_reg': 0.7114476009343417, 'rsm': 0.9195981825434215, 'subsample': 0.8795975452591109, 'random_strength': 0.15601864044243652, 'one_hot_max_size': 3}. Best is trial 0 with value: 0.8023615285731726.


Best trial: 0. Best value: 0.802362:   5%|▌         | 2/40 [00:08<02:35,  4.08s/it]

[I 2025-10-20 00:41:44,448] Trial 1 finished with value: 0.8004236009626837 and parameters: {'learning_rate': 0.011703388679635262, 'l2_leaf_reg': 0.39676050770529864, 'rsm': 0.8803345035229626, 'subsample': 0.9124217733388136, 'random_strength': 0.020584494295802447, 'one_hot_max_size': 12}. Best is trial 0 with value: 0.8023615285731726.


Best trial: 2. Best value: 0.806803:   8%|▊         | 3/40 [00:10<01:50,  2.99s/it]

[I 2025-10-20 00:41:46,132] Trial 2 finished with value: 0.8068028796934463 and parameters: {'learning_rate': 0.09528587217040241, 'l2_leaf_reg': 0.004335281794951566, 'rsm': 0.7545474901621302, 'subsample': 0.7550213529560301, 'random_strength': 0.3042422429595377, 'one_hot_max_size': 7}. Best is trial 2 with value: 0.8068028796934463.


Best trial: 2. Best value: 0.806803:  10%|█         | 4/40 [00:13<01:45,  2.92s/it]

[I 2025-10-20 00:41:48,962] Trial 3 finished with value: 0.8062552821948696 and parameters: {'learning_rate': 0.032211189352044464, 'l2_leaf_reg': 0.007476312062252299, 'rsm': 0.8835558684167139, 'subsample': 0.7418481581956126, 'random_strength': 0.29214464853521815, 'one_hot_max_size': 6}. Best is trial 2 with value: 0.8068028796934463.


Best trial: 2. Best value: 0.806803:  12%|█▎        | 5/40 [00:19<02:22,  4.07s/it]

[I 2025-10-20 00:41:55,050] Trial 4 finished with value: 0.8055895874141614 and parameters: {'learning_rate': 0.03438586247938296, 'l2_leaf_reg': 0.22673986523780396, 'rsm': 0.7599021346475079, 'subsample': 0.8542703315240835, 'random_strength': 0.5924145688620425, 'one_hot_max_size': 2}. Best is trial 2 with value: 0.8068028796934463.


Best trial: 5. Best value: 0.80923:  15%|█▌        | 6/40 [00:21<01:56,  3.44s/it] 

[I 2025-10-20 00:41:57,279] Trial 5 finished with value: 0.8092304778391529 and parameters: {'learning_rate': 0.05182367293641893, 'l2_leaf_reg': 0.0032476735706274485, 'rsm': 0.7195154778955838, 'subsample': 0.984665661176, 'random_strength': 0.9656320330745594, 'one_hot_max_size': 10}. Best is trial 5 with value: 0.8092304778391529.


Best trial: 5. Best value: 0.80923:  18%|█▊        | 7/40 [00:22<01:29,  2.73s/it]

[I 2025-10-20 00:41:58,537] Trial 6 pruned. 


Best trial: 5. Best value: 0.80923:  20%|██        | 8/40 [00:24<01:12,  2.27s/it]

[I 2025-10-20 00:41:59,819] Trial 7 pruned. 


Best trial: 5. Best value: 0.80923:  22%|██▎       | 9/40 [00:25<01:03,  2.04s/it]

[I 2025-10-20 00:42:01,367] Trial 8 pruned. 


Best trial: 5. Best value: 0.80923:  25%|██▌       | 10/40 [00:28<01:12,  2.41s/it]

[I 2025-10-20 00:42:04,593] Trial 9 pruned. 


Best trial: 10. Best value: 0.809346:  28%|██▊       | 11/40 [00:30<00:59,  2.06s/it]

[I 2025-10-20 00:42:05,864] Trial 10 finished with value: 0.8093461563176373 and parameters: {'learning_rate': 0.13279615079267518, 'l2_leaf_reg': 0.026301587628514228, 'rsm': 0.8106186318163292, 'subsample': 0.9865434487124839, 'random_strength': 0.983896235420326, 'one_hot_max_size': 10}. Best is trial 10 with value: 0.8093461563176373.


Best trial: 10. Best value: 0.809346:  30%|███       | 12/40 [00:31<00:53,  1.91s/it]

[I 2025-10-20 00:42:07,423] Trial 11 finished with value: 0.8080116928087996 and parameters: {'learning_rate': 0.14288586378468338, 'l2_leaf_reg': 0.032675304360081175, 'rsm': 0.8178286784763787, 'subsample': 0.9992330587139621, 'random_strength': 0.995802187862515, 'one_hot_max_size': 10}. Best is trial 10 with value: 0.8093461563176373.


Best trial: 10. Best value: 0.809346:  32%|███▎      | 13/40 [00:33<00:50,  1.88s/it]

[I 2025-10-20 00:42:09,227] Trial 12 finished with value: 0.8087548374104084 and parameters: {'learning_rate': 0.07654871885833515, 'l2_leaf_reg': 0.021367383067056468, 'rsm': 0.802754798121327, 'subsample': 0.9898870970010124, 'random_strength': 0.8233930707242936, 'one_hot_max_size': 9}. Best is trial 10 with value: 0.8093461563176373.


Best trial: 13. Best value: 0.809831:  35%|███▌      | 14/40 [00:34<00:44,  1.73s/it]

[I 2025-10-20 00:42:10,606] Trial 13 finished with value: 0.8098310087904841 and parameters: {'learning_rate': 0.1311288077894947, 'l2_leaf_reg': 0.07855335469125711, 'rsm': 0.7183875990485815, 'subsample': 0.9589180921395233, 'random_strength': 0.6952232529049115, 'one_hot_max_size': 9}. Best is trial 13 with value: 0.8098310087904841.


Best trial: 13. Best value: 0.809831:  38%|███▊      | 15/40 [00:36<00:40,  1.60s/it]

[I 2025-10-20 00:42:11,923] Trial 14 finished with value: 0.8094586472123669 and parameters: {'learning_rate': 0.14548630146186795, 'l2_leaf_reg': 0.1260072199748943, 'rsm': 0.7013297138299369, 'subsample': 0.9624007735264689, 'random_strength': 0.7085103656807009, 'one_hot_max_size': 9}. Best is trial 13 with value: 0.8098310087904841.


Best trial: 13. Best value: 0.809831:  40%|████      | 16/40 [00:38<00:40,  1.68s/it]

[I 2025-10-20 00:42:13,780] Trial 15 finished with value: 0.8088040959339654 and parameters: {'learning_rate': 0.08725331164726309, 'l2_leaf_reg': 0.11467478853113353, 'rsm': 0.7076648484431629, 'subsample': 0.9449901657199126, 'random_strength': 0.6698279556395575, 'one_hot_max_size': 9}. Best is trial 13 with value: 0.8098310087904841.


Best trial: 13. Best value: 0.809831:  42%|████▎     | 17/40 [00:39<00:33,  1.45s/it]

[I 2025-10-20 00:42:14,709] Trial 16 pruned. 


Best trial: 13. Best value: 0.809831:  45%|████▌     | 18/40 [00:41<00:36,  1.67s/it]

[I 2025-10-20 00:42:16,874] Trial 17 pruned. 


Best trial: 13. Best value: 0.809831:  48%|████▊     | 19/40 [00:42<00:30,  1.44s/it]

[I 2025-10-20 00:42:17,769] Trial 18 pruned. 


Best trial: 13. Best value: 0.809831:  50%|█████     | 20/40 [00:44<00:35,  1.77s/it]

[I 2025-10-20 00:42:20,329] Trial 19 finished with value: 0.8090302017985556 and parameters: {'learning_rate': 0.06381068560810489, 'l2_leaf_reg': 0.2253500899803306, 'rsm': 0.8454147319972026, 'subsample': 0.8723282348750757, 'random_strength': 0.840537191728312, 'one_hot_max_size': 8}. Best is trial 13 with value: 0.8098310087904841.


Best trial: 13. Best value: 0.809831:  52%|█████▎    | 21/40 [00:46<00:31,  1.66s/it]

[I 2025-10-20 00:42:21,710] Trial 20 pruned. 


Best trial: 21. Best value: 0.810036:  55%|█████▌    | 22/40 [00:47<00:28,  1.58s/it]

[I 2025-10-20 00:42:23,119] Trial 21 finished with value: 0.8100361554619809 and parameters: {'learning_rate': 0.14298858658572536, 'l2_leaf_reg': 0.01984776103344128, 'rsm': 0.830829950071684, 'subsample': 0.9728671891290068, 'random_strength': 0.7584062787941209, 'one_hot_max_size': 10}. Best is trial 21 with value: 0.8100361554619809.


Best trial: 21. Best value: 0.810036:  57%|█████▊    | 23/40 [00:48<00:23,  1.40s/it]

[I 2025-10-20 00:42:24,102] Trial 22 pruned. 


Best trial: 21. Best value: 0.810036:  60%|██████    | 24/40 [00:49<00:19,  1.21s/it]

[I 2025-10-20 00:42:24,851] Trial 23 pruned. 


Best trial: 21. Best value: 0.810036:  62%|██████▎   | 25/40 [00:50<00:17,  1.14s/it]

[I 2025-10-20 00:42:25,838] Trial 24 pruned. 


Best trial: 21. Best value: 0.810036:  65%|██████▌   | 26/40 [00:50<00:14,  1.04s/it]

[I 2025-10-20 00:42:26,651] Trial 25 pruned. 


Best trial: 21. Best value: 0.810036:  68%|██████▊   | 27/40 [00:51<00:12,  1.04it/s]

[I 2025-10-20 00:42:27,420] Trial 26 pruned. 


Best trial: 21. Best value: 0.810036:  70%|███████   | 28/40 [00:53<00:13,  1.16s/it]

[I 2025-10-20 00:42:29,049] Trial 27 pruned. 


Best trial: 21. Best value: 0.810036:  72%|███████▎  | 29/40 [00:54<00:13,  1.23s/it]

[I 2025-10-20 00:42:30,445] Trial 28 finished with value: 0.8083717013057946 and parameters: {'learning_rate': 0.12596369362725196, 'l2_leaf_reg': 0.013190622031389281, 'rsm': 0.8306474970051926, 'subsample': 0.8868678110399549, 'random_strength': 0.7791181190512122, 'one_hot_max_size': 11}. Best is trial 21 with value: 0.8100361554619809.


Best trial: 21. Best value: 0.810036:  75%|███████▌  | 30/40 [00:56<00:12,  1.26s/it]

[I 2025-10-20 00:42:31,788] Trial 29 pruned. 


Best trial: 21. Best value: 0.810036:  78%|███████▊  | 31/40 [00:58<00:13,  1.48s/it]

[I 2025-10-20 00:42:33,779] Trial 30 pruned. 


Best trial: 31. Best value: 0.810674:  80%|████████  | 32/40 [00:59<00:11,  1.41s/it]

[I 2025-10-20 00:42:35,018] Trial 31 finished with value: 0.8106736789042326 and parameters: {'learning_rate': 0.1469444268239764, 'l2_leaf_reg': 0.029549134596372463, 'rsm': 0.8251054272088577, 'subsample': 0.9985683049575292, 'random_strength': 0.8939262780900099, 'one_hot_max_size': 10}. Best is trial 31 with value: 0.8106736789042326.


Best trial: 31. Best value: 0.810674:  82%|████████▎ | 33/40 [01:00<00:08,  1.24s/it]

[I 2025-10-20 00:42:35,858] Trial 32 pruned. 


Best trial: 31. Best value: 0.810674:  85%|████████▌ | 34/40 [01:01<00:06,  1.16s/it]

[I 2025-10-20 00:42:36,823] Trial 33 pruned. 


Best trial: 31. Best value: 0.810674:  88%|████████▊ | 35/40 [01:01<00:05,  1.07s/it]

[I 2025-10-20 00:42:37,674] Trial 34 pruned. 


Best trial: 31. Best value: 0.810674:  90%|█████████ | 36/40 [01:03<00:04,  1.13s/it]

[I 2025-10-20 00:42:38,959] Trial 35 pruned. 


Best trial: 31. Best value: 0.810674:  92%|█████████▎| 37/40 [01:05<00:04,  1.37s/it]

[I 2025-10-20 00:42:40,899] Trial 36 finished with value: 0.8096112489686735 and parameters: {'learning_rate': 0.08245437508776873, 'l2_leaf_reg': 0.3073519792979251, 'rsm': 0.8921188212816688, 'subsample': 0.9748795855257966, 'random_strength': 0.4288084220895492, 'one_hot_max_size': 8}. Best is trial 31 with value: 0.8106736789042326.


Best trial: 31. Best value: 0.810674:  95%|█████████▌| 38/40 [01:06<00:02,  1.25s/it]

[I 2025-10-20 00:42:41,865] Trial 37 pruned. 


Best trial: 31. Best value: 0.810674:  98%|█████████▊| 39/40 [01:07<00:01,  1.34s/it]

[I 2025-10-20 00:42:43,420] Trial 38 finished with value: 0.8089459100087287 and parameters: {'learning_rate': 0.10102941245139367, 'l2_leaf_reg': 0.030621781901493637, 'rsm': 0.959730278205339, 'subsample': 0.9454306910677724, 'random_strength': 0.405317946886742, 'one_hot_max_size': 7}. Best is trial 31 with value: 0.8106736789042326.


Best trial: 31. Best value: 0.810674: 100%|██████████| 40/40 [01:08<00:00,  1.71s/it]


[I 2025-10-20 00:42:44,281] Trial 39 pruned. 

=== Optuna 結果 ===
Best CV accuracy: 0.81067
learning_rate: 0.1469444268239764
l2_leaf_reg: 0.029549134596372463
rsm: 0.8251054272088577
subsample: 0.9985683049575292
random_strength: 0.8939262780900099
one_hot_max_size: 10

最終モデル学習完了


In [20]:
preds = final_model.predict(X_test_full)


passenger_ids = test["PassengerId"]

# DataFrame にまとめる
submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Transported": preds
})

# CSV 出力（インデックスは不要）
submission.to_csv("submission_catboost.csv", index=False)

submission['Transported'].value_counts()

Transported
True     2179
False    2098
Name: count, dtype: int64

## catboost byさわぐち

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import accuracy_score
import catboost as cb
import optuna
import warnings

warnings.filterwarnings('ignore', category=UserWarning)

# =========================
# 1. データ読み込み
# =========================
train = pd.read_csv("../dataset/train.csv")
test = pd.read_csv("../dataset/test.csv")

def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """欠損値を補完せずに特徴量を生成する"""
    data = df.copy()

    # Cabin情報を分解（欠損はNaNのまま保持）
    if 'Cabin' in data.columns:
        cabin_split = data['Cabin'].str.split('/', expand=True)
        data['Deck'] = cabin_split[0]
        data['Num'] = pd.to_numeric(cabin_split[1], errors='coerce')  # NaNそのまま
        data['Side'] = cabin_split[2]
    else:
        data['Deck'] = np.nan
        data['Num'] = np.nan
        data['Side'] = np.nan

    # 支出関連特徴量（NaNはそのまま残す）
    expense_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    two_expense = ['FoodCourt', 'ShoppingMall']
    three_expense = ['RoomService', 'Spa', 'VRDeck']
    for col in expense_cols:
        if col not in data.columns:
            data[col] = np.nan

    data['TotalExpense'] = data[expense_cols].sum(axis=1, skipna=False)
    data['TwoExpense'] = data[two_expense].sum(axis=1, skipna=False)
    data['ThreeExpense'] = data[three_expense].sum(axis=1, skipna=False)
    data['HasExpense'] = data[expense_cols].gt(0).any(axis=1).astype(int)

    # 年齢グループ（NaNはNaNのまま）
    bins = [0, 5, 13, 18, 60, 100]
    labels = ['Child', 'Teen', 'Young', 'Adult', 'Senior']
    data['AgeGroup'] = pd.cut(data['Age'], bins=bins, labels=labels, right=False)

  # Name情報から家族サイズを生成（欠損はNaN）
    if 'Name' in data.columns:
        data['FamilyName'] = data['Name'].str.split().str[-1]
        family_counts = data['FamilyName'].value_counts()
        data['FamilySize'] = data['FamilyName'].map(family_counts)
        data['IsAlone'] = (data['FamilySize'] == 1).astype(int)
    # PassengerId情報（欠損時はNaN）
    if 'PassengerId' in data.columns:
        passenger_split = data['PassengerId'].str.split('_', expand=True)
        data['PassengerGroup'] = passenger_split[0]
        data['PassengerNum'] = pd.to_numeric(passenger_split[1], errors='coerce')
        
    # 組み合わせ特徴量の追加
    num_bins = [0, 250, 500, 750, 2000]
    num_labels = ['Q1', 'Q2', 'Q3', 'Q4']
    data['NumQuartile'] = pd.cut(data['Num'], bins=num_bins, labels=num_labels, right=False)
    data['Cabin_Loc'] = data['Deck'].astype(str) + '_' + data['NumQuartile'].astype(str)
    data['NoExpense_Cryo'] = ((data['CryoSleep'] == True) & (data['TotalExpense'] == 0)).astype(object)
    data['Cryo_VIP'] = data['CryoSleep'].astype(str) + '_' + data['VIP'].astype(str)
    data['Planet_AgeGroup'] = data['HomePlanet'].astype(str) + '_' + data['AgeGroup'].astype(str)
    data['Deck_Side'] = data['Deck'].astype(str) + '_' + data['Side'].astype(str)
    data['Full_Travel_Combo'] = (data['HomePlanet'].astype(str) + '_' + 
                                 data['Destination'].astype(str) + '_' + 
                                 data['CryoSleep'].astype(str))
    if 'FamilyName' in data.columns:
        data['Family_Group'] = data['FamilyName'].astype(str) + '_' + data['PassengerGroup'].astype(str)
    else:
        data['Family_Group'] = np.nan
    return data
    
# =========================================================
# 2. 特徴量生成とデータ準備
# =========================================================
train_features = create_features(train)
test_features = create_features(test)

feature_cols = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Side', 'Num',
    'TwoExpense', 'ThreeExpense', 'AgeGroup',
    'Cabin_Loc', 'NoExpense_Cryo', 'Cryo_VIP', 'Planet_AgeGroup',
    'Deck_Side', 'Full_Travel_Combo',
]

X_full = train_features[feature_cols].copy()
X_test_full = test_features[feature_cols].copy()
y_full = train['Transported']

categorical_features = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Side', 'AgeGroup',
    'Cabin_Loc', 'NoExpense_Cryo', 'Cryo_VIP', 'Planet_AgeGroup',
    'Deck_Side', 'Full_Travel_Combo',
]

for col in categorical_features:
    X_full[col] = X_full[col].astype(str).replace('nan', 'NA').replace('None', 'NA')
    X_test_full[col] = X_test_full[col].astype(str).replace('nan', 'NA').replace('None', 'NA')

groups = train_features['PassengerGroup'].fillna('Unknown').astype(str)

print(f"使用特徴量数: {len(feature_cols)}")
print(f"特徴量一覧: {feature_cols}")
print(f"データシェイプ: Train {X_full.shape}, Test {X_test_full.shape}")


def fix_catboost_cats(df: pd.DataFrame, cat_cols: list[str]) -> pd.DataFrame:
    """CatBoost用にカテゴリ列のNaNを文字列に置換（NaN禁止のため）"""
    df = df.copy()
    for c in cat_cols:
        df[c] = df[c].astype('object')          # object化（bool/category混在OK）
        df[c] = df[c].where(df[c].notna(), 'NaN')  # NaN -> 'NaN'
    return df

X_full = fix_catboost_cats(X_full, categorical_features)
X_test_full = fix_catboost_cats(X_test_full, categorical_features)

# CatBoostは列インデックス指定
cat_features_idx = [feature_cols.index(c) for c in categorical_features]

print(f"使用特徴量数: {len(feature_cols)}")
print(f"特徴量一覧: {feature_cols}")
print(f"データシェイプ: Train {X_full.shape}, Test {X_test_full.shape}")

# =========================
# 3. Optuna 目的関数（CatBoost）
# =========================
def objective(trial: optuna.Trial) -> float:
    params = {
        'loss_function': 'Logloss',
        'eval_metric': 'Logloss',
        'iterations': 600,  # LGBのn_estimatorsに相当
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        'depth': 3,  # LGBのmax_depth=3に対応（固定）
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 1.0, log=True),
        'rsm': trial.suggest_float('rsm', 0.7, 1.0),  # 列サブサンプ（feature_fraction相当）
        'bootstrap_type': 'Bernoulli',
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),  # 行サブサンプ（bagging_fraction相当）

        # 追加（重要パラ）
        'random_strength': trial.suggest_float('random_strength', 0.0, 1.0),
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 12),

        'random_seed': 42,
        'verbose': False,
        'allow_writing_files': False,
    }

    # PassengerGroupでグループ分割（元コード踏襲）
    groups = train_features['PassengerGroup'].astype(str)
    cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

    fold_scores = []
    for fold_idx, (trn_idx, val_idx) in enumerate(cv.split(X_full, y_full, groups=groups), start=1):
        X_trn, X_val = X_full.iloc[trn_idx], X_full.iloc[val_idx]
        y_trn, y_val = y_full.iloc[trn_idx], y_full.iloc[val_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            X_trn, y_trn,
            eval_set=(X_val, y_val),
            cat_features=cat_features_idx,
            use_best_model=True,
            early_stopping_rounds=50,
            verbose=False,
        )

        # 予測（クラスラベル）
        y_pred = model.predict(X_val, prediction_type="Class")
        y_pred = np.array(y_pred).ravel().astype(int)

        acc = accuracy_score(y_val, y_pred)
        fold_scores.append(acc)

        trial.report(float(np.mean(fold_scores)), step=fold_idx)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(fold_scores))

# =========================
# 4. Optuna 実行
# =========================
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=2)
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=40, show_progress_bar=True)

print("\n=== Optuna 結果 ===")
print(f"Best CV accuracy: {study.best_value:.5f}")
for k, v in study.best_params.items():
    print(f"{k}: {v}")

# =========================
# 5. 最終モデル学習
# =========================
best_params = {
    **study.best_params,
    'loss_function': 'Logloss',
    'eval_metric': 'Logloss',
    'iterations': 600,
    'depth': 3,
    'bootstrap_type': 'Bernoulli',
    'random_seed': 42,
    'verbose': False,
    'allow_writing_files': False,
}
final_model = cb.CatBoostClassifier(**best_params)
final_model.fit(
    X_full, y_full,
    cat_features=cat_features_idx,
    verbose=False
)
print("\n最終モデル学習完了")


[I 2025-10-20 00:44:40,429] A new study created in memory with name: no-name-9eb55eee-e1e2-4fd8-9d99-6dadce7be032


使用特徴量数: 15
特徴量一覧: ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Side', 'Num', 'TwoExpense', 'ThreeExpense', 'AgeGroup', 'Cabin_Loc', 'NoExpense_Cryo', 'Cryo_VIP', 'Planet_AgeGroup', 'Deck_Side', 'Full_Travel_Combo']
データシェイプ: Train (8693, 15), Test (4277, 15)
使用特徴量数: 15
特徴量一覧: ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Side', 'Num', 'TwoExpense', 'ThreeExpense', 'AgeGroup', 'Cabin_Loc', 'NoExpense_Cryo', 'Cryo_VIP', 'Planet_AgeGroup', 'Deck_Side', 'Full_Travel_Combo']
データシェイプ: Train (8693, 15), Test (4277, 15)


Best trial: 0. Best value: 0.813957:   2%|▎         | 1/40 [00:07<04:42,  7.24s/it]

[I 2025-10-20 00:44:47,665] Trial 0 finished with value: 0.8139566998237306 and parameters: {'learning_rate': 0.02757359293934948, 'l2_leaf_reg': 0.7114476009343417, 'rsm': 0.9195981825434215, 'subsample': 0.8795975452591109, 'random_strength': 0.15601864044243652, 'one_hot_max_size': 3}. Best is trial 0 with value: 0.8139566998237306.


Best trial: 0. Best value: 0.813957:   5%|▌         | 2/40 [00:12<03:59,  6.31s/it]

[I 2025-10-20 00:44:53,325] Trial 1 finished with value: 0.8055130291676159 and parameters: {'learning_rate': 0.011703388679635262, 'l2_leaf_reg': 0.39676050770529864, 'rsm': 0.8803345035229626, 'subsample': 0.9124217733388136, 'random_strength': 0.020584494295802447, 'one_hot_max_size': 12}. Best is trial 0 with value: 0.8139566998237306.


Best trial: 2. Best value: 0.81733:   8%|▊         | 3/40 [00:17<03:20,  5.43s/it] 

[I 2025-10-20 00:44:57,701] Trial 2 finished with value: 0.8173299401798364 and parameters: {'learning_rate': 0.09528587217040241, 'l2_leaf_reg': 0.004335281794951566, 'rsm': 0.7545474901621302, 'subsample': 0.7550213529560301, 'random_strength': 0.3042422429595377, 'one_hot_max_size': 7}. Best is trial 2 with value: 0.8173299401798364.


Best trial: 2. Best value: 0.81733:  10%|█         | 4/40 [00:23<03:24,  5.69s/it]

[I 2025-10-20 00:45:03,802] Trial 3 finished with value: 0.8149843044228688 and parameters: {'learning_rate': 0.032211189352044464, 'l2_leaf_reg': 0.007476312062252299, 'rsm': 0.8835558684167139, 'subsample': 0.7418481581956126, 'random_strength': 0.29214464853521815, 'one_hot_max_size': 6}. Best is trial 2 with value: 0.8173299401798364.


Best trial: 2. Best value: 0.81733:  12%|█▎        | 5/40 [00:30<03:41,  6.32s/it]

[I 2025-10-20 00:45:11,222] Trial 4 finished with value: 0.8129269338141325 and parameters: {'learning_rate': 0.03438586247938296, 'l2_leaf_reg': 0.22673986523780396, 'rsm': 0.7599021346475079, 'subsample': 0.8542703315240835, 'random_strength': 0.5924145688620425, 'one_hot_max_size': 2}. Best is trial 2 with value: 0.8173299401798364.


Best trial: 2. Best value: 0.81733:  15%|█▌        | 6/40 [00:36<03:26,  6.07s/it]

[I 2025-10-20 00:45:16,817] Trial 5 finished with value: 0.8169829320833475 and parameters: {'learning_rate': 0.05182367293641893, 'l2_leaf_reg': 0.0032476735706274485, 'rsm': 0.7195154778955838, 'subsample': 0.984665661176, 'random_strength': 0.9656320330745594, 'one_hot_max_size': 10}. Best is trial 2 with value: 0.8173299401798364.


Best trial: 2. Best value: 0.81733:  18%|█▊        | 7/40 [00:39<02:43,  4.94s/it]

[I 2025-10-20 00:45:19,439] Trial 6 pruned. 


Best trial: 2. Best value: 0.81733:  20%|██        | 8/40 [00:41<02:14,  4.21s/it]

[I 2025-10-20 00:45:22,091] Trial 7 pruned. 


Best trial: 2. Best value: 0.81733:  22%|██▎       | 9/40 [00:47<02:26,  4.72s/it]

[I 2025-10-20 00:45:27,931] Trial 8 finished with value: 0.8151355273497067 and parameters: {'learning_rate': 0.04395225692486303, 'l2_leaf_reg': 0.0035856126103453977, 'rsm': 0.9908753883293676, 'subsample': 0.9325398470083344, 'random_strength': 0.9394989415641891, 'one_hot_max_size': 11}. Best is trial 2 with value: 0.8173299401798364.


Best trial: 2. Best value: 0.81733:  25%|██▌       | 10/40 [00:50<02:03,  4.12s/it]

[I 2025-10-20 00:45:30,715] Trial 9 pruned. 


Best trial: 2. Best value: 0.81733:  28%|██▊       | 11/40 [00:54<01:58,  4.07s/it]

[I 2025-10-20 00:45:34,668] Trial 10 finished with value: 0.8154646745834444 and parameters: {'learning_rate': 0.1413229303847895, 'l2_leaf_reg': 0.026301587628514228, 'rsm': 0.8106186318163292, 'subsample': 0.7053885626844458, 'random_strength': 0.6058717234330044, 'one_hot_max_size': 9}. Best is trial 2 with value: 0.8173299401798364.


Best trial: 2. Best value: 0.81733:  30%|███       | 12/40 [00:55<01:34,  3.36s/it]

[I 2025-10-20 00:45:36,397] Trial 11 pruned. 


Best trial: 2. Best value: 0.81733:  32%|███▎      | 13/40 [00:58<01:24,  3.14s/it]

[I 2025-10-20 00:45:39,042] Trial 12 pruned. 


Best trial: 2. Best value: 0.81733:  35%|███▌      | 14/40 [01:03<01:38,  3.80s/it]

[I 2025-10-20 00:45:44,358] Trial 13 finished with value: 0.8170407440217403 and parameters: {'learning_rate': 0.06886666208738036, 'l2_leaf_reg': 0.08297590936422079, 'rsm': 0.7503192490359177, 'subsample': 0.991319978628758, 'random_strength': 0.3801290318405802, 'one_hot_max_size': 10}. Best is trial 2 with value: 0.8173299401798364.


Best trial: 2. Best value: 0.81733:  38%|███▊      | 15/40 [01:06<01:24,  3.38s/it]

[I 2025-10-20 00:45:46,763] Trial 14 pruned. 


Best trial: 2. Best value: 0.81733:  40%|████      | 16/40 [01:09<01:19,  3.30s/it]

[I 2025-10-20 00:45:49,882] Trial 15 finished with value: 0.8171140215612931 and parameters: {'learning_rate': 0.14974124121928395, 'l2_leaf_reg': 0.06954167205816725, 'rsm': 0.8375839737488677, 'subsample': 0.951295734659954, 'random_strength': 0.45799852308238614, 'one_hot_max_size': 8}. Best is trial 2 with value: 0.8173299401798364.


Best trial: 2. Best value: 0.81733:  42%|████▎     | 17/40 [01:12<01:16,  3.31s/it]

[I 2025-10-20 00:45:53,199] Trial 16 finished with value: 0.8156272514030907 and parameters: {'learning_rate': 0.1449306631128225, 'l2_leaf_reg': 0.02758147757839373, 'rsm': 0.834563097795484, 'subsample': 0.9503322867479347, 'random_strength': 0.4834308548088349, 'one_hot_max_size': 8}. Best is trial 2 with value: 0.8173299401798364.


Best trial: 2. Best value: 0.81733:  45%|████▌     | 18/40 [01:14<01:03,  2.90s/it]

[I 2025-10-20 00:45:55,163] Trial 17 pruned. 


Best trial: 2. Best value: 0.81733:  48%|████▊     | 19/40 [01:16<00:55,  2.66s/it]

[I 2025-10-20 00:45:57,270] Trial 18 pruned. 


Best trial: 2. Best value: 0.81733:  50%|█████     | 20/40 [01:19<00:50,  2.54s/it]

[I 2025-10-20 00:45:59,533] Trial 19 pruned. 


Best trial: 2. Best value: 0.81733:  52%|█████▎    | 21/40 [01:22<00:50,  2.68s/it]

[I 2025-10-20 00:46:02,525] Trial 20 pruned. 


Best trial: 21. Best value: 0.818052:  55%|█████▌    | 22/40 [01:27<01:03,  3.51s/it]

[I 2025-10-20 00:46:07,986] Trial 21 finished with value: 0.8180524830825234 and parameters: {'learning_rate': 0.07326620568010764, 'l2_leaf_reg': 0.07633233443387065, 'rsm': 0.7497443966142497, 'subsample': 0.9441981116733494, 'random_strength': 0.4151440774191397, 'one_hot_max_size': 11}. Best is trial 21 with value: 0.8180524830825234.


Best trial: 21. Best value: 0.818052:  57%|█████▊    | 23/40 [01:32<01:07,  3.94s/it]

[I 2025-10-20 00:46:12,927] Trial 22 finished with value: 0.81733714560591 and parameters: {'learning_rate': 0.0919339393607771, 'l2_leaf_reg': 0.19938369265767078, 'rsm': 0.7841888108667638, 'subsample': 0.9563191414197301, 'random_strength': 0.4519666817183315, 'one_hot_max_size': 12}. Best is trial 21 with value: 0.8180524830825234.


Best trial: 21. Best value: 0.818052:  60%|██████    | 24/40 [01:34<00:55,  3.48s/it]

[I 2025-10-20 00:46:15,330] Trial 23 pruned. 


Best trial: 21. Best value: 0.818052:  62%|██████▎   | 25/40 [01:37<00:48,  3.22s/it]

[I 2025-10-20 00:46:17,940] Trial 24 pruned. 


Best trial: 21. Best value: 0.818052:  65%|██████▌   | 26/40 [01:39<00:40,  2.90s/it]

[I 2025-10-20 00:46:20,092] Trial 25 pruned. 


Best trial: 21. Best value: 0.818052:  68%|██████▊   | 27/40 [01:41<00:34,  2.65s/it]

[I 2025-10-20 00:46:22,145] Trial 26 pruned. 


Best trial: 21. Best value: 0.818052:  70%|███████   | 28/40 [01:44<00:32,  2.69s/it]

[I 2025-10-20 00:46:24,952] Trial 27 pruned. 


Best trial: 21. Best value: 0.818052:  72%|███████▎  | 29/40 [01:50<00:40,  3.68s/it]

[I 2025-10-20 00:46:30,917] Trial 28 finished with value: 0.8178209769152158 and parameters: {'learning_rate': 0.05661137572194417, 'l2_leaf_reg': 0.056071705001538866, 'rsm': 0.76242156599724, 'subsample': 0.8818533366556911, 'random_strength': 0.515734936576635, 'one_hot_max_size': 11}. Best is trial 21 with value: 0.8180524830825234.


Best trial: 29. Best value: 0.819072:  75%|███████▌  | 30/40 [01:56<00:43,  4.32s/it]

[I 2025-10-20 00:46:36,756] Trial 29 finished with value: 0.8190722393014198 and parameters: {'learning_rate': 0.05226171882074134, 'l2_leaf_reg': 0.9826740452177546, 'rsm': 0.8193592936801084, 'subsample': 0.8732774399616086, 'random_strength': 0.5471401318275659, 'one_hot_max_size': 11}. Best is trial 29 with value: 0.8190722393014198.


Best trial: 29. Best value: 0.819072:  78%|███████▊  | 31/40 [01:59<00:36,  4.10s/it]

[I 2025-10-20 00:46:40,342] Trial 30 pruned. 


Best trial: 29. Best value: 0.819072:  80%|████████  | 32/40 [02:05<00:37,  4.67s/it]

[I 2025-10-20 00:46:46,339] Trial 31 finished with value: 0.8183402268328364 and parameters: {'learning_rate': 0.05290883025615965, 'l2_leaf_reg': 0.9959567566960239, 'rsm': 0.7704320011076675, 'subsample': 0.8719870069036346, 'random_strength': 0.5462523303292887, 'one_hot_max_size': 12}. Best is trial 29 with value: 0.8190722393014198.


Best trial: 29. Best value: 0.819072:  82%|████████▎ | 33/40 [02:09<00:29,  4.21s/it]

[I 2025-10-20 00:46:49,475] Trial 32 pruned. 


Best trial: 29. Best value: 0.819072:  85%|████████▌ | 34/40 [02:11<00:22,  3.78s/it]

[I 2025-10-20 00:46:52,252] Trial 33 pruned. 


Best trial: 29. Best value: 0.819072:  88%|████████▊ | 35/40 [02:14<00:17,  3.47s/it]

[I 2025-10-20 00:46:55,008] Trial 34 pruned. 


Best trial: 29. Best value: 0.819072:  90%|█████████ | 36/40 [02:17<00:13,  3.26s/it]

[I 2025-10-20 00:46:57,756] Trial 35 pruned. 


Best trial: 29. Best value: 0.819072:  92%|█████████▎| 37/40 [02:19<00:09,  3.06s/it]

[I 2025-10-20 00:47:00,351] Trial 36 pruned. 


Best trial: 29. Best value: 0.819072:  95%|█████████▌| 38/40 [02:22<00:05,  2.90s/it]

[I 2025-10-20 00:47:02,882] Trial 37 pruned. 


Best trial: 29. Best value: 0.819072:  98%|█████████▊| 39/40 [02:25<00:03,  3.03s/it]

[I 2025-10-20 00:47:06,225] Trial 38 pruned. 


Best trial: 29. Best value: 0.819072: 100%|██████████| 40/40 [02:31<00:00,  3.78s/it]


[I 2025-10-20 00:47:11,719] Trial 39 finished with value: 0.8181557068877201 and parameters: {'learning_rate': 0.07033006166215403, 'l2_leaf_reg': 0.04816033445210995, 'rsm': 0.830020688510257, 'subsample': 0.836266505470355, 'random_strength': 0.6198649120051961, 'one_hot_max_size': 9}. Best is trial 29 with value: 0.8190722393014198.

=== Optuna 結果 ===
Best CV accuracy: 0.81907
learning_rate: 0.05226171882074134
l2_leaf_reg: 0.9826740452177546
rsm: 0.8193592936801084
subsample: 0.8732774399616086
random_strength: 0.5471401318275659
one_hot_max_size: 11

最終モデル学習完了


In [22]:
preds = final_model.predict(X_test_full)


passenger_ids = test["PassengerId"]

# DataFrame にまとめる
submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Transported": preds
})

# CSV 出力（インデックスは不要）
submission.to_csv("submission_catboost_many_features.csv", index=False)

submission['Transported'].value_counts()

Transported
True     2192
False    2085
Name: count, dtype: int64