In [49]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import catboost as cb

import optuna
from sklearn.model_selection import StratifiedGroupKFold

In [50]:
train = pd.read_csv("../dataset/train.csv")
test = pd.read_csv("../dataset/test.csv")

In [3]:
train.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

# シンプルに3つだけ特徴量使った場合

In [4]:
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# =========================================================
# 1. 前処理（例: category型を適切に指定）
# =========================================================
X = train[['HomePlanet', 'CryoSleep', 'Destination']].copy()
X_test = test[['HomePlanet', 'CryoSleep', 'Destination']].copy()


cat_cols = ['HomePlanet', 'CryoSleep', 'Destination']
for c in cat_cols:
    X[c] = X[c].astype('category')
    X_test[c] = X_test[c].astype('category')

y = train['Transported']

# =========================================================
# 2. モデルとCV設定
# =========================================================
params = {
    'objective': 'binary',
    'metric': 'accuracy',
    'n_estimators': 500,
    'learning_rate': 0.01,
    'random_state': 40
}

model = lgb.LGBMClassifier(**params)

# StratifiedKFold: ラベル比を保つ5分割
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# =========================================================
# 3. クロスバリデーションループ
# =========================================================
scores = []
for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_valid)
    acc = accuracy_score(y_valid, y_pred)
    scores.append(acc)
    print(f"Fold {fold+1} accuracy: {acc:.5f}")

# =========================================================
# 4. 平均スコア
# =========================================================
print("====================================")
print(f"Mean CV accuracy: {sum(scores)/len(scores):.5f}")


[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380
Fold 1 accuracy: 0.72283
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000256 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0

In [5]:
model.predict(X_test)

array([ True, False,  True, ...,  True, False,  True], shape=(4277,))

In [6]:
# =========================================================
# 5. 特定条件での予測結果分析
# =========================================================
print("=== 特定条件での予測結果分析 ===")

# テストデータで予測を実行
test_predictions = model.predict(X_test)
test_predictions_proba = model.predict_proba(X_test)[:, 1]  # Transported=1の確率

# テストデータに予測結果を追加
test_with_predictions = test.copy()
test_with_predictions['Predicted_Transported'] = test_predictions
test_with_predictions['Predicted_Proba'] = test_predictions_proba

print(f"テストデータのサンプル数: {len(test)}")
print(f"予測結果 - Transported=1の数: {sum(test_predictions)}")
print(f"予測結果 - Transported=0の数: {len(test_predictions) - sum(test_predictions)}")
print(f"全体のTransported予測率: {sum(test_predictions)/len(test_predictions):.3f}")

# 条件1: 冷凍状態 + 行き先が55 Cancri e
condition1 = (test_with_predictions['CryoSleep'] == True) & (test_with_predictions['Destination'] == '55 Cancri e')
cryo_55cancri = test_with_predictions[condition1]

print(f"\n=== 条件1: 冷凍状態 + 行き先55 Cancri e ===")
print(f"該当者数: {len(cryo_55cancri)}")
if len(cryo_55cancri) > 0:
    print(f"Transported予測=1の数: {sum(cryo_55cancri['Predicted_Transported'])}")
    print(f"Transported予測=0の数: {len(cryo_55cancri) - sum(cryo_55cancri['Predicted_Transported'])}")
    print(f"Transported予測率: {sum(cryo_55cancri['Predicted_Transported'])/len(cryo_55cancri):.3f}")
    print(f"平均予測確率: {cryo_55cancri['Predicted_Proba'].mean():.3f}")
    print(f"予測確率の範囲: {cryo_55cancri['Predicted_Proba'].min():.3f} - {cryo_55cancri['Predicted_Proba'].max():.3f}")
    
    # HomePlanet別の内訳
    print("\n--- HomePlanet別の内訳 ---")
    for planet in cryo_55cancri['HomePlanet'].unique():
        planet_data = cryo_55cancri[cryo_55cancri['HomePlanet'] == planet]
        if len(planet_data) > 0:
            print(f"{planet}: {len(planet_data)}人, Transported予測率: {sum(planet_data['Predicted_Transported'])/len(planet_data):.3f}")
else:
    print("該当者なし")

# 条件2: 冷凍状態 + HomePlanet=Europa
condition2 = (test_with_predictions['CryoSleep'] == True) & (test_with_predictions['HomePlanet'] == 'Europa')
cryo_europa = test_with_predictions[condition2]

print(f"\n=== 条件2: 冷凍状態 + HomePlanet=Europa ===")
print(f"該当者数: {len(cryo_europa)}")
if len(cryo_europa) > 0:
    print(f"Transported予測=1の数: {sum(cryo_europa['Predicted_Transported'])}")
    print(f"Transported予測=0の数: {len(cryo_europa) - sum(cryo_europa['Predicted_Transported'])}")
    print(f"Transported予測率: {sum(cryo_europa['Predicted_Transported'])/len(cryo_europa):.3f}")
    print(f"平均予測確率: {cryo_europa['Predicted_Proba'].mean():.3f}")
    print(f"予測確率の範囲: {cryo_europa['Predicted_Proba'].min():.3f} - {cryo_europa['Predicted_Proba'].max():.3f}")
    
    # Destination別の内訳
    print("\n--- Destination別の内訳 ---")
    for dest in cryo_europa['Destination'].unique():
        dest_data = cryo_europa[cryo_europa['Destination'] == dest]
        if len(dest_data) > 0:
            print(f"{dest}: {len(dest_data)}人, Transported予測率: {sum(dest_data['Predicted_Transported'])/len(dest_data):.3f}")
else:
    print("該当者なし")

# 条件3: 冷凍状態 + HomePlanet=Europa + Destination=55 Cancri e（両方の条件を満たす）
condition3 = (test_with_predictions['CryoSleep'] == True) & \
             (test_with_predictions['HomePlanet'] == 'Europa') & \
             (test_with_predictions['Destination'] == '55 Cancri e')
cryo_europa_55cancri = test_with_predictions[condition3]

print(f"\n=== 条件3: 冷凍状態 + Europa + 55 Cancri e ===")
print(f"該当者数: {len(cryo_europa_55cancri)}")
if len(cryo_europa_55cancri) > 0:
    print(f"Transported予測=1の数: {sum(cryo_europa_55cancri['Predicted_Transported'])}")
    print(f"Transported予測=0の数: {len(cryo_europa_55cancri) - sum(cryo_europa_55cancri['Predicted_Transported'])}")
    print(f"Transported予測率: {sum(cryo_europa_55cancri['Predicted_Transported'])/len(cryo_europa_55cancri):.3f}")
    print(f"平均予測確率: {cryo_europa_55cancri['Predicted_Proba'].mean():.3f}")
else:
    print("該当者なし")

# 参考: 冷凍状態全体の予測結果
cryo_all = test_with_predictions[test_with_predictions['CryoSleep'] == True]
print(f"\n=== 参考: 冷凍状態全体 ===")
print(f"冷凍状態の人数: {len(cryo_all)}")
print(f"Transported予測率: {sum(cryo_all['Predicted_Transported'])/len(cryo_all):.3f}")
print(f"平均予測確率: {cryo_all['Predicted_Proba'].mean():.3f}")

=== 特定条件での予測結果分析 ===
テストデータのサンプル数: 4277
予測結果 - Transported=1の数: 1584
予測結果 - Transported=0の数: 2693
全体のTransported予測率: 0.370

=== 条件1: 冷凍状態 + 行き先55 Cancri e ===
該当者数: 364
Transported予測=1の数: 364
Transported予測=0の数: 0
Transported予測率: 1.000
平均予測確率: 0.927
予測確率の範囲: 0.755 - 0.997

--- HomePlanet別の内訳 ---
Europa: 217人, Transported予測率: 1.000
Earth: 94人, Transported予測率: 1.000
Mars: 44人, Transported予測率: 1.000

=== 条件2: 冷凍状態 + HomePlanet=Europa ===
該当者数: 463
Transported予測=1の数: 463
Transported予測=0の数: 0
Transported予測率: 1.000
平均予測確率: 0.987
予測確率の範囲: 0.978 - 0.997

--- Destination別の内訳 ---
55 Cancri e: 217人, Transported予測率: 1.000
TRAPPIST-1e: 230人, Transported予測率: 1.000
PSO J318.5-22: 7人, Transported予測率: 1.000

=== 条件3: 冷凍状態 + Europa + 55 Cancri e ===
該当者数: 217
Transported予測=1の数: 217
Transported予測=0の数: 0
Transported予測率: 1.000
平均予測確率: 0.997

=== 参考: 冷凍状態全体 ===
冷凍状態の人数: 1544
Transported予測率: 1.000
平均予測確率: 0.817


## できるだけ多くの特徴量使った場合

In [None]:
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """欠損値を補完せずに特徴量を生成する"""
    data = df.copy()

    # Cabin情報を分解（欠損はNaNのまま保持）
    if 'Cabin' in data.columns:
        cabin_split = data['Cabin'].str.split('/', expand=True)
        data['Deck'] = cabin_split[0]
        data['Num'] = pd.to_numeric(cabin_split[1], errors='coerce')  # NaNそのまま
        data['Side'] = cabin_split[2]
    else:
        data['Deck'] = np.nan
        data['Num'] = np.nan
        data['Side'] = np.nan

    # 支出関連特徴量（NaNはそのまま残す）
    expense_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in expense_cols:
        if col not in data.columns:
            data[col] = np.nan

    # 総支出額（NaNのまま計算 → NaNを含む場合はNaN）
    data['TotalExpense'] = data[expense_cols].sum(axis=1, skipna=False)

    # 支出が一つでもあるか（NaNを無視）
    data['HasExpense'] = data[expense_cols].gt(0).any(axis=1).astype(int)

    # 年齢グループ（NaNは'Unknown'にせず、NaNのまま）
    bins = [0, 12, 18, 30, 50, 100]
    labels = ['Child', 'Teen', 'Young', 'Adult', 'Senior']
    data['AgeGroup'] = pd.cut(data['Age'], bins=bins, labels=labels)

    # Name情報から家族サイズを生成（欠損はNaN）
    if 'Name' in data.columns:
        data['FamilyName'] = data['Name'].str.split().str[-1]
        family_counts = data['FamilyName'].value_counts()
        data['FamilySize'] = data['FamilyName'].map(family_counts)
        data['IsAlone'] = (data['FamilySize'] == 1).astype(int)
    else:
        data['FamilyName'] = np.nan
        data['FamilySize'] = np.nan
        data['IsAlone'] = np.nan

    # PassengerId情報（欠損時はNaN）
    if 'PassengerId' in data.columns:
        passenger_split = data['PassengerId'].str.split('_', expand=True)
        data['PassengerGroup'] = passenger_split[0]
        data['PassengerNum'] = pd.to_numeric(passenger_split[1], errors='coerce')
    else:
        data['PassengerGroup'] = np.nan
        data['PassengerNum'] = np.nan

    return data

In [8]:

# =========================================================
# 2. 特徴量生成
# =========================================================
train_features = create_features(train)
test_features = create_features(test)

feature_cols = [
    'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
    'Deck', 'Side', 'Num',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'TotalExpense', 'HasExpense', 'AgeGroup', 'FamilySize', 'IsAlone'
]

X_full = train_features[feature_cols].copy()
X_test_full = test_features[feature_cols].copy()
y_full = train['Transported']

# カテゴリ列をcategory型に変換（NaNも保持）
categorical_features = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Deck', 'Side', 'AgeGroup'
]

for col in categorical_features:
    X_full[col] = X_full[col].astype('category')
    X_test_full[col] = X_test_full[col].astype('category')

print(f"使用特徴量数: {len(feature_cols)}")
print(f"特徴量一覧: {feature_cols}")
print(f"データシェイプ: Train {X_full.shape}, Test {X_test_full.shape}")

# =========================================================
# 3. LightGBM設定 + 5分割CV
# =========================================================
params_full = {
    'objective': 'binary',
    'metric': 'accuracy',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 6,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'random_state': 42,
    'verbose': -1,
}

cv_full = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores_full = []
feature_importance_list = []

print("\n=== 5-Fold Cross Validation (欠損値未補完版) ===")
for fold, (train_idx, valid_idx) in enumerate(cv_full.split(X_full, y_full)):
    X_train_full, X_valid_full = X_full.iloc[train_idx], X_full.iloc[valid_idx]
    y_train_full, y_valid_full = y_full.iloc[train_idx], y_full.iloc[valid_idx]

    model = lgb.LGBMClassifier(**params_full)
    model.fit(X_train_full, y_train_full)

    y_pred_full = model.predict(X_valid_full)
    acc_full = accuracy_score(y_valid_full, y_pred_full)
    scores_full.append(acc_full)

    feature_importance_list.append(model.feature_importances_)

    print(f"Fold {fold+1} accuracy: {acc_full:.5f}")

# =========================================================
# 4. 結果まとめ
# =========================================================
print("=" * 50)
print(f"Mean CV accuracy (欠損未補完): {np.mean(scores_full):.5f}")
print(f"Standard deviation: {np.std(scores_full):.5f}")

feature_importance_avg = np.mean(feature_importance_list, axis=0)
feature_importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': feature_importance_avg
}).sort_values('importance', ascending=False)

print("\n=== Top 10 重要特徴量 ===")
print(feature_importance_df.head(10).to_string(index=False))

# =========================================================
# 5. 全データで再学習
# =========================================================
print("\n=== 全データで最終モデルを学習 ===")
final_model = lgb.LGBMClassifier(**params_full)
final_model.fit(X_full, y_full)
print("最終モデル学習完了")


使用特徴量数: 18
特徴量一覧: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'Deck', 'Side', 'Num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalExpense', 'HasExpense', 'AgeGroup', 'FamilySize', 'IsAlone']
データシェイプ: Train (8693, 18), Test (4277, 18)

=== 5-Fold Cross Validation (欠損値未補完版) ===
Fold 1 accuracy: 0.80851
Fold 2 accuracy: 0.80679
Fold 3 accuracy: 0.80449
Fold 4 accuracy: 0.80437
Fold 5 accuracy: 0.79287
Mean CV accuracy (欠損未補完): 0.80340
Standard deviation: 0.00549

=== Top 10 重要特徴量 ===
     feature  importance
         Num      5549.2
         Age      3766.8
TotalExpense      3067.0
  FamilySize      2313.6
   FoodCourt      1945.2
ShoppingMall      1912.8
         Spa      1891.4
      VRDeck      1856.8
 RoomService      1838.2
 Destination       727.8

=== 全データで最終モデルを学習 ===
最終モデル学習完了


## 特徴量を吟味した場合

最終的にここにある特徴量で出してる
feature_cols = [
    'HomePlanet', 'CryoSleep', 'Destination',  'VIP',
    'Side', 'Num',
    'FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck',
    'TotalExpense', 'AgeGroup'
]



In [51]:
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """欠損値を補完せずに特徴量を生成する"""
    data = df.copy()

    # Cabin情報を分解（欠損はNaNのまま保持）
    if 'Cabin' in data.columns:
        cabin_split = data['Cabin'].str.split('/', expand=True)
        data['Deck'] = cabin_split[0]
        data['Num'] = pd.to_numeric(cabin_split[1], errors='coerce')  # NaNそのまま
        data['Side'] = cabin_split[2]
    else:
        data['Deck'] = np.nan
        data['Num'] = np.nan
        data['Side'] = np.nan

    # 支出関連特徴量（NaNはそのまま残す）
    expense_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in expense_cols:
        if col not in data.columns:
            data[col] = np.nan

    # 総支出額（NaNのまま計算 → NaNを含む場合はNaN）
    data['TotalExpense'] = data[expense_cols].sum(axis=1, skipna=False)

    # 支出が一つでもあるか（NaNを無視）
    data['HasExpense'] = data[expense_cols].gt(0).any(axis=1).astype(int)

    # 年齢グループ（NaNは'Unknown'にせず、NaNのまま）
    bins = [0, 12, 18, 30, 50, 100]
    labels = ['Child', 'Teen', 'Young', 'Adult', 'Senior']
    data['AgeGroup'] = pd.cut(data['Age'], bins=bins, labels=labels)

    # Name情報から家族サイズを生成（欠損はNaN）
    if 'Name' in data.columns:
        data['FamilyName'] = data['Name'].str.split().str[-1]
        family_counts = data['FamilyName'].value_counts()
        data['FamilySize'] = data['FamilyName'].map(family_counts)
        data['IsAlone'] = (data['FamilySize'] == 1).astype(int)
    else:
        data['FamilyName'] = np.nan
        data['FamilySize'] = np.nan
        data['IsAlone'] = np.nan

    # PassengerId情報（欠損時はNaN）
    if 'PassengerId' in data.columns:
        passenger_split = data['PassengerId'].str.split('_', expand=True)
        data['PassengerGroup'] = passenger_split[0]
        data['PassengerNum'] = pd.to_numeric(passenger_split[1], errors='coerce')
    else:
        data['PassengerGroup'] = np.nan
        data['PassengerNum'] = np.nan

    return data

In [17]:

# =========================================================
# 2. 特徴量生成
# =========================================================
train_features = create_features(train)
test_features = create_features(test)

feature_cols = [
    'HomePlanet', 'CryoSleep', 'Destination',  'VIP',
    'Side', 'Num',
    'FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck',
    'TotalExpense', 'AgeGroup'
]
# 'Age', 'Deck', 'FamilySize', 'IsAlone', 'HasExpense',

X_full = train_features[feature_cols].copy()
X_test_full = test_features[feature_cols].copy()
y_full = train['Transported']

# カテゴリ列をcategory型に変換（NaNも保持）
categorical_features = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Side', 'AgeGroup'
]

for col in categorical_features:
    X_full[col] = X_full[col].astype('category')
    X_test_full[col] = X_test_full[col].astype('category')

print(f"使用特徴量数: {len(feature_cols)}")
print(f"特徴量一覧: {feature_cols}")
print(f"データシェイプ: Train {X_full.shape}, Test {X_test_full.shape}")

# =========================================================
# 3. LightGBM設定 + 5分割CV
# =========================================================
params_full = {
    'objective': 'binary',
    'metric': 'accuracy',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 6,
    'num_leaves': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'random_state': 42,
    'verbose': -1,
}

cv_full = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores_full = []
feature_importance_list = []

print("\n=== 5-Fold Cross Validation (欠損値未補完版) ===")
for fold, (train_idx, valid_idx) in enumerate(cv_full.split(X_full, y_full)):
    X_train_full, X_valid_full = X_full.iloc[train_idx], X_full.iloc[valid_idx]
    y_train_full, y_valid_full = y_full.iloc[train_idx], y_full.iloc[valid_idx]

    model = lgb.LGBMClassifier(**params_full)
    model.fit(X_train_full, y_train_full)

    y_pred_full = model.predict(X_valid_full)
    acc_full = accuracy_score(y_valid_full, y_pred_full)
    scores_full.append(acc_full)

    feature_importance_list.append(model.feature_importances_)

    print(f"Fold {fold+1} accuracy: {acc_full:.5f}")

# =========================================================
# 4. 結果まとめ
# =========================================================
print("=" * 50)
print(f"Mean CV accuracy (欠損未補完): {np.mean(scores_full):.5f}")
print(f"Standard deviation: {np.std(scores_full):.5f}")

feature_importance_avg = np.mean(feature_importance_list, axis=0)
feature_importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': feature_importance_avg
}).sort_values('importance', ascending=False)

print("\n=== Top 10 重要特徴量 ===")
print(feature_importance_df.head(10).to_string(index=False))

# =========================================================
# 5. 全データで再学習
# =========================================================
print("\n=== 全データで最終モデルを学習 ===")
final_model = lgb.LGBMClassifier(**params_full)
final_model.fit(X_full, y_full)
print("最終モデル学習完了")


使用特徴量数: 13
特徴量一覧: ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Side', 'Num', 'FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalExpense', 'AgeGroup']
データシェイプ: Train (8693, 13), Test (4277, 13)

=== 5-Fold Cross Validation (欠損値未補完版) ===
Fold 1 accuracy: 0.79988
Fold 2 accuracy: 0.79298
Fold 3 accuracy: 0.79356
Fold 4 accuracy: 0.79689
Fold 5 accuracy: 0.78769
Mean CV accuracy (欠損未補完): 0.79420
Standard deviation: 0.00410

=== Top 10 重要特徴量 ===
     feature  importance
         Num      4254.0
TotalExpense      2620.6
      VRDeck      1814.6
ShoppingMall      1813.6
   FoodCourt      1794.2
         Spa      1776.2
 RoomService      1712.6
    AgeGroup      1024.2
  HomePlanet       580.2
        Side       542.0

=== 全データで最終モデルを学習 ===
最終モデル学習完了


In [18]:

def objective(trial: optuna.Trial) -> float:
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'n_estimators': 600,
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 15, 40),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 120),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 1.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 1.0, log=True),
        'random_state': 42,
        # ← 重要: metricはfitで与えるのでパラメータ側では無効化
        'metric': 'None',
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for fold_idx, (trn_idx, val_idx) in enumerate(cv.split(X_full, y_full), start=1):
        X_trn, X_val = X_full.iloc[trn_idx], X_full.iloc[val_idx]
        y_trn, y_val = y_full.iloc[trn_idx], y_full.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)

        callbacks = [
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0),
        ]

        # 重要: eval_metric は 'binary_logloss' か 'auc' など公式名を指定
        model.fit(
            X_trn, y_trn,
            eval_set=[(X_val, y_val)],
            eval_metric='binary_logloss',
            categorical_feature=categorical_features,
            callbacks=callbacks,
        )

        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        fold_scores.append(acc)

        trial.report(float(np.mean(fold_scores)), step=fold_idx)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(fold_scores))

# Optuna 実行（安定のTPE＋MedianPruner）
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=2)
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=40, show_progress_bar=True)

print("\n=== Optuna 結果 ===")
print(f"Best CV accuracy: {study.best_value:.5f}")
for k, v in study.best_params.items():
    print(f"{k}: {v}")

# 最終学習（全データ）: early stoppingは使わない
best_params = study.best_params | {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'n_estimators': 600,
    'metric': 'None',
    'random_state': 42,
}
final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X_full, y_full, categorical_feature=categorical_features)
print("\n最終モデル学習完了")


[I 2025-10-13 21:39:42,815] A new study created in memory with name: no-name-38e2753d-f34d-430f-aad5-d3647cc6e155
Best trial: 0. Best value: 0.804785:   2%|▎         | 1/40 [00:03<01:59,  3.06s/it]

[I 2025-10-13 21:39:45,877] Trial 0 finished with value: 0.804784570580423 and parameters: {'learning_rate': 0.047377279007281566, 'num_leaves': 39, 'max_depth': 5, 'min_data_in_leaf': 76, 'feature_fraction': 0.7468055921327309, 'bagging_fraction': 0.7467983561008608, 'bagging_freq': 1, 'lambda_l1': 0.39676050770529864, 'lambda_l2': 0.06358358856676251}. Best is trial 0 with value: 0.804784570580423.


Best trial: 1. Best value: 0.80651:   5%|▌         | 2/40 [00:04<01:24,  2.22s/it] 

[I 2025-10-13 21:39:47,514] Trial 1 finished with value: 0.8065095014462104 and parameters: {'learning_rate': 0.10211806418789511, 'num_leaves': 15, 'max_depth': 6, 'min_data_in_leaf': 102, 'feature_fraction': 0.7637017332034828, 'bagging_fraction': 0.7545474901621302, 'bagging_freq': 1, 'lambda_l1': 0.008179499475211672, 'lambda_l2': 0.03752055855124281}. Best is trial 1 with value: 0.8065095014462104.


Best trial: 1. Best value: 0.80651:   8%|▊         | 3/40 [00:07<01:33,  2.52s/it]

[I 2025-10-13 21:39:50,389] Trial 2 finished with value: 0.8038645015752476 and parameters: {'learning_rate': 0.05407232133324004, 'num_leaves': 22, 'max_depth': 5, 'min_data_in_leaf': 25, 'feature_fraction': 0.7876433945605654, 'bagging_fraction': 0.8099085529881075, 'bagging_freq': 3, 'lambda_l1': 0.22673986523780396, 'lambda_l2': 0.003972110727381913}. Best is trial 1 with value: 0.8065095014462104.


Best trial: 1. Best value: 0.80651:  10%|█         | 4/40 [00:09<01:18,  2.18s/it]

[I 2025-10-13 21:39:52,049] Trial 3 finished with value: 0.8024840010296515 and parameters: {'learning_rate': 0.06535283531563524, 'num_leaves': 30, 'max_depth': 3, 'min_data_in_leaf': 77, 'feature_fraction': 0.7511572371061874, 'bagging_fraction': 0.7195154778955838, 'bagging_freq': 5, 'lambda_l1': 0.788671412999049, 'lambda_l2': 0.26619018884890555}. Best is trial 1 with value: 0.8065095014462104.


Best trial: 1. Best value: 0.80651:  12%|█▎        | 5/40 [00:12<01:34,  2.70s/it]

[I 2025-10-13 21:39:55,676] Trial 4 finished with value: 0.8042095274521884 and parameters: {'learning_rate': 0.040331443383617174, 'num_leaves': 17, 'max_depth': 5, 'min_data_in_leaf': 58, 'feature_fraction': 0.7366114704534336, 'bagging_fraction': 0.848553073033381, 'bagging_freq': 1, 'lambda_l1': 0.5345166110646816, 'lambda_l2': 0.005975027999960293}. Best is trial 1 with value: 0.8065095014462104.


Best trial: 1. Best value: 0.80651:  15%|█▌        | 6/40 [00:13<01:10,  2.07s/it]

[I 2025-10-13 21:39:56,513] Trial 5 pruned. 


Best trial: 1. Best value: 0.80651:  18%|█▊        | 7/40 [00:14<00:53,  1.63s/it]

[I 2025-10-13 21:39:57,251] Trial 6 pruned. 


Best trial: 1. Best value: 0.80651:  18%|█▊        | 7/40 [00:16<01:16,  2.31s/it]

[W 2025-10-13 21:39:58,967] Trial 7 failed with parameters: {'learning_rate': 0.04547611147126364, 'num_leaves': 22, 'max_depth': 5, 'min_data_in_leaf': 25, 'feature_fraction': 0.9406590942262119, 'bagging_fraction': 0.7223651931039312, 'bagging_freq': 5, 'lambda_l1': 0.20736445177905022, 'lambda_l2': 0.0039459088110999985} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/yhashiguchi/Library/Mobile Documents/com~apple~CloudDocs/Desktop/3A/kaggle/spaceship-titanic-ut-komaba-2025/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/8x/_6ksv3c137v05nnyhyck18xw0000gn/T/ipykernel_99017/3335615857.py", line 36, in objective
    model.fit(
  File "/Users/yhashiguchi/Library/Mobile Documents/com~apple~CloudDocs/Desktop/3A/kaggle/spaceship-titanic-ut-komaba-2025/.venv/lib/python3.12/site-packages/lightgbm/sklearn.py", li




KeyboardInterrupt: 

## 家族を同じCV splitに

In [22]:
def objective(trial: optuna.Trial) -> float:
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'n_estimators': 600,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 15, 50),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 1.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 1.0, log=True),
        'random_state': 42,
        'metric': 'None',  # metricはfitで与える
    }

    # --- 重要: 同じ苗字の人を同じsplitに入れる ---
    # create_featuresで作った FamilyName列を使う
    groups = train_features['FamilyName'].fillna('Unknown').astype(str)

    cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for fold_idx, (trn_idx, val_idx) in enumerate(cv.split(X_full, y_full, groups=groups), start=1):
        X_trn, X_val = X_full.iloc[trn_idx], X_full.iloc[val_idx]
        y_trn, y_val = y_full.iloc[trn_idx], y_full.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)
        callbacks = [
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0),
        ]

        model.fit(
            X_trn, y_trn,
            eval_set=[(X_val, y_val)],
            eval_metric='binary_logloss',  # LightGBM標準指標
            categorical_feature=categorical_features,
            callbacks=callbacks,
        )

        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        fold_scores.append(acc)

        trial.report(float(np.mean(fold_scores)), step=fold_idx)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(fold_scores))

# --- Optuna設定 ---
sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_warmup_steps=2)
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=40, show_progress_bar=True)

print("\n=== Optuna 結果 ===")
print(f"Best CV accuracy: {study.best_value:.5f}")
for k, v in study.best_params.items():
    print(f"{k}: {v}")

# --- 最終モデル学習 ---
best_params = study.best_params | {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'n_estimators': 600,
    'metric': 'None',
    'random_state': 42,
}
final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X_full, y_full, categorical_feature=categorical_features)
print("\n最終モデル学習完了")


[I 2025-10-13 21:49:29,645] A new study created in memory with name: no-name-c5d13773-48b5-41e5-9eaf-c607291e117f
Best trial: 0. Best value: 0.802922:   2%|▎         | 1/40 [00:04<02:58,  4.57s/it]

[I 2025-10-13 21:49:34,214] Trial 0 finished with value: 0.802922091220713 and parameters: {'learning_rate': 0.02757359293934948, 'num_leaves': 49, 'max_depth': 5, 'min_data_in_leaf': 64, 'feature_fraction': 0.7468055921327309, 'bagging_fraction': 0.7467983561008608, 'bagging_freq': 1, 'lambda_l1': 0.39676050770529864, 'lambda_l2': 0.06358358856676251}. Best is trial 0 with value: 0.802922091220713.


Best trial: 1. Best value: 0.803543:   5%|▌         | 2/40 [00:06<02:04,  3.28s/it]

[I 2025-10-13 21:49:36,593] Trial 1 finished with value: 0.8035430110705348 and parameters: {'learning_rate': 0.06803900745073703, 'num_leaves': 15, 'max_depth': 6, 'min_data_in_leaf': 85, 'feature_fraction': 0.7637017332034828, 'bagging_fraction': 0.7545474901621302, 'bagging_freq': 1, 'lambda_l1': 0.008179499475211672, 'lambda_l2': 0.03752055855124281}. Best is trial 1 with value: 0.8035430110705348.


Best trial: 1. Best value: 0.803543:   8%|▊         | 3/40 [00:11<02:28,  4.02s/it]

[I 2025-10-13 21:49:41,492] Trial 2 finished with value: 0.8027114501312805 and parameters: {'learning_rate': 0.032211189352044464, 'num_leaves': 25, 'max_depth': 5, 'min_data_in_leaf': 22, 'feature_fraction': 0.7876433945605654, 'bagging_fraction': 0.8099085529881075, 'bagging_freq': 3, 'lambda_l1': 0.22673986523780396, 'lambda_l2': 0.003972110727381913}. Best is trial 1 with value: 0.8035430110705348.


Best trial: 3. Best value: 0.803773:  10%|█         | 4/40 [00:14<02:03,  3.43s/it]

[I 2025-10-13 21:49:44,026] Trial 3 finished with value: 0.8037732106702398 and parameters: {'learning_rate': 0.04025192252635066, 'num_leaves': 36, 'max_depth': 3, 'min_data_in_leaf': 65, 'feature_fraction': 0.7511572371061874, 'bagging_fraction': 0.7195154778955838, 'bagging_freq': 5, 'lambda_l1': 0.788671412999049, 'lambda_l2': 0.26619018884890555}. Best is trial 3 with value: 0.8037732106702398.


Best trial: 3. Best value: 0.803773:  12%|█▎        | 5/40 [00:19<02:23,  4.11s/it]

[I 2025-10-13 21:49:49,348] Trial 4 finished with value: 0.8033341657659279 and parameters: {'learning_rate': 0.022816739880816207, 'num_leaves': 18, 'max_depth': 5, 'min_data_in_leaf': 50, 'feature_fraction': 0.7366114704534336, 'bagging_fraction': 0.848553073033381, 'bagging_freq': 1, 'lambda_l1': 0.5345166110646816, 'lambda_l2': 0.005975027999960293}. Best is trial 3 with value: 0.8037732106702398.


Best trial: 3. Best value: 0.803773:  15%|█▌        | 6/40 [00:20<01:46,  3.12s/it]

[I 2025-10-13 21:49:50,538] Trial 5 pruned. 


Best trial: 6. Best value: 0.805594:  18%|█▊        | 7/40 [00:23<01:37,  2.96s/it]

[I 2025-10-13 21:49:53,175] Trial 6 finished with value: 0.8055943350334417 and parameters: {'learning_rate': 0.05048762470240496, 'num_leaves': 48, 'max_depth': 3, 'min_data_in_leaf': 27, 'feature_fraction': 0.7135681866731614, 'bagging_fraction': 0.7975990992289793, 'bagging_freq': 2, 'lambda_l1': 0.006516990611177174, 'lambda_l2': 0.3063462210622081}. Best is trial 6 with value: 0.8055943350334417.


Best trial: 6. Best value: 0.805594:  20%|██        | 8/40 [00:25<01:26,  2.71s/it]

[I 2025-10-13 21:49:55,343] Trial 7 pruned. 


Best trial: 6. Best value: 0.805594:  22%|██▎       | 9/40 [00:32<02:04,  4.02s/it]

[I 2025-10-13 21:50:02,257] Trial 8 finished with value: 0.8021001339869948 and parameters: {'learning_rate': 0.010150665434429315, 'num_leaves': 44, 'max_depth': 5, 'min_data_in_leaf': 76, 'feature_fraction': 0.9313811040057837, 'bagging_fraction': 0.7222133955202271, 'bagging_freq': 2, 'lambda_l1': 0.0022264204303769665, 'lambda_l2': 0.3884277754703141}. Best is trial 6 with value: 0.8055943350334417.


Best trial: 9. Best value: 0.805607:  25%|██▌       | 10/40 [00:35<01:48,  3.61s/it]

[I 2025-10-13 21:50:04,957] Trial 9 finished with value: 0.8056073397935204 and parameters: {'learning_rate': 0.054082340576877566, 'num_leaves': 26, 'max_depth': 3, 'min_data_in_leaf': 38, 'feature_fraction': 0.7975549966080241, 'bagging_fraction': 0.9188818535014192, 'bagging_freq': 4, 'lambda_l1': 0.4588156549160971, 'lambda_l2': 0.026100256506134758}. Best is trial 9 with value: 0.8056073397935204.


Best trial: 9. Best value: 0.805607:  28%|██▊       | 11/40 [00:35<01:18,  2.71s/it]

[I 2025-10-13 21:50:05,603] Trial 10 pruned. 


Best trial: 9. Best value: 0.805607:  30%|███       | 12/40 [00:37<01:09,  2.49s/it]

[I 2025-10-13 21:50:07,585] Trial 11 finished with value: 0.8042605143119669 and parameters: {'learning_rate': 0.07386391228000022, 'num_leaves': 41, 'max_depth': 3, 'min_data_in_leaf': 36, 'feature_fraction': 0.8303068815269233, 'bagging_fraction': 0.9190254559573594, 'bagging_freq': 3, 'lambda_l1': 0.019113354590616607, 'lambda_l2': 0.11493312137925893}. Best is trial 9 with value: 0.8056073397935204.


Best trial: 9. Best value: 0.805607:  32%|███▎      | 13/40 [00:38<00:53,  1.99s/it]

[I 2025-10-13 21:50:08,419] Trial 12 pruned. 


Best trial: 9. Best value: 0.805607:  35%|███▌      | 14/40 [00:40<00:47,  1.82s/it]

[I 2025-10-13 21:50:09,872] Trial 13 pruned. 


Best trial: 9. Best value: 0.805607:  38%|███▊      | 15/40 [00:45<01:08,  2.75s/it]

[I 2025-10-13 21:50:14,782] Trial 14 finished with value: 0.8008165594833541 and parameters: {'learning_rate': 0.015319976548014166, 'num_leaves': 20, 'max_depth': 4, 'min_data_in_leaf': 35, 'feature_fraction': 0.8856055643084374, 'bagging_fraction': 0.8953683793126898, 'bagging_freq': 2, 'lambda_l1': 0.03889950083344508, 'lambda_l2': 0.12414437696058499}. Best is trial 9 with value: 0.8056073397935204.


Best trial: 9. Best value: 0.805607:  40%|████      | 16/40 [00:46<00:57,  2.40s/it]

[I 2025-10-13 21:50:16,358] Trial 15 finished with value: 0.8034339306417999 and parameters: {'learning_rate': 0.09223332771281577, 'num_leaves': 32, 'max_depth': 3, 'min_data_in_leaf': 27, 'feature_fraction': 0.7086245219437244, 'bagging_fraction': 0.7962731298507341, 'bagging_freq': 3, 'lambda_l1': 0.1027153683243511, 'lambda_l2': 0.02173126507603629}. Best is trial 9 with value: 0.8056073397935204.


Best trial: 9. Best value: 0.805607:  42%|████▎     | 17/40 [00:49<00:54,  2.37s/it]

[I 2025-10-13 21:50:18,674] Trial 16 finished with value: 0.8044160652712453 and parameters: {'learning_rate': 0.04809832061065332, 'num_leaves': 42, 'max_depth': 3, 'min_data_in_leaf': 10, 'feature_fraction': 0.7973861592328798, 'bagging_fraction': 0.8548085916053562, 'bagging_freq': 4, 'lambda_l1': 0.014665861669762486, 'lambda_l2': 0.011889698841945623}. Best is trial 9 with value: 0.8056073397935204.


Best trial: 9. Best value: 0.805607:  45%|████▌     | 18/40 [00:50<00:44,  2.03s/it]

[I 2025-10-13 21:50:19,896] Trial 17 pruned. 


Best trial: 9. Best value: 0.805607:  48%|████▊     | 19/40 [00:53<00:48,  2.33s/it]

[I 2025-10-13 21:50:22,918] Trial 18 pruned. 


Best trial: 9. Best value: 0.805607:  50%|█████     | 20/40 [00:54<00:41,  2.07s/it]

[I 2025-10-13 21:50:24,402] Trial 19 finished with value: 0.8033114120244509 and parameters: {'learning_rate': 0.08668226503049993, 'num_leaves': 30, 'max_depth': 3, 'min_data_in_leaf': 49, 'feature_fraction': 0.8717147818626443, 'bagging_fraction': 0.7819754120559195, 'bagging_freq': 5, 'lambda_l1': 0.0032265959783560057, 'lambda_l2': 0.2205549639866276}. Best is trial 9 with value: 0.8056073397935204.


Best trial: 9. Best value: 0.805607:  52%|█████▎    | 21/40 [00:58<00:48,  2.54s/it]

[I 2025-10-13 21:50:28,032] Trial 20 finished with value: 0.8017912971152421 and parameters: {'learning_rate': 0.03329271358198644, 'num_leaves': 44, 'max_depth': 4, 'min_data_in_leaf': 19, 'feature_fraction': 0.9910387319984271, 'bagging_fraction': 0.8837228150703471, 'bagging_freq': 4, 'lambda_l1': 0.011203787895644978, 'lambda_l2': 0.9583592856792372}. Best is trial 9 with value: 0.8056073397935204.


Best trial: 21. Best value: 0.806375:  55%|█████▌    | 22/40 [01:00<00:44,  2.47s/it]

[I 2025-10-13 21:50:30,333] Trial 21 finished with value: 0.8063753479314061 and parameters: {'learning_rate': 0.04751119756724808, 'num_leaves': 41, 'max_depth': 3, 'min_data_in_leaf': 12, 'feature_fraction': 0.7870889818820944, 'bagging_fraction': 0.8593516788448047, 'bagging_freq': 4, 'lambda_l1': 0.01791397438191837, 'lambda_l2': 0.009864085574130065}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  57%|█████▊    | 23/40 [01:02<00:40,  2.38s/it]

[I 2025-10-13 21:50:32,511] Trial 22 finished with value: 0.8040563060681766 and parameters: {'learning_rate': 0.052584106327834654, 'num_leaves': 47, 'max_depth': 3, 'min_data_in_leaf': 33, 'feature_fraction': 0.7793410912139301, 'bagging_fraction': 0.8300368057733354, 'bagging_freq': 4, 'lambda_l1': 0.07309222981501624, 'lambda_l2': 0.00925566895500588}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  60%|██████    | 24/40 [01:05<00:40,  2.55s/it]

[I 2025-10-13 21:50:35,466] Trial 23 finished with value: 0.805933386809321 and parameters: {'learning_rate': 0.03702308228493797, 'num_leaves': 39, 'max_depth': 3, 'min_data_in_leaf': 17, 'feature_fraction': 0.7298252510895001, 'bagging_fraction': 0.86531286258243, 'bagging_freq': 5, 'lambda_l1': 0.02389147819017217, 'lambda_l2': 0.020611527045667707}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  62%|██████▎   | 25/40 [01:09<00:41,  2.74s/it]

[I 2025-10-13 21:50:38,649] Trial 24 finished with value: 0.8035849713091595 and parameters: {'learning_rate': 0.03496549718230973, 'num_leaves': 38, 'max_depth': 4, 'min_data_in_leaf': 13, 'feature_fraction': 0.8085396808252268, 'bagging_fraction': 0.8766932829139705, 'bagging_freq': 5, 'lambda_l1': 0.02289142215572228, 'lambda_l2': 0.02395902038660033}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  65%|██████▌   | 26/40 [01:11<00:35,  2.54s/it]

[I 2025-10-13 21:50:40,701] Trial 25 finished with value: 0.8037441162371405 and parameters: {'learning_rate': 0.07414305211662674, 'num_leaves': 39, 'max_depth': 3, 'min_data_in_leaf': 18, 'feature_fraction': 0.7285513796266709, 'bagging_fraction': 0.9201959509604561, 'bagging_freq': 5, 'lambda_l1': 0.14762176757375936, 'lambda_l2': 0.0013245648017072417}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  68%|██████▊   | 27/40 [01:13<00:34,  2.64s/it]

[I 2025-10-13 21:50:43,586] Trial 26 finished with value: 0.8019325023211294 and parameters: {'learning_rate': 0.040800694887141706, 'num_leaves': 33, 'max_depth': 3, 'min_data_in_leaf': 41, 'feature_fraction': 0.7744530569040902, 'bagging_fraction': 0.9811240467385637, 'bagging_freq': 4, 'lambda_l1': 0.047840217484411515, 'lambda_l2': 0.051499038355251145}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  70%|███████   | 28/40 [01:18<00:39,  3.28s/it]

[I 2025-10-13 21:50:48,362] Trial 27 finished with value: 0.8017918028199474 and parameters: {'learning_rate': 0.01883547224615489, 'num_leaves': 39, 'max_depth': 4, 'min_data_in_leaf': 17, 'feature_fraction': 0.8270669004052973, 'bagging_fraction': 0.9210401543751481, 'bagging_freq': 5, 'lambda_l1': 0.02228973638630168, 'lambda_l2': 0.0025441839777831307}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  72%|███████▎  | 29/40 [01:21<00:35,  3.25s/it]

[I 2025-10-13 21:50:51,550] Trial 28 finished with value: 0.8036501536373162 and parameters: {'learning_rate': 0.028696325135122575, 'num_leaves': 28, 'max_depth': 3, 'min_data_in_leaf': 26, 'feature_fraction': 0.7295488310188565, 'bagging_fraction': 0.8713052494321812, 'bagging_freq': 5, 'lambda_l1': 0.012424654143450056, 'lambda_l2': 0.017198621543605148}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  75%|███████▌  | 30/40 [01:26<00:36,  3.65s/it]

[I 2025-10-13 21:50:56,111] Trial 29 finished with value: 0.803554333067743 and parameters: {'learning_rate': 0.025041047191414822, 'num_leaves': 45, 'max_depth': 4, 'min_data_in_leaf': 40, 'feature_fraction': 0.7654636617294929, 'bagging_fraction': 0.945098386130247, 'bagging_freq': 4, 'lambda_l1': 0.33076181664699483, 'lambda_l2': 0.007127733712024773}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  78%|███████▊  | 31/40 [01:27<00:25,  2.82s/it]

[I 2025-10-13 21:50:57,003] Trial 30 pruned. 


Best trial: 21. Best value: 0.806375:  80%|████████  | 32/40 [01:29<00:21,  2.72s/it]

[I 2025-10-13 21:50:59,476] Trial 31 finished with value: 0.8041203003710781 and parameters: {'learning_rate': 0.0489107320241931, 'num_leaves': 48, 'max_depth': 3, 'min_data_in_leaf': 29, 'feature_fraction': 0.7170168382856984, 'bagging_fraction': 0.7628809593142369, 'bagging_freq': 3, 'lambda_l1': 0.008006186458684514, 'lambda_l2': 0.01562801722660799}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  82%|████████▎ | 33/40 [01:32<00:19,  2.73s/it]

[I 2025-10-13 21:51:02,254] Trial 32 finished with value: 0.8026466233957846 and parameters: {'learning_rate': 0.03974517461647856, 'num_leaves': 42, 'max_depth': 3, 'min_data_in_leaf': 24, 'feature_fraction': 0.747965387855106, 'bagging_fraction': 0.8295351707079895, 'bagging_freq': 1, 'lambda_l1': 0.007577992391921935, 'lambda_l2': 0.0356617484295197}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  85%|████████▌ | 34/40 [01:35<00:16,  2.72s/it]

[I 2025-10-13 21:51:04,941] Trial 33 finished with value: 0.8045512599025348 and parameters: {'learning_rate': 0.05284879181938825, 'num_leaves': 46, 'max_depth': 3, 'min_data_in_leaf': 16, 'feature_fraction': 0.7242668230044698, 'bagging_fraction': 0.8648000688980944, 'bagging_freq': 2, 'lambda_l1': 0.031173173379080123, 'lambda_l2': 0.025241080774209314}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  88%|████████▊ | 35/40 [01:38<00:13,  2.79s/it]

[I 2025-10-13 21:51:07,898] Trial 34 finished with value: 0.8049350862703477 and parameters: {'learning_rate': 0.031801485300022185, 'num_leaves': 50, 'max_depth': 3, 'min_data_in_leaf': 32, 'feature_fraction': 0.7648999388558781, 'bagging_fraction': 0.828015512188779, 'bagging_freq': 4, 'lambda_l1': 0.002675014824164907, 'lambda_l2': 0.044761805183418296}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  90%|█████████ | 36/40 [01:40<00:10,  2.53s/it]

[I 2025-10-13 21:51:09,823] Trial 35 finished with value: 0.8039937968539645 and parameters: {'learning_rate': 0.06757685214018348, 'num_leaves': 40, 'max_depth': 3, 'min_data_in_leaf': 10, 'feature_fraction': 0.7431090206386868, 'bagging_fraction': 0.773396701155983, 'bagging_freq': 5, 'lambda_l1': 0.0050525236612632895, 'lambda_l2': 0.006661183805049071}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  92%|█████████▎| 37/40 [01:42<00:07,  2.37s/it]

[I 2025-10-13 21:51:11,811] Trial 36 finished with value: 0.8039975920383039 and parameters: {'learning_rate': 0.0875540572704588, 'num_leaves': 16, 'max_depth': 6, 'min_data_in_leaf': 21, 'feature_fraction': 0.7019607945105474, 'bagging_fraction': 0.8040115810813915, 'bagging_freq': 1, 'lambda_l1': 0.01714591962936709, 'lambda_l2': 0.08438486351021868}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  95%|█████████▌| 38/40 [01:45<00:05,  2.55s/it]

[I 2025-10-13 21:51:14,783] Trial 37 finished with value: 0.8045717594119767 and parameters: {'learning_rate': 0.036854677424362714, 'num_leaves': 23, 'max_depth': 4, 'min_data_in_leaf': 58, 'feature_fraction': 0.7904264012703551, 'bagging_fraction': 0.7407854098178079, 'bagging_freq': 3, 'lambda_l1': 0.009937477282014226, 'lambda_l2': 0.004205315535809935}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375:  98%|█████████▊| 39/40 [01:47<00:02,  2.48s/it]

[I 2025-10-13 21:51:17,098] Trial 38 finished with value: 0.8044057362907233 and parameters: {'learning_rate': 0.045163167568468936, 'num_leaves': 36, 'max_depth': 3, 'min_data_in_leaf': 23, 'feature_fraction': 0.751274542648332, 'bagging_fraction': 0.8383862690694426, 'bagging_freq': 5, 'lambda_l1': 0.4024420022771972, 'lambda_l2': 0.18907065196835945}. Best is trial 21 with value: 0.8063753479314061.


Best trial: 21. Best value: 0.806375: 100%|██████████| 40/40 [01:50<00:00,  2.76s/it]


[I 2025-10-13 21:51:20,069] Trial 39 finished with value: 0.8039421199640089 and parameters: {'learning_rate': 0.028857483071269386, 'num_leaves': 43, 'max_depth': 3, 'min_data_in_leaf': 53, 'feature_fraction': 0.850160405382329, 'bagging_fraction': 0.856906975963815, 'bagging_freq': 4, 'lambda_l1': 0.8731417894973494, 'lambda_l2': 0.011039246558388283}. Best is trial 21 with value: 0.8063753479314061.

=== Optuna 結果 ===
Best CV accuracy: 0.80638
learning_rate: 0.04751119756724808
num_leaves: 41
max_depth: 3
min_data_in_leaf: 12
feature_fraction: 0.7870889818820944
bagging_fraction: 0.8593516788448047
bagging_freq: 4
lambda_l1: 0.01791397438191837
lambda_l2: 0.009864085574130065

最終モデル学習完了


In [45]:

# =========================================================
# 2. 特徴量生成
# =========================================================
train_features = create_features(train)
test_features = create_features(test)

feature_cols = [
    'HomePlanet', 'CryoSleep', 'Destination',  'VIP',
    'Side', 'Num',
    'FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck',
    'TotalExpense', 'AgeGroup'
]
# 'Age', 'Deck', 'FamilySize', 'IsAlone', 'HasExpense',

X_full = train_features[feature_cols].copy()
X_test_full = test_features[feature_cols].copy()
y_full = train['Transported']

# カテゴリ列をcategory型に変換（NaNも保持）
categorical_features = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Side', 'AgeGroup'
]

for col in categorical_features:
    X_full[col] = X_full[col].astype('category')
    X_test_full[col] = X_test_full[col].astype('category')

print(f"使用特徴量数: {len(feature_cols)}")
print(f"特徴量一覧: {feature_cols}")
print(f"データシェイプ: Train {X_full.shape}, Test {X_test_full.shape}")

# =========================================================
# 3. LightGBM設定 + 5分割CV
# =========================================================
params_full = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'n_estimators': 600,
        'learning_rate': 0.04751119756724808,
        'num_leaves': 41,
        'max_depth': 3,
        'min_data_in_leaf': 12,
        'feature_fraction': 0.7870889818820944,
        'bagging_fraction': 0.8593516788448047,
        'bagging_freq': 4,
        'lambda_l1': 0.01791397438191837,
        'lambda_l2': 0.009864085574130065,
        'random_state': 42,
        'metric': 'None'
    }

cv_full = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores_full = []
feature_importance_list = []

# --- 重要: 同じ苗字の人を同じsplitに入れる ---
# create_featuresで作った FamilyName列を使う
groups = train_features['FamilyName'].fillna('Unknown').astype(str)

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=420)
fold_scores = []

print("\n=== 5-Fold Cross Validation (欠損値未補完版) ===")
for fold_idx, (trn_idx, val_idx) in enumerate(cv.split(X_full, y_full, groups=groups), start=1):
    X_train_full, X_valid_full = X_full.iloc[trn_idx], X_full.iloc[val_idx]
    y_train_full, y_valid_full = y_full.iloc[trn_idx], y_full.iloc[val_idx]

    model = lgb.LGBMClassifier(**params_full)
    model.fit(X_train_full, y_train_full)

    y_pred_full = model.predict(X_valid_full)
    acc_full = accuracy_score(y_valid_full, y_pred_full)
    scores_full.append(acc_full)

    feature_importance_list.append(model.feature_importances_)

    print(f"Fold {fold_idx} accuracy: {acc_full:.5f}")

# =========================================================
# 4. 結果まとめ
# =========================================================
print("=" * 50)
print(f"Mean CV accuracy (欠損未補完): {np.mean(scores_full):.5f}")
print(f"Standard deviation: {np.std(scores_full):.5f}")

feature_importance_avg = np.mean(feature_importance_list, axis=0)
feature_importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': feature_importance_avg
}).sort_values('importance', ascending=False)

print("\n=== Top 10 重要特徴量 ===")
print(feature_importance_df.head(10).to_string(index=False))

# =========================================================
# 5. 全データで再学習
# =========================================================
print("\n=== 全データで最終モデルを学習 ===")
final_model = lgb.LGBMClassifier(**params_full)
final_model.fit(X_full, y_full)
print("最終モデル学習完了")


使用特徴量数: 13
特徴量一覧: ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Side', 'Num', 'FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalExpense', 'AgeGroup']
データシェイプ: Train (8693, 13), Test (4277, 13)

=== 5-Fold Cross Validation (欠損値未補完版) ===
Fold 1 accuracy: 0.81371
Fold 1 accuracy: 0.81371
Fold 2 accuracy: 0.80496
Fold 2 accuracy: 0.80496
Fold 3 accuracy: 0.82654
Fold 3 accuracy: 0.82654
Fold 4 accuracy: 0.79461
Fold 4 accuracy: 0.79461
Fold 5 accuracy: 0.78620
Mean CV accuracy (欠損未補完): 0.80521
Standard deviation: 0.01415

=== Top 10 重要特徴量 ===
     feature  importance
         Num       728.4
      VRDeck       452.6
         Spa       449.2
   FoodCourt       418.4
TotalExpense       413.6
 RoomService       365.8
ShoppingMall       333.2
  HomePlanet       228.0
    AgeGroup       138.4
        Side       117.2

=== 全データで最終モデルを学習 ===
Fold 5 accuracy: 0.78620
Mean CV accuracy (欠損未補完): 0.80521
Standard deviation: 0.01415

=== Top 10 重要特徴量 ===
     feature  importance
 

In [46]:
# 全データで学習した場合。出力まで行う

final_model = lgb.LGBMClassifier(**params_full)
final_model.fit(X_full, y_full)

preds = final_model.predict(X_test_full)


passenger_ids = test["PassengerId"]

# DataFrame にまとめる
submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Transported": preds
})

# CSV 出力（インデックスは不要）
submission.to_csv("submission.csv", index=False)

In [47]:
submission['Transported'].value_counts()

Transported
True     2223
False    2054
Name: count, dtype: int64