In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold
import catboost as cb
import numpy as np
from sklearn.metrics import accuracy_score
import warnings

# Suppress warnings of category UserWarning so they do not clutter the output.
warnings.filterwarnings('ignore', category=UserWarning)

# Load the training and test tables from CSV files (paths are relative to
# the current working directory).
train = pd.read_csv("../dataset/train.csv")
test = pd.read_csv("../dataset/test.csv")

# =========================================================
# 1. 特徴量エンジニアリング
# =========================================================
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive model features from raw passenger rows without imputing NaNs.

    The input frame is copied first, so ``df`` itself is never modified.

    Args:
        df: Raw passenger data. ``Age``, ``CryoSleep``, ``VIP``,
            ``HomePlanet`` and ``Destination`` are read unconditionally and
            must be present. ``Cabin``, ``PassengerId`` and the five expense
            columns are optional: when absent they (or the columns split out
            of them) are created as all-NaN. ``Name`` is optional too: when
            absent, ``FamilyName``/``FamilySize``/``IsAlone`` are not created
            and ``Family_Group`` is NaN.

    Returns:
        A copy of ``df`` extended with the engineered columns.
    """
    data = df.copy()

    # Split Cabin on '/' into Deck / Num / Side; missing cabins stay NaN.
    if 'Cabin' in data.columns:
        # reindex guarantees columns 0..2 exist even when no value in the
        # column contains all three parts.
        cabin_split = (
            data['Cabin'].str.split('/', expand=True).reindex(columns=range(3))
        )
        data['Deck'] = cabin_split[0]
        data['Num'] = pd.to_numeric(cabin_split[1], errors='coerce')
        data['Side'] = cabin_split[2]
    else:
        data['Deck'] = np.nan
        data['Num'] = np.nan
        data['Side'] = np.nan

    # Expense features (NaN is kept as-is, never filled).
    expense_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    two_expense = ['FoodCourt', 'ShoppingMall']
    three_expense = ['RoomService', 'Spa', 'VRDeck']
    for col in expense_cols:
        if col not in data.columns:
            data[col] = np.nan

    # skipna=False: a row's sum is NaN as soon as one of its parts is NaN.
    data['TotalExpense'] = data[expense_cols].sum(axis=1, skipna=False)
    data['TwoExpense'] = data[two_expense].sum(axis=1, skipna=False)
    data['ThreeExpense'] = data[three_expense].sum(axis=1, skipna=False)
    # NaN > 0 is False, so this flags rows with at least one known positive
    # expense.
    data['HasExpense'] = data[expense_cols].gt(0).any(axis=1).astype(int)

    # Age groups. include_lowest=True closes the first bin on the left so
    # that Age == 0 is labelled 'Child' instead of silently becoming NaN.
    bins = [0, 7, 14, 21, 50, 100]
    labels = ['Child', 'Teen', 'Young', 'Adult', 'Senior']
    data['AgeGroup'] = pd.cut(
        data['Age'], bins=bins, labels=labels, include_lowest=True
    )

    # Family size: how often the last whitespace-separated token of Name
    # occurs within this frame (missing names stay NaN).
    if 'Name' in data.columns:
        data['FamilyName'] = data['Name'].str.split().str[-1]
        family_counts = data['FamilyName'].value_counts()
        data['FamilySize'] = data['FamilyName'].map(family_counts)
        # NaN == 1 is False, so rows with a missing name get IsAlone = 0.
        data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

    # Split PassengerId on '_' into a group id and a within-group number.
    if 'PassengerId' in data.columns:
        passenger_split = (
            data['PassengerId'].str.split('_', expand=True)
            .reindex(columns=range(2))
        )
        data['PassengerGroup'] = passenger_split[0]
        data['PassengerNum'] = pd.to_numeric(passenger_split[1], errors='coerce')
    else:
        # Mirror the Cabin fallback so later steps can rely on the columns.
        data['PassengerGroup'] = np.nan
        data['PassengerNum'] = np.nan

    # Combination features. astype(str) is applied before concatenation, so
    # missing parts become part of the combined label rather than NaN.
    num_bins = [0, 250, 500, 750, 2000]
    num_labels = ['Q1', 'Q2', 'Q3', 'Q4']
    # right=False: bins are [0, 250), [250, 500), ...; Num >= 2000 -> NaN.
    data['NumQuartile'] = pd.cut(data['Num'], bins=num_bins, labels=num_labels, right=False)
    data['Cabin_Loc'] = data['Deck'].astype(str) + '_' + data['NumQuartile'].astype(str)
    data['NoExpense_Cryo'] = ((data['CryoSleep'] == True) & (data['TotalExpense'] == 0)).astype(object)
    data['Cryo_VIP'] = data['CryoSleep'].astype(str) + '_' + data['VIP'].astype(str)
    data['Planet_AgeGroup'] = data['HomePlanet'].astype(str) + '_' + data['AgeGroup'].astype(str)
    data['Deck_Side'] = data['Deck'].astype(str) + '_' + data['Side'].astype(str)
    data['Full_Travel_Combo'] = (data['HomePlanet'].astype(str) + '_' +
                                 data['Destination'].astype(str) + '_' +
                                 data['CryoSleep'].astype(str))
    if 'FamilyName' in data.columns:
        data['Family_Group'] = data['FamilyName'].astype(str) + '_' + data['PassengerGroup'].astype(str)
    else:
        data['Family_Group'] = np.nan
    return data

# =========================================================
# 2. 特徴量生成とデータ準備
# =========================================================
# Columns fed to the model, and the subset of them that is numeric.
feature_cols = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Side', 'Num',
    'TwoExpense', 'ThreeExpense', 'AgeGroup',
    'Cabin_Loc', 'NoExpense_Cryo', 'Cryo_VIP', 'Planet_AgeGroup',
    'Deck_Side', 'Full_Travel_Combo',
]
numeric_features = ('Num', 'TwoExpense', 'ThreeExpense')

# Every non-numeric feature is treated as categorical (order follows
# feature_cols).
categorical_features = [
    name for name in feature_cols if name not in numeric_features
]

# Engineer features for both splits, then keep only the model columns.
train_features = create_features(train)
test_features = create_features(test)

X_full = train_features[feature_cols].copy()
X_test_full = test_features[feature_cols].copy()
y_full = train['Transported']

# Cast categorical columns to strings and map the string forms of missing
# values ('nan' / 'None') to a single 'NA' label. Only exact matches are
# replaced; combined labels that merely contain 'nan' are left untouched.
missing_labels = {'nan': 'NA', 'None': 'NA'}
for frame in (X_full, X_test_full):
    for col in categorical_features:
        frame[col] = frame[col].astype(str).replace(missing_labels)

# Group key for group-aware CV; rows without a group share 'Unknown'.
groups = train_features['PassengerGroup'].fillna('Unknown').astype(str)

print(f"使用特徴量数: {len(feature_cols)}")
print(f"特徴量一覧: {feature_cols}")
print(f"データシェイプ: Train {X_full.shape}, Test {X_test_full.shape}")

# =========================================================
# 3. Optunaの最良パラメータを適用
# =========================================================

# Best hyperparameters found by Optuna, used directly.
best_params_optuna = dict(
    iterations=183,
    learning_rate=0.08479043292417315,
    depth=3,
    l2_leaf_reg=0.00010707258143447216,
    border_count=253,
)

# Fixed CatBoost settings first, then the tuned values layered on top.
params_cat = {
    'loss_function': 'Logloss',
    'eval_metric': 'Accuracy',
    'random_seed': 42,
    'verbose': 0,
    'allow_writing_files': False,
    **best_params_optuna,
}

# =========================================================
# 4. 5分割CVによる最適なイテレーション数の再決定
# =========================================================
# Group-aware, stratified 5-fold splitter with a fixed shuffle seed.
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=420)

# Per-fold results: accuracy, feature importances, best iteration.
scores_full, feature_importance_list, best_iterations_list = [], [], []

print("\n=== 5-Fold Cross Validation (Optunaパラメータ + Early Stopping) ===")
fold_splits = cv.split(X_full, y_full, groups=groups)
for fold_idx, (trn_idx, val_idx) in enumerate(fold_splits, start=1):
    X_train_full = X_full.iloc[trn_idx]
    y_train_full = y_full.iloc[trn_idx]
    X_valid_full = X_full.iloc[val_idx]
    y_valid_full = y_full.iloc[val_idx]

    # Fresh model per fold; the validation fold doubles as the eval set
    # that drives early stopping.
    model = cb.CatBoostClassifier(**params_cat)
    model.fit(
        X_train_full,
        y_train_full,
        cat_features=categorical_features,
        eval_set=(X_valid_full, y_valid_full),
        early_stopping_rounds=50,
    )

    best_iter = model.get_best_iteration()
    y_pred_full = model.predict(X_valid_full)
    acc_full = accuracy_score(y_valid_full, y_pred_full)

    best_iterations_list.append(best_iter)
    scores_full.append(acc_full)
    feature_importance_list.append(model.get_feature_importance())

    print(f"Fold {fold_idx} accuracy: {acc_full:.5f} (Best Iter: {best_iter})")

# =========================================================
# 5. 結果まとめと最終モデル学習
# =========================================================
# get_best_iteration() reports a zero-based iteration index, so the number of
# trees a fold actually kept is that index + 1. Round the mean index to the
# nearest integer (instead of truncating) and add 1 to obtain a tree count
# that is always >= 1 and can be passed as `iterations` to the final model.
mean_best_iterations = int(round(float(np.mean(best_iterations_list)))) + 1

print("=" * 50)
print(f"Mean CV accuracy (CatBoost, Optuna Params): {np.mean(scores_full):.5f}")
print(f"Standard deviation: {np.std(scores_full):.5f}")
print(f"Mean Best Iterations (for final model): {mean_best_iterations}")
print("=" * 50)


# Average the per-fold importances feature-wise and rank them.
feature_importance_avg = np.mean(feature_importance_list, axis=0)
feature_importance_df = pd.DataFrame({
    'feature': X_full.columns,
    'importance': feature_importance_avg
}).sort_values('importance', ascending=False)

print("\n=== Top 10 重要特徴量 (CatBoost) ===")
print(feature_importance_df.head(10).to_string(index=False))

# =========================================================
# 6. 全データで再学習 (Best Iterationsを適用)
# =========================================================
# Same settings as in CV, except that the iteration count is replaced by the
# value derived from cross-validation.
params_final = {**params_cat, 'iterations': mean_best_iterations}

print(f"\n=== 全データで最終モデルを学習 (CatBoost, Iterations={mean_best_iterations}, Optuna Params) ===")
# Refit on the complete training set (no eval set, hence no early stopping).
final_model = cb.CatBoostClassifier(**params_final)
final_model.fit(
    X_full,
    y_full,
    cat_features=categorical_features,
)
print("最終モデル学習完了")

# =========================================================
# 7. 提出ファイルの作成
# =========================================================
# Identifiers come straight from the raw test table; predictions from the
# model refit on all training rows.
passenger_ids = test["PassengerId"]
preds = final_model.predict(X_test_full)

# NOTE(review): astype(bool) presumes predict() returns boolean/numeric
# labels rather than strings - confirm the dtype of `preds`.
submission_columns = {
    'PassengerId': passenger_ids,
    'Transported': preds.astype(bool),
}
output = pd.DataFrame(submission_columns)

# Write the submission file without the index column and show a preview.
output.to_csv('submission.csv', index=False)
print("\n提出ファイルの先頭5行:")
print(output.head())

使用特徴量数: 15
特徴量一覧: ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Side', 'Num', 'TwoExpense', 'ThreeExpense', 'AgeGroup', 'Cabin_Loc', 'NoExpense_Cryo', 'Cryo_VIP', 'Planet_AgeGroup', 'Deck_Side', 'Full_Travel_Combo']
データシェイプ: Train (8693, 15), Test (4277, 15)

=== 5-Fold Cross Validation (Optunaパラメータ + Early Stopping) ===
Fold 1 accuracy: 0.81782 (Best Iter: 137)
Fold 2 accuracy: 0.78716 (Best Iter: 48)
Fold 3 accuracy: 0.79486 (Best Iter: 122)
Fold 4 accuracy: 0.82619 (Best Iter: 178)
Fold 5 accuracy: 0.80664 (Best Iter: 141)
Mean CV accuracy (CatBoost, Optuna Params): 0.80653
Standard deviation: 0.01432
Mean Best Iterations (for final model): 125

=== Top 10 重要特徴量 (CatBoost) ===
          feature  importance
     ThreeExpense   40.634235
       HomePlanet   11.472475
       TwoExpense    9.275708
        CryoSleep    9.154364
Full_Travel_Combo    6.138151
        Cabin_Loc    5.874760
              Num    5.051525
        Deck_Side    4.460456
             Side    3.958964
  Plan