In [19]:
# 導入必要的庫
import pandas as pd
from pycaret.classification import *

# 再次嘗試不同的特徵工程函數
def transform_features(df):
    # 處理缺失值
    df['Age'] = df['Age'].fillna(-1)  # 缺失的年齡填充為 -1（特殊標記）
    df['Fare'] = df['Fare'].fillna(0)  # 缺失的票價填充為 0
    df['Embarked'] = df['Embarked'].fillna('U')  # 缺失的登船港口填充為 'U'

    # 創建新特徵
    df['Child'] = (df['Age'] < 18).astype(int)  # 是否為未成年人
    df['Senior'] = (df['Age'] > 60).astype(int)  # 是否為老年人
    df['LargeFamily'] = ((df['SibSp'] + df['Parch']) >= 4).astype(int)  # 是否為大家庭
    df['FareBucket'] = pd.qcut(df['Fare'], 4, labels=[1, 2, 3, 4])  # 將票價分為四個區間

    # 簡化艙位特徵
    df['Cabin'] = df['Cabin'].fillna('U')
    df['CabinType'] = df['Cabin'].str[0]  # 提取艙位的首字母

    # 將名字中的標題提取為一個新特徵
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df['Title'] = df['Title'].replace(['Mme', 'Countess', 'Lady', 'Dona'], 'Mrs')
    df['Title'] = df['Title'].replace(['Capt', 'Col', 'Major', 'Dr', 'Rev', 'Sir', 'Jonkheer', 'Don'], 'Rare')

    # 處理性別
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})  # 將性別映射為數字

    # 移除不必要的特徵
    df = df.drop(['Ticket', 'Name', 'Cabin'], axis=1)

    return df

# 讀取 Titanic 數據集
train_data = pd.read_csv('train.csv')  # 替換為您的訓練數據文件路徑
test_data = pd.read_csv('test.csv')    # 替換為您的測試數據文件路徑

# 應用新的特徵工程
data_train = transform_features(train_data)
data_test = transform_features(test_data)

# 初始化 PyCaret 環境
clf = setup(
    data=data_train,
    target='Survived',
    session_id=42, 
    normalize=True, 
    categorical_features=['Embarked', 'Title', 'CabinType', 'FareBucket'],
    ignore_features=['PassengerId']
)

# 比較所有可用的分類模型
best_model = compare_models(n_select=1)

# 超參數調優（可選）
tuned_model = tune_model(best_model)

# 選擇最佳模型並進行訓練
final_model = finalize_model(tuned_model)

# 測試集預測
predictions = predict_model(final_model, data=data_test)

# Kaggle 提交格式
submission = test_data[['PassengerId']].copy()  # 保留 PassengerId
submission['Survived'] = predictions['prediction_label'].astype(int)  # 使用 PyCaret 預設輸出

# 保存提交結果
submission.to_csv('submission3.csv', index=False)
print("提交格式的預測結果已保存到 'submission3.csv'")

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 15)"
4,Transformed data shape,"(891, 32)"
5,Transformed train set shape,"(623, 32)"
6,Transformed test set shape,"(268, 32)"
7,Ignore features,1
8,Numeric features,9
9,Categorical features,4


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8363,0.8686,0.7408,0.8171,0.7762,0.6478,0.6505,0.029
ridge,Ridge Classifier,0.8282,0.8569,0.7368,0.8019,0.7657,0.6309,0.6344,0.016
lda,Linear Discriminant Analysis,0.8266,0.8566,0.7368,0.7989,0.7641,0.6279,0.6315,0.016
lightgbm,Light Gradient Boosting Machine,0.8235,0.8587,0.7286,0.7973,0.7583,0.6203,0.6247,0.088
lr,Logistic Regression,0.8202,0.8539,0.7534,0.7751,0.7612,0.6176,0.6205,0.016
ada,Ada Boost Classifier,0.8137,0.832,0.7536,0.7625,0.7554,0.6054,0.6083,0.026
rf,Random Forest Classifier,0.8107,0.8544,0.7328,0.7717,0.7473,0.5967,0.6013,0.044
et,Extra Trees Classifier,0.8074,0.8321,0.7368,0.7619,0.7457,0.5913,0.5948,0.036
knn,K Neighbors Classifier,0.8073,0.8225,0.7201,0.7695,0.7395,0.5876,0.5927,0.014
nb,Naive Bayes,0.7815,0.8411,0.6143,0.7831,0.6746,0.5168,0.5337,0.014


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8413,0.9252,0.8333,0.7692,0.8,0.6688,0.6702
1,0.8571,0.9038,0.7917,0.8261,0.8085,0.6947,0.6951
2,0.8254,0.8803,0.75,0.7826,0.766,0.6268,0.6272
3,0.8387,0.9181,0.8261,0.76,0.7917,0.6605,0.662
4,0.8226,0.8427,0.5833,0.9333,0.7179,0.5984,0.6335
5,0.8871,0.9128,0.8333,0.8696,0.8511,0.7602,0.7607
6,0.7742,0.9106,0.625,0.75,0.6818,0.509,0.5141
7,0.7903,0.7939,0.625,0.7895,0.6977,0.5405,0.5491
8,0.871,0.8728,0.7917,0.8636,0.8261,0.7238,0.7256
9,0.8065,0.8322,0.7917,0.7308,0.76,0.5983,0.5996


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


提交格式的預測結果已保存到 'submission3.csv'
