In [1]:
# 導入必要的庫
import pandas as pd
from pycaret.classification import *

# 讀取 Titanic 數據集
train_data = pd.read_csv('train.csv')  # 替換為您的訓練數據文件路徑
test_data = pd.read_csv('test.csv')    # 替換為您的測試數據文件路徑

# 初始化 PyCaret 環境
clf = setup(
    data=train_data,
    target='Survived',
    session_id=42, 
    normalize=True, 
    categorical_features=['Sex', 'Embarked'], 
    ignore_features=['Name', 'Ticket', 'Cabin']
)

# 比較所有可用的分類模型
best_model = compare_models(n_select=1)

# 打印最佳模型名稱
print(f"最佳模型名稱: {type(best_model).__name__}")

# 選擇最佳模型，進行訓練
final_model = finalize_model(best_model)

# 將模型應用於測試集，並生成預測
predictions = predict_model(final_model, data=test_data)

# 打印預測結果的列名，幫助確認
print("預測結果的列名:", predictions.columns)

# 明確指定預測結果的列名（根據 PyCaret 的預設，通常是 'prediction_label'）
if 'prediction_label' in predictions.columns:
    prediction_column = 'prediction_label'
else:
    raise ValueError("找不到包含預測結果的列，請確認模型輸出！")

print(f"使用的預測結果列名為: {prediction_column}")

# Kaggle 提交格式
submission = test_data[['PassengerId']].copy()  # 保留 PassengerId
submission['Survived'] = predictions[prediction_column]  # 使用正確的預測列名

# 確保 Survived 列為整數（0 或 1）
submission['Survived'] = submission['Survived'].astype(int)

# 保存為符合提交格式的 CSV 文件
submission.to_csv('submission.csv', index=False)
print("提交格式的預測結果已保存到 'submission.csv'")


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 12)"
4,Transformed data shape,"(891, 11)"
5,Transformed train set shape,"(623, 11)"
6,Transformed test set shape,"(268, 11)"
7,Ignore features,3
8,Numeric features,6
9,Categorical features,2


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8218,0.8454,0.7033,0.8092,0.7501,0.6132,0.6189,0.036
gbc,Gradient Boosting Classifier,0.8202,0.8577,0.6949,0.8169,0.7463,0.6091,0.6179,0.023
knn,K Neighbors Classifier,0.8074,0.8086,0.6991,0.7822,0.7358,0.5852,0.5898,0.242
et,Extra Trees Classifier,0.8026,0.8233,0.6949,0.7706,0.7286,0.5745,0.5784,0.03
lr,Logistic Regression,0.801,0.853,0.7034,0.7651,0.7296,0.5733,0.5775,0.282
lightgbm,Light Gradient Boosting Machine,0.801,0.838,0.6866,0.7702,0.7246,0.57,0.5734,0.086
ada,Ada Boost Classifier,0.7978,0.8302,0.7076,0.7587,0.7279,0.5681,0.573,0.02
ridge,Ridge Classifier,0.7914,0.8536,0.6866,0.7503,0.7149,0.5514,0.5546,0.011
lda,Linear Discriminant Analysis,0.7914,0.8536,0.6866,0.7503,0.7149,0.5514,0.5546,0.011
nb,Naive Bayes,0.785,0.8183,0.6911,0.7384,0.7103,0.5403,0.5446,0.01


最佳模型名稱: RandomForestClassifier


預測結果的列名: Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'prediction_label',
       'prediction_score'],
      dtype='object')
使用的預測結果列名為: prediction_label
提交格式的預測結果已保存到 'submission.csv'
