In [5]:
# 首先导入必要的包
# !pip install catboost
import pandas as pd
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# 加载数据集
data = pd.read_csv("./data/Titanic.txt")

# 将 Pclass 转为字符串类型（如有必要）
# 数据类型问题：如果 Pclass 列是字符串类型，而你在处理时没有正确转换，可能导致它被视为单一类别。
data['Pclass'] = data['Pclass'].astype(str)
data = data.dropna(subset=["Survived"])

# 确定特征值和目标值
X = data[['Pclass','Sex','Age','SibSp','ParCh','Fare']]
y = data['Survived']

# 填充缺失值
X['Age'].fillna(X['Age'].median(), inplace=True)
X['Fare'].fillna(X['Fare'].median(),inplace=True)

# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=66)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['Age'].fillna(X['Age'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Age'].fillna(X['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(val

In [6]:
# 定义超参数网格
param_grid = {
    'iterations':[500,100,1500,2000],
    'learning_rate':[0.0001,0.001,0.01,0.1],
    'depth':[3,5,7,9],
    'l2_leaf_reg':[1,3,5,7]
}

# 初始化CatBoost分类器
catboost_model = CatBoostClassifier(cat_features=['Pclass','Sex'], verbose=0)

# 创建网格搜索对象
grid_search = GridSearchCV(estimator=catboost_model,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3,
                           n_jobs=-1)

# 执行网格搜索
grid_search.fit(X_train, y_train)

# 提取结果
results = grid_search.cv_results_
results_df = pd.DataFrame(results)

# 绘制所有超参数对准确率的变化
params_to_plot = ['learning_rate', 'depth', 'l2_leaf_reg']

plt.figure(figsize=(15,10))

for i, param in enumerate(params_to_plot):
    plt.subplot(2, 2, i + 1)
    for value in results_df['params'].apply(lambda x:x[param]).unique():
        subset = results_df[results_df['params'].apply(lambda x:x[param] == value)]
        plt.plot(subset['params'].apply(lambda x:x['iterations']), subset['mean_test_score'])

    plt.title(f"{param} 对于准确率的影响")
    plt.xlabel('迭代次数')
    plt.ylabel('平均测试分数（准确率）')
    plt.xticks([500, 1000])
    plt.legend()
    plt.grid()
    
plt.tight_layout()
plt.show()


KeyboardInterrupt: 