# 数据加载与初步探索

In [1]:
!python --version

Python 3.10.19


In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgbm

In [3]:
# 加载数据
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print(train_df.head())

# 查看缺失值
print(train_df.isnull().sum())
# Age, Cabin, Embarked 存在缺失值
print(test_df.isnull().sum())
# Age, Fare, Cabin存在缺失值

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
Pa

# 特征工程

In [4]:
## 缺失值处理

### 1.数值特征：中位数插补
age_imputer = SimpleImputer(strategy='median')
train_df.loc[:,"Age"] = age_imputer.fit_transform(train_df[["Age"]])
test_df.loc[:,"Age"] = age_imputer.fit_transform(test_df[["Age"]])

### 2.类别特征Embarked：众数插补
embarked_mode = train_df["Embarked"].mode()[0]
train_df.loc[:,"Embarked"] = train_df['Embarked'].fillna(embarked_mode)
test_df.loc[:,"Embarked"] = test_df['Embarked'].fillna(embarked_mode)

### 3.类别特征cabin缺失过多，用“Unknown”标记
train_df.loc[:,"Cabin"] = train_df['Cabin'].fillna('Unknown')
test_df.loc[:,"Cabin"] = test_df['Cabin'].fillna('Unknown')

### 4. 测试集Fare缺失值: 中位数插补
fare_imputer = SimpleImputer(strategy='median')
test_df.loc[:,'Fare'] = fare_imputer.fit_transform(test_df[['Fare']])

In [5]:
## 类别特征编码

### 1.二分类特征Sex：标签编码
sex_label = LabelEncoder()
train_df.loc[:,"Sex_Encoded"] = sex_label.fit_transform(train_df[["Sex"]])
test_df.loc[:,"Sex_Encoded"] = sex_label.fit_transform(test_df[["Sex"]])

### 2.多分类特征Embarked：独热编码，drop_first避免多重共线性
train_df = pd.get_dummies(train_df, columns = ['Embarked'], drop_first=True)
test_df = pd.get_dummies(test_df, columns = ['Embarked'], drop_first=True)

### 3.Cabin特征（首字母代码客舱等级）：提取首字母
train_df.loc[:, 'Cabin_prefix'] = train_df['Cabin'].str[0]
test_df.loc[:, 'Cabin_prefix'] = test_df['Cabin'].str[0]
train_df = pd.get_dummies(train_df, columns = ['Cabin_prefix'], drop_first=True)
test_df = pd.get_dummies(test_df, columns = ['Cabin_prefix'], drop_first=True)
# 对齐训练集和测试集的特征, 避免有的值没有出现训练集和测试集的特征不匹配问题
train_df, test_df = train_df.align(test_df, join='left', axis = 1, fill_value = 0)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [6]:
## 构造特征
### 1.家庭规模：SibSp+Parch+1（自己）
train_df.loc[:, 'FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df.loc[:, 'FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

### 2.头衔提取：从Name中提取Mr/Miss/Mrs等
train_df.loc[:, 'Title'] = train_df['Name'].str.extract("([A-Za-z]+)\.", expand=False)
test_df.loc[:, 'Title'] = test_df['Name'].str.extract("([A-Za-z]+)\.", expand=False)
# print(train_df['Title'].value_counts())
# print(test_df['Title'].value_counts())

# 合并稀有头衔(频率小于10)为Rare
rare_titles = train_df['Title'].value_counts()[train_df['Title'].value_counts()<10].index
train_df.loc[train_df['Title'].isin(rare_titles), 'Title'] = 'Rare'
test_df.loc[test_df['Title'].isin(rare_titles), 'Title'] = 'Rare'

train_df = pd.get_dummies(train_df, columns=['Title'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Title'], drop_first=True)
train_df, test_df = train_df.align(test_df, join='left', axis = 1, fill_value = 0)

# 选择特征与模型训练

In [7]:
# print(train_df.columns)
features = ['Pclass', 'Age', 'Fare', 'Sex_Encoded', 'Embarked_Q', 'Embarked_S'] + [col for col in train_df.columns if 'Cabin_prefix' in col or 'Title' in col]
# 确保特征在测试集中存在
features = [f for f in features if f in test_df.columns]
# print(features)

In [8]:
X = train_df[features]
y = train_df['Survived']
X_test = test_df[features]

In [9]:
## 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,random_state=42)

In [10]:
## 选择模型 - 随机森林
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
## 交叉验证
rf_cv_score = cross_val_score(rf_model, X, y, cv=5).mean()
print(f"交叉验证准确率:{rf_cv_score:4f}")

交叉验证准确率:0.804708


In [12]:
## xgboost
xgb_model = xgb.XGBClassifier(n_estimators = 100, random_state=42)
xgb_model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [13]:
## 交叉验证
xgb_cv_score = cross_val_score(xgb_model, X, y, cv=5).mean()
print(f"交叉验证准确率:{xgb_cv_score:4f}")

交叉验证准确率:0.820413


In [14]:
## lightgbm
lgbm_model = lgbm.LGBMClassifier(n_estimators=100,random_state=42)
lgbm_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000376 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 197
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [15]:
## 交叉验证
lgbm_cv_score = cross_val_score(lgbm_model, X, y, cv=5).mean()
print(f"交叉验证准确率:{lgbm_cv_score:4f}")

[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000847 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 207
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 205
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info

# 调参

In [16]:
## 网格搜索
param_grid = {
    'n_estimators': [50, 100, 200],      # 树的数量
    'max_depth': [3, 5, 7],             # 树的深度
    'learning_rate': [0.01, 0.1, 0.2],   # 学习率
    'subsample': [0.8, 1.0]              # 训练每棵树时使用的样本比例
}

grid_search = GridSearchCV(
    estimator=xgb_model, 
    param_grid=param_grid, 
    cv=5, 
    scoring='accuracy', 
    verbose=1,   # 设置为 1 可以看到搜索进度
    n_jobs=-1    # 再次强调，Mac 上用 -1 开启多核加速
)

# 4. 执行搜索
grid_search.fit(X_train, y_train)

# 5. 输出结果
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳得分: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 54 candidates, totalling 270 fits
最佳参数: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
最佳得分: 0.8441


In [19]:
y_pred = grid_search.predict(X_test)

In [20]:
Submission_df = pd.DataFrame({"PassengerId": test_df["PassengerId"], "Survived":y_pred})
Submission_df.to_csv("titanic_submission.csv", index=False)