# 数据加载与初步探索

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# 加载数据
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print(train_df.head())

# 查看缺失值
print(train_df.isnull().sum())
# Age, Cabin, Embarked 存在缺失值
print(test_df.isnull().sum())
# Age, Fare, Cabin存在缺失值

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
Pa

# 特征工程

In [3]:
## 缺失值处理

### 1.数值特征：中位数插补
age_imputer = SimpleImputer(strategy='median')
train_df.loc[:,"Age"] = age_imputer.fit_transform(train_df[["Age"]])
test_df.loc[:,"Age"] = age_imputer.fit_transform(test_df[["Age"]])

### 2.类别特征Embarked：众数插补
embarked_mode = train_df["Embarked"].mode()[0]
train_df.loc[:,"Embarked"] = train_df['Embarked'].fillna(embarked_mode)
test_df.loc[:,"Embarked"] = test_df['Embarked'].fillna(embarked_mode)

### 3.类别特征cabin缺失过多，用“Unknown”标记
train_df.loc[:,"Cabin"] = train_df['Cabin'].fillna('Unknown')
test_df.loc[:,"Cabin"] = test_df['Cabin'].fillna('Unknown')

### 4. 测试集Fare缺失值: 中位数插补
fare_imputer = SimpleImputer(strategy='median')
test_df.loc[:,'Fare'] = fare_imputer.fit_transform(test_df[['Fare']])

In [4]:
## 类别特征编码

### 1.二分类特征Sex：标签编码
sex_label = LabelEncoder()
train_df.loc[:,"Sex_Encoded"] = sex_label.fit_transform(train_df[["Sex"]])
test_df.loc[:,"Sex_Encoded"] = sex_label.fit_transform(test_df[["Sex"]])

### 2.多分类特征Embarked：独热编码，drop_first避免多重共线性
train_df = pd.get_dummies(train_df, columns = ['Embarked'], drop_first=True)
test_df = pd.get_dummies(test_df, columns = ['Embarked'], drop_first=True)

### 3.Cabin特征（首字母代码客舱等级）：提取首字母
train_df.loc[:, 'Cabin_prefix'] = train_df['Cabin'].str[0]
test_df.loc[:, 'Cabin_prefix'] = test_df['Cabin'].str[0]
train_df = pd.get_dummies(train_df, columns = ['Cabin_prefix'], drop_first=True)
test_df = pd.get_dummies(test_df, columns = ['Cabin_prefix'], drop_first=True)
# 对齐训练集和测试集的特征, 避免有的值没有出现训练集和测试集的特征不匹配问题
train_df, test_df = train_df.align(test_df, join='left', axis = 1, fill_value = 0)

  return f(*args, **kwargs)


In [5]:
## 构造特征
### 1.家庭规模：SibSp+Parch+1（自己）
train_df.loc[:, 'FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df.loc[:, 'FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

### 2.头衔提取：从Name中提取Mr/Miss/Mrs等
train_df.loc[:, 'Title'] = train_df['Name'].str.extract("([A-Za-z]+)\.", expand=False)
test_df.loc[:, 'Title'] = test_df['Name'].str.extract("([A-Za-z]+)\.", expand=False)
# print(train_df['Title'].value_counts())
# print(test_df['Title'].value_counts())

# 合并稀有头衔(频率小于10)为Rare
rare_titles = train_df['Title'].value_counts()[train_df['Title'].value_counts()<10].index
train_df.loc[train_df['Title'].isin(rare_titles), 'Title'] = 'Rare'
test_df.loc[test_df['Title'].isin(rare_titles), 'Title'] = 'Rare'

train_df = pd.get_dummies(train_df, columns=['Title'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Title'], drop_first=True)
train_df, test_df = train_df.align(test_df, join='left', axis = 1, fill_value = 0)

# 选择特征与模型训练

In [6]:
# print(train_df.columns)
features = ['Pclass', 'Age', 'Fare', 'Sex_Encoded', 'Embarked_Q', 'Embarked_S'] + [col for col in train_df.columns if 'Cabin_prefix' in col or 'Title' in col]
# 确保特征在测试集中存在
features = [f for f in features if f in test_df.columns]
# print(features)

In [7]:
X = train_df[features]
y = train_df['Survived']
X_test = test_df[features]

In [8]:
## 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,random_state=42)

In [9]:
## 选择模型
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [10]:
## 交叉验证
cv_score = cross_val_score(model, X, y, cv=5).mean()
print(f"交叉验证准确率:{cv_score:4f}")

交叉验证准确率:0.805831


In [11]:
y_pred = model.predict(X_test)

In [14]:
Submission_df = pd.DataFrame({"PassengerId": test_df["PassengerId"], "Survived":y_pred})
Submission_df.to_csv("titanic_submission.csv", index=False)