In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import cross_val_score, GridSearchCV

In [2]:
#读取数据，同时把乘客ID设置为索引，以免和自带的索引混淆
train = pd.read_csv('./train.csv').set_index('PassengerId')
test = pd.read_csv('./test.csv').set_index('PassengerId')

In [3]:
#按行连接两个工作表，即上下拼接
df = pd.concat([train, test])                #默认的axis = 0可以不写

In [4]:
#新增一行提取Title信息和之前的类似
df['Title'] = df['Name'].str.split(',').str[1].str.split('.').str[0].str.strip()            

In [5]:
#接下来将女人和小孩直接定义为一项特征
df['IsWomanOrChild'] = ((df['Title'] == 'Master') | (df['Sex']== 'female'))     #Title中有Master的都是小孩

In [6]:
#提取名字中的姓氏
df['LastName'] = df.Name.str.split(',').str[0]           

从名字中可以看出小孩的年龄信息，以及各个乘客的家庭信息，后面将主要以女人小孩群体和家庭群体提取特征

接下来是一种非常巧妙的处理方法，成年男性单独处理，女人小孩以家庭为单位处理

In [7]:
#以家庭为单位
family = df.groupby(['LastName'])['Survived']
pd.DataFrame(family)

Unnamed: 0,0,1
0,Abbing,"PassengerId 846 0.0 Name: Survived, dtype: ..."
1,Abbott,PassengerId 280 1.0 747 0.0 1284 Na...
2,Abelseth,PassengerId 949 NaN 1237 NaN Name: Surviv...
3,Abelson,PassengerId 309 0.0 875 1.0 Name: Surviv...
4,Abrahamsson,"PassengerId 1143 NaN Name: Survived, dtype: ..."
...,...,...
870,de Mulder,"PassengerId 287 1.0 Name: Survived, dtype: ..."
871,de Pelsmaeker,"PassengerId 283 0.0 Name: Survived, dtype: ..."
872,del Carlo,PassengerId 362 0.0 907 NaN Name: Surviv...
873,van Billiard,PassengerId 154 0.0 1084 NaN 1236 Na...


In [8]:
#女人小孩的生存情况统计出自己以外的，对于生存状况位置的以0填充
df['FamilyTotalCount'] = family.transform(lambda s: s[df['IsWomanOrChild']].fillna(0).count())  #s[df['IsWomanOrChild']]表示筛选是否为女人或者小孩
df['FamilyTotalCount'] = df.mask(df['IsWomanOrChild'], df['FamilyTotalCount'] - 1, axis=0)    #如果是女人或者小孩，家庭人数减去1，如果是男性，保持原来的数值survied

In [9]:
#女人小孩的家庭总人数只统计女人小孩的，位置生存状况的按照0来算；成年男性的按照survived的情况来，也就是不变

df['FamilySurvivedCount'] = family.transform(lambda s: s[df['IsWomanOrChild']].fillna(0).sum())   
df['FamilySurvivedCount'] = df.mask(df['IsWomanOrChild'], df['FamilySurvivedCount'] - df['Survived'].fillna(0), axis=0)     
df['FamilySurvivalRate'] = (df['FamilySurvivedCount'] / df['FamilyTotalCount'].replace(0, np.nan))    #这里replace主要针对的是死亡的成年男性，因为其家庭总人数为0


In [10]:
#df['FamilyTotalCount'] == 0也就是所有死亡的男性，df['FamilyTotalCount'] == 1则是所有活下来的男性
df['IsSingleTraveler'] = df['FamilyTotalCount'] == 0

In [11]:
#构建特征集
feature = pd.concat(
    [df['FamilySurvivalRate'].fillna(0),
     df['IsSingleTraveler'],
     df['Sex'].replace({'male': 0, 'female': 1})],axis = 1)

In [12]:
#构造训练集和测试集
x_train, x_test = feature.loc[train.index],  feature.loc[test.index]
x_train

Unnamed: 0_level_0,FamilySurvivalRate,IsSingleTraveler,Sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,True,0
2,0.0,True,1
3,0.0,True,1
4,0.0,True,1
5,0.0,True,0
...,...,...,...
887,0.0,True,0
888,1.0,False,1
889,0.0,False,1
890,1.0,False,0


In [13]:
#构造标签集
y_train = df['Survived'].loc[train.index]

In [14]:
clf = tree.DecisionTreeClassifier()
grid = GridSearchCV(clf, cv=5, param_grid={
    'criterion': ['gini', 'entropy'], 
    'max_depth': [2, 3, 4, 5]})
grid.fit(x_train, y_train)
grid.best_params_

{'criterion': 'gini', 'max_depth': 3}

In [15]:
model = grid.best_estimator_

In [16]:
y_predict = model.predict(x_test).astype(int)
pd.DataFrame({'Survived': y_predict}, index=test.index) \
.reset_index() \
.to_csv(f'survived.csv', index=False)

这种方法虽然简单但是非常有效，没有像之前一样的很复杂地去填充缺失值，也没有很复杂地分段处理，而是很真实参考了实际生活中的场景，以家庭为单位，妇女小孩优先，非常有效地做出了预测；充分挖掘了名字当中含有的特征，以家庭为单位估计女人小孩的生存状况，但是也比较极端，基本默认测试机中成年男性的生存率为0；

最后在kaggle上有着0.80382的准确率，排名前7%。