# 1. 数据读取分析

In [56]:
import pandas
import numpy as np
# Load the Titanic training set from the local CSV file.
titanic = pandas.read_csv("titanic/train.csv")
# Print summary statistics as a first look at the data; a `count` that is lower
# than the number of rows means that column has missing values.
titanic_summary = titanic.describe()
print(titanic_summary)

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


# 2. 数据预处理

In [57]:
# The Age column has missing values: fill them with the column median.
age_median = titanic['Age'].median()
titanic['Age'] = titanic['Age'].fillna(age_median)
print(titanic.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.361582    0.523008   
std     257.353842    0.486592    0.836071   13.019697    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   22.000000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   35.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [58]:
# Encode the non-numeric columns as numbers.
#
# The codes are produced with Series.map instead of assigning integers into the
# text column through .loc: that assignment leaves the column as `object` dtype
# and is rejected by pandas' stricter string dtype, which only accepts strings.
# Mapping builds a new, numeric column instead.
#
# `codes.get(value, value)` leaves anything that is not a known label unchanged
# (NaN, or a value that was already encoded), so re-running this cell is safe.
SEX_CODES = {'male': 0, 'female': 1}
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}

# Encode Sex: male -> 0, female -> 1
print(titanic['Sex'].unique())
titanic['Sex'] = titanic['Sex'].map(lambda value: SEX_CODES.get(value, value))

# Encode Embarked: S -> 0, C -> 1, Q -> 2 (missing values stay NaN here and are
# expected to be filled in a later step)
print(titanic['Embarked'].unique())
titanic['Embarked'] = titanic['Embarked'].map(lambda value: EMBARKED_CODES.get(value, value))

['male' 'female']
['S' 'C' 'Q' nan]


In [59]:
# Embarked has missing values: fill them with the most frequent port code
# (which turns out to be 0 for this data set).
embarked = titanic['Embarked']
# value_counts() skips NaN and counts this column only, so the result does not
# depend on an unrelated column (PassengerId) being fully populated.
# int() keeps the code a plain integer whether the column is stored as object
# or float; it assumes Embarked has already been encoded as numeric codes.
max_Embarked = int(embarked.value_counts().idxmax())
print(max_Embarked)
# Cast to float before filling and to int afterwards, so the final dtype is an
# explicit integer instead of relying on implicit object-dtype downcasting.
titanic['Embarked'] = embarked.astype(float).fillna(max_Embarked).astype(int)
print(titanic['Embarked'].unique())

0
[0 1 2]


# 3. 二分类模型建立

In [32]:
# 使用线性回归，以及使用交叉验证集来调参
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

In [36]:
# Feature columns used as model inputs.
features=['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']

# Plain linear regression model; the same estimator object is refit on each fold.
LR=LinearRegression()

# 3-fold cross-validation. shuffle=True assigns rows to folds at random, so
# random_state is pinned to make the folds (and every score derived from them)
# reproducible across kernel restarts.
kf=KFold(n_splits=3, shuffle=True, random_state=1)

# Slice the feature matrix and the target once instead of on every iteration.
# astype(float) makes sure the encoded columns reach the model as numbers even
# if they are still stored with object dtype.
feature_frame=titanic[features].astype(float)
target=titanic['Survived']

# predictions[i] holds the model output for the test rows of fold i.
# NOTE: because the folds are shuffled, concatenating `predictions` does NOT
# give values in the original row order. test_indices[i] holds the positional
# row indices that predictions[i] refers to; use it to realign the predictions
# with titanic['Survived'] before comparing them.
predictions=[]
test_indices=[]
for train,test in kf.split(feature_frame):
    # Training rows and their labels for this fold
    train_predictors=feature_frame.iloc[train,:]
    train_target=target.iloc[train]
    # Fit on the training part, then predict the held-out part
    LR.fit(train_predictors,train_target)
    test_predictions=LR.predict(feature_frame.iloc[test,:])
    predictions.append(test_predictions)
    test_indices.append(test)