### 1. 数据预处理

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

## 读取数据信息
data = pd.read_csv("D:/cao/kaggle/Titanic/data/train.csv")
data.info()
data.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# 将 sex 特征转换为类别数据
data['Sex'] = data['Sex'].apply(lambda s:1 if s == 'male' else 0)
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [3]:
# 将所有缺失的字段填充为0,取部分特征用于训练
data = data.fillna(0)
dataset_X = data[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
dataset_X = dataset_X.values
dataset_X

array([[ 1.    , 22.    ,  3.    ,  1.    ,  0.    ,  7.25  ],
       [ 0.    , 38.    ,  1.    ,  1.    ,  0.    , 71.2833],
       [ 0.    , 26.    ,  3.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [ 0.    ,  0.    ,  3.    ,  1.    ,  2.    , 23.45  ],
       [ 1.    , 26.    ,  1.    ,  0.    ,  0.    , 30.    ],
       [ 1.    , 32.    ,  3.    ,  0.    ,  0.    ,  7.75  ]])

In [4]:
# 两种分类结果分别是幸存和死亡，‘Survived’字段是其中一种分类标签
# 新增‘Deceased’表示第二种分类标签，取值为‘Survived’字段取非
data['Deceased'] = data['Survived'].apply(lambda s:int(not s))
data['Deceased']

0      1
1      0
2      0
3      0
4      1
5      1
6      1
7      1
8      0
9      0
10     0
11     0
12     1
13     1
14     1
15     0
16     1
17     0
18     1
19     0
20     1
21     0
22     0
23     0
24     1
25     0
26     1
27     1
28     0
29     1
      ..
861    1
862    0
863    1
864    1
865    0
866    0
867    1
868    1
869    0
870    1
871    0
872    1
873    1
874    0
875    0
876    1
877    1
878    1
879    0
880    0
881    1
882    1
883    1
884    1
885    1
886    1
887    0
888    1
889    0
890    1
Name: Deceased, Length: 891, dtype: int64

In [5]:
dataset_Y = data[['Deceased', 'Survived']]
dataset_Y = dataset_Y.values
dataset_Y

array([[1, 0],
       [0, 1],
       [0, 1],
       ...,
       [1, 0],
       [0, 1],
       [1, 0]], dtype=int64)

In [6]:
from sklearn.model_selection import train_test_split

# 划分为训练集和验证集
X_train, X_val, Y_train, Y_val = train_test_split(dataset_X, dataset_Y, test_size=0.2, random_state=42)

In [20]:
import tensorflow as tf

## 构建计算图

# 声明输入数据占位符
X = tf.placeholder(tf.float32, shape=[None, 6])
Y = tf.placeholder(tf.float32, shape=[None, 2])

# 声明变量
W1 = tf.Variable(tf.truncated_normal([6,4]), name='weights_1')
b1 = tf.Variable(tf.zeros([1,4]), name='bias_1')
W2 = tf.Variable(tf.truncated_normal([4,2]),name='weights_2')
b2 = tf.Variable(tf.zeros([2]), name='bias_2')

# 构造前向传播计算图
z1 = tf.matmul(X, W1) + b1
y_pred = tf.nn.softmax(tf.matmul(z1, W2) + b2)

# 使用交叉熵作为损失函数
cross_entropy = -tf.reduce_sum(Y * tf.log(y_pred + 1e-10),reduction_indices=1)

# 批量样本的代价值为所有样本交叉熵的平均值
cost = tf.reduce_mean(cross_entropy)

# 加入优化算法
train_op = tf.train.AdamOptimizer(0.001).minimize(cost)

In [21]:
## 训练迭代

with tf.Session() as sess:
    # 初始化所有变量
    tf.global_variables_initializer().run()
    
    # 迭代训练
    for epoch in range(500):
        total_loss = 0
        for i in range(len(X_train)):
            feed = {X:[X_train[i]], Y:[Y_train[i]]}
            _,loss = sess.run([train_op, cost], feed_dict=feed)
            total_loss += loss
        print('Epoch: %04d, total loss=%.9f' % (epoch + 1,total_loss))
    print('Training complete!')
    
    ## 评估验证集的准确率

    pred = sess.run(y_pred, feed_dict={X:X_val})
    correct = np.equal(np.argmax(pred, 1), np.argmax(Y_val, 1))
    accuracy = np.mean(correct.astype(np.float32))
    print("Accuracy on test set: %.9f" % accuracy)
    
    ## 测试测试集数据
    testdata = pd.read_csv("D:/cao/kaggle/Titanic/data/test.csv")
    testdata = testdata.fillna(0)
    # convert ['male', 'female'] values of Sex to [1, 0]
    testdata['Sex'] = testdata['Sex'].apply(lambda s: 1 if s == 'male' else 0)
    X_test = testdata[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
    predictions = np.argmax(sess.run(y_pred, feed_dict={X: X_test}), 1)
    
    ## 将结果导出为csv文件
    submission = pd.DataFrame({
        "PassengerId": testdata["PassengerId"],
        "Survived": predictions
    })
    submission.to_csv("titanic-submission.csv", index=False)

Epoch: 0001, total loss=12.269404888
Epoch: 0002, total loss=5.602007091
Epoch: 0003, total loss=2.430033624
Epoch: 0004, total loss=1.159515411
Epoch: 0005, total loss=0.625280380
Epoch: 0006, total loss=0.367519569
Epoch: 0007, total loss=0.228344204
Epoch: 0008, total loss=0.147107247
Epoch: 0009, total loss=0.097106857
Epoch: 0010, total loss=0.065189701
Epoch: 0011, total loss=0.044290173
Epoch: 0012, total loss=0.030354802
Epoch: 0013, total loss=0.020940927
Epoch: 0014, total loss=0.014519999
Epoch: 0015, total loss=0.010108567
Epoch: 0016, total loss=0.007060369
Epoch: 0017, total loss=0.004945010
Epoch: 0018, total loss=0.003471826
Epoch: 0019, total loss=0.002442495
Epoch: 0020, total loss=0.001721793
Epoch: 0021, total loss=0.001215643
Epoch: 0022, total loss=0.000859656
Epoch: 0023, total loss=0.000608734
Epoch: 0024, total loss=0.000431445
Epoch: 0025, total loss=0.000306382
Epoch: 0026, total loss=0.000217564
Epoch: 0027, total loss=0.000154857
Epoch: 0028, total loss=0.0

KeyboardInterrupt: 