In [1]:
import numpy
import pandas as pd
from sklearn import preprocessing
# Fix NumPy's global RNG seed so code that draws from numpy.random is repeatable across runs
numpy.random.seed(10)

In [2]:
# Load the Titanic dataset from its Excel workbook into a DataFrame
filepath = "/Users/PChomeIM/pywork/Dataset/titanic3.xls"
all_df = pd.read_excel(io=filepath)

In [3]:
# Keep only the columns that are meaningful for the model
# (the label "survived" comes first; "name" is dropped later during preprocessing)
cols = [
    "survived",
    "name",
    "pclass",
    "sex",
    "age",
    "sibsp",
    "parch",
    "fare",
    "embarked",
]
all_df = all_df.loc[:, cols]

In [4]:
# Data preprocessing
def PreprocessData(raw_df):
    """Convert a raw Titanic frame into a scaled feature matrix and a label vector.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame containing a ``name`` column plus ``age``, ``fare``, ``sex`` and
        ``embarked``. After ``name`` is dropped, the first remaining column is
        used as the label and every other column as a feature. The input frame
        is not modified.

    Returns
    -------
    features : numpy.ndarray
        2-D float array, min-max scaled to [0, 1] per column. ``embarked`` is
        always expanded into exactly three trailing columns (C, Q, S) no matter
        which ports occur in ``raw_df``, so the feature count is stable across
        different subsets (e.g. train vs. test).
    label : numpy.ndarray
        1-D float array taken from the first column (``survived``).

    Notes
    -----
    The imputation means and the min/max used for scaling are computed from
    ``raw_df`` itself on every call.
    """
    # The name column is not needed for training
    df = raw_df.drop(['name'], axis=1)

    # Fill missing age / fare values with the column mean
    df['age'] = df['age'].fillna(df['age'].mean())
    df['fare'] = df['fare'].fillna(df['fare'].mean())

    # Encode sex as 0 (female) / 1 (male)
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)

    # One-hot encode embarked with a FIXED category list. Without it,
    # get_dummies only emits columns for ports present in this frame, so a
    # subset lacking one port would yield fewer feature columns. Missing
    # (or unlisted) ports become an all-zero row.
    df['embarked'] = pd.Categorical(df['embarked'], categories=['C', 'Q', 'S'])
    onehot_df = pd.get_dummies(data=df, columns=['embarked'])

    # Force a numeric array: recent pandas emits bool dummy columns, which
    # would otherwise turn ``.values`` into an object-dtype array.
    ndarray = onehot_df.astype('float64').values
    features = ndarray[:, 1:]  # every column after the first is a feature
    label = ndarray[:, 0]      # first column (survived) is the label

    # Rescale each feature column to the [0, 1] range
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    features = minmax_scale.fit_transform(features)

    return features, label

In [5]:
# Split the rows into a training set and a test set:
# each row gets a uniform random draw and goes to training when the draw is below 0.8
msk = numpy.random.rand(len(all_df)) < 0.8
train_df, test_df = all_df[msk], all_df[~msk]

In [6]:
# Report how many rows ended up in the full, training and test frames
print('total:', len(all_df), 'train:', len(train_df), 'test:', len(test_df))

total: 1309 train: 1034 test: 275


In [7]:
# Run the same preprocessing on each split to get (features, label) arrays
train_feature, train_label = PreprocessData(raw_df=train_df)
test_feature, test_label = PreprocessData(raw_df=test_df)

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

Using TensorFlow backend.


In [9]:
# Create a linear stack (Sequential) model; layers are appended to it with model.add()
model = Sequential()

In [10]:
# Input layer + first hidden layer (40 ReLU units).
# The input width is read from the prepared training matrix instead of being
# hard-coded, so the model always matches the number of feature columns
# actually produced by preprocessing.
model.add(Dense(units=40,
                input_dim=train_feature.shape[1],
                kernel_initializer='uniform',
                activation='relu'))

In [11]:
# Second hidden layer: 30 ReLU units, uniformly initialised weights
model.add(Dense(30, activation='relu', kernel_initializer='uniform'))

In [12]:
# Output layer: a single sigmoid unit
model.add(Dense(1, activation='sigmoid', kernel_initializer='uniform'))

In [13]:
# Configure training: Adam optimizer, binary cross-entropy loss, track accuracy
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [14]:
# Train the model: 30 epochs, batches of 30 rows, with validation_split=0.1;
# verbose=2 selects the log format shown below
train_history = model.fit(
    train_feature,
    train_label,
    batch_size=30,
    epochs=30,
    validation_split=0.1,
    verbose=2,
)

Train on 930 samples, validate on 104 samples
Epoch 1/30
0s - loss: 0.6899 - acc: 0.5774 - val_loss: 0.6704 - val_acc: 0.7885
Epoch 2/30
0s - loss: 0.6666 - acc: 0.6022 - val_loss: 0.5886 - val_acc: 0.7885
Epoch 3/30
0s - loss: 0.6062 - acc: 0.6720 - val_loss: 0.4924 - val_acc: 0.8077
Epoch 4/30
0s - loss: 0.5404 - acc: 0.7677 - val_loss: 0.4629 - val_acc: 0.7596
Epoch 5/30
0s - loss: 0.5031 - acc: 0.7624 - val_loss: 0.4538 - val_acc: 0.7885
Epoch 6/30
0s - loss: 0.4892 - acc: 0.7634 - val_loss: 0.4401 - val_acc: 0.7788
Epoch 7/30
0s - loss: 0.4811 - acc: 0.7602 - val_loss: 0.4433 - val_acc: 0.7885
Epoch 8/30
0s - loss: 0.4781 - acc: 0.7548 - val_loss: 0.4360 - val_acc: 0.7885
Epoch 9/30
0s - loss: 0.4721 - acc: 0.7645 - val_loss: 0.4283 - val_acc: 0.7788
Epoch 10/30
0s - loss: 0.4690 - acc: 0.7753 - val_loss: 0.4265 - val_acc: 0.8077
Epoch 11/30
0s - loss: 0.4651 - acc: 0.7667 - val_loss: 0.4241 - val_acc: 0.8173
Epoch 12/30
0s - loss: 0.4632 - acc: 0.7763 - val_loss: 0.4206 - val_acc

In [15]:
# Evaluate the trained model on the held-out test set
scores = model.evaluate(test_feature, test_label)

 32/275 [==>...........................] - ETA: 0s

In [16]:
# Second entry of the evaluate() result; since the model was compiled with
# metrics=['accuracy'], this is presumably the test accuracy (TODO confirm ordering)
scores[1]

0.80363636406985195

In [17]:
# Run the model on the test features to obtain its predictions
prediction = model.predict(x=test_feature)

In [18]:
# Show only the first five predictions rather than the whole array
prediction[:5]

array([[ 0.98788005],
       [ 0.13495834],
       [ 0.9829573 ],
       [ 0.28999779],
       [ 0.97594833]], dtype=float32)

In [19]:
# Save the entire model: architecture, weights, loss function and optimizer
model.save("/Users/PChomeIM/pywork/SaveModel/TitanicMLPModel.h5")