In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline

In [2]:
train_df = pd.read_csv('./titanicTrain.csv')
test_df = pd.read_csv('./titanicQuestion.csv')

In [3]:
train_df = train_df[:1000]
print('Train:', len(train_df),
      'Test:', len(test_df))

Train: 1000 Test: 309


## preprocessing function

In [4]:
# Titanic data preprocessing function
def preprocessTitanicData(raw_df):
    df = raw_df[['survived', 'pclass', 'sibsp', 'parch', 'age', 'sex', 'fare', 'embarked']]
    df['age'] = df['age'].fillna(df['age'].median())
    df['fare'] = df['fare'].fillna(df['fare'].median())
    df['sex'] = df['sex'].map({'female':0, 'male':1}).astype(int)
    x_onehot_df = pd.get_dummies(data=df, columns= ['embarked'])
    ndarray = x_onehot_df.values
    feature = ndarray[:,1:]
    label = ndarray[:, 0]
    
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(feature)
    
    return scaledFeatures, label

In [5]:
train_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [6]:
x_train, y_train = preprocessTitanicData(train_df)
x_test, y_test = preprocessTitanicData(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [7]:
print(x_train.shape, y_train.shape) #np.array np.array
print(x_test.shape, y_test.shape) # np.array np.array

# reshape y_train and y_test
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

print(x_train.shape, y_train.shape) #np.array np.array
print(x_test.shape, y_test.shape) # np.array np.array

(1000, 9) (1000,)
(309, 9) (309,)
(1000, 9) (1000, 1)
(309, 9) (309, 1)


## construct model

In [8]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torchvision.transforms as transforms

In [9]:
DROPOUT_RATIO = 0.35

In [19]:
class LR(nn.Module): # 繼承torch.nn.module特性
    
    # 架構
    def __init__(self):
        super(LR, self).__init__() # super(LR, self) 繼承自己的__init__()
        self.hidden1 = nn.Linear(9, 100) # input 9, hidden 40  Linear全連接
        self.dropout = nn.Dropout(DROPOUT_RATIO)
        self.hidden2 = nn.Linear(100, 80) # input 40, hidden 40  Linear全連接
       # self.hidden22 = nn.Linear(100, 40)
        self.hidden3 = nn.Linear(80, 60) # input 40, hidden 40  Linear全連接
       # self.hidden3 = nn.Bilinear(40, 40, 30)
        self.hidden4 = nn.Linear(60, 40) # input 40, hidden 30  Linear全連接
        self.hidden5 = nn.Linear(40, 1) # hidden 10, output 1

    # 運作流程（資料流）
    def forward(self, x):  
        x = F.sigmoid(self.hidden1(x)) # 讓x由hidden1後，進入activation function
        x = F.dropout(self.dropout(x))
        
        #y = F.relu(self.hidden22(x))
        #y = F.dropout(self.dropout(y))
        
        x = F.relu(self.hidden2(x))
        x = F.dropout(self.dropout(x))
        
        x = F.relu(self.hidden3(x))
       # x = F.relu(self.hidden3(x, y))
        x = F.dropout(self.dropout(x))
        x = F.relu(self.hidden4(x))
        out = self.hidden5(x)
        return out
    

In [20]:
learning_rate = 0.03
model = LR()
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) #model.parameters() 所有訓練的參數都在這
# model.cuda()   

## Training

In [12]:
epochs = 5000

In [None]:
for epoch in range(epochs):
    epoch += 1

    # convert to variables
    x = Variable(torch.from_numpy(x_train).float())
    y = Variable(torch.from_numpy(y_train).float())

    # clear gradient w.r.t. parameters 
    optimizer.zero_grad()
    
    # forward to get output
    prediction = model(x)

    # calculate loss
    loss = loss_func(prediction, y)

    # backward to get gradient
    loss.backward()

    # update parameters
    optimizer.step() 

    if epoch % 10 == 0:
    # plot and show learning process
        print("epoch %d, loss %.8f" % (epoch, loss.data[0]))



epoch 10, loss 0.28219002
epoch 20, loss 0.24655639
epoch 30, loss 0.24781515
epoch 40, loss 0.24587201
epoch 50, loss 0.24994576
epoch 60, loss 0.24631521
epoch 70, loss 0.24463409
epoch 80, loss 0.24754903
epoch 90, loss 0.24354088
epoch 100, loss 0.24519087
epoch 110, loss 0.24700919
epoch 120, loss 0.24538316
epoch 130, loss 0.24729630
epoch 140, loss 0.24451779
epoch 150, loss 0.24543978
epoch 160, loss 0.24292561
epoch 170, loss 0.24373959
epoch 180, loss 0.24396703
epoch 190, loss 0.24334593
epoch 200, loss 0.24378631
epoch 210, loss 0.24251480
epoch 220, loss 0.24504206
epoch 230, loss 0.24159156
epoch 240, loss 0.24403045
epoch 250, loss 0.24438182
epoch 260, loss 0.24389969
epoch 270, loss 0.24534835
epoch 280, loss 0.24290611
epoch 290, loss 0.24462435
epoch 300, loss 0.24376132
epoch 310, loss 0.24439232
epoch 320, loss 0.24426927
epoch 330, loss 0.24163643
epoch 340, loss 0.24337910
epoch 350, loss 0.24501459
epoch 360, loss 0.24487627
epoch 370, loss 0.24312383
epoch 380,

In [None]:
# get filters from parameters:
print(list(model.parameters())[0].size())  

In [None]:
train_prediction = prediction.data.numpy()
train_prediction = np.where(train_prediction>=0.7,1,0)
train_df['prediction'] = train_prediction

## Prediction

In [None]:
x_test_v = Variable(torch.from_numpy(x_test).float())
test_prediction = model(x_test_v).data.numpy()
# test_prediction = np.where(test_prediction>=0.6,1,0)

In [None]:
test_df['prediction'] = test_prediction

In [None]:
def apply_sub(row, label):
    if row[label] >= 0.5:
        return 1
    else:
        return 0

test_df['prediction'] = test_df.apply(lambda x: apply_sub(x, 'prediction'), axis = 1)
test_df['prediction']

In [None]:
#test_df['prediction'].to_csv('titanic_answer_trained.csv')

## Verify model

In [None]:
answer = pd.read_csv('titanicAns.csv')
accuracy = 1.0 - sum(abs(answer['survived'] - test_df['prediction'])) / test_df.shape[0]
print('Accuracy: %f %%'%(accuracy*100))

### dump accuracy

In [None]:
with open('log.txt', 'a') as flog:
    flog.write('RESULT:\n\tDropout:\t%.3f\n\tepochs:\t\t%ilearning_rate:\t%f\n\tAccuracy:\t%f %%\n'%(DROPOUT_RATIO, epochs, learning_rate, accuracy*100))

In [None]:
with open('log.txt', 'r') as flog:
    print(flog.read())