In [739]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn as sklearn

In [740]:
train_data_df = pd.read_csv("train.csv", header =0)
test_data_df = pd.read_csv("test.csv", header = 0)


In [741]:
def transform_input_data(df):
    #transform gender column into categorical values
    sex_categorical_cols = pd.get_dummies(df['Sex'], prefix='sex', drop_first=False)
    df = pd.concat([df, sex_categorical_cols], axis = 1)

    #transform fare column into fare class
    df['FareBucket'] = df['Fare']-(df['Fare']%10)
    
    #transform age into age bucket
    df['AgeBucket'] = df['Age']-(df['Age']%10)
    
    #treat missing Embarked
    df['EmbarkedVal'] = df.Embarked.map({'C':0, 'Q':1, 'S':2})
    embarkedMedian = df.EmbarkedVal.dropna().median()
    df.loc[(df.EmbarkedVal.isnull()), 'EmbarkedVal'] = embarkedMedian
    
    #transform embarked into categorical
    embarked_categorical_cols = pd.get_dummies(df['EmbarkedVal'], prefix='embarkedval', drop_first=False)
    df = pd.concat([df, embarked_categorical_cols], axis=1)
    
    #treat missing ages
    girls_age_median = df[(df.Name.str.contains('Miss'))]['Age'].dropna().median()
    boys_age_median = df[(df.Name.str.contains('Master'))]['Age'].dropna().median()
    male_age_median = df[(df['Sex'] == 'male')]['Age'].dropna().median()
    female_age_median = df[(df['Sex'] == 'female')]['Age'].dropna().median()
    
    df.loc[((df.Age.isnull()) & (df.Sex == 'male')), 'AgeBucket'] = male_age_median - (male_age_median%10)
    df.loc[((df.Age.isnull()) & (df.Sex == 'female')), 'AgeBucket'] = female_age_median - (female_age_median%10)
    
    df.loc[((df.Age.isnull()) & (df.Name.str.contains('Miss'))), 'AgeBucket'] = girls_age_median - (girls_age_median%10)
    df.loc[((df.Age.isnull()) & (df.Name.str.contains('Master'))), 'AgeBucket'] = boys_age_median - (boys_age_median%10)
    
    #treat missing fare buckets
    for pclass in range(3):
        classFareMean = df[(df.Pclass == pclass+1)]['Fare'].dropna().mean()
        df.loc[((df.Fare.isnull()) & (df.Pclass == pclass+1)), 'FareBucket'] =  classFareMean- (classFareMean%10)
    
    #transform embarked into categorical
    sibsp_categorical_cols = pd.get_dummies(df['SibSp'], prefix='sibsp', drop_first=False)
    df = pd.concat([df, sibsp_categorical_cols], axis=1)
    
    #transform embarked into categorical
    pclass_categorical_cols = pd.get_dummies(df['Pclass'], prefix='pclass', drop_first=False)
    df = pd.concat([df, pclass_categorical_cols], axis=1)
    
    #transform embarked into categorical
    #parch_categorical_cols = pd.get_dummies(df['Parch'], prefix='parch', drop_first=False)
    #df = pd.concat([df, parch_categorical_cols], axis=1)
    
    #transform cabin info
    df['CabinInfo'] = train_data_df.Cabin.astype(str).str[0] #just fetching the cabin A/B/C/D etc and leaving nan as they are and they ll be treated as another class
    cabininfo_categorical_cols = pd.get_dummies(df['CabinInfo'], prefix='cabininfo', drop_first=False)
    df = pd.concat([df, cabininfo_categorical_cols], axis=1)
    
    #drop other cols
    cols_to_drop = ['Name', 'Sex', 'Age', 'SibSp', 'Ticket', 'Fare', 'Pclass', 'Cabin', 'CabinInfo', 'Embarked', 'EmbarkedVal', 'PassengerId']
    df = df.drop(cols_to_drop, axis=1)
    
    return df

In [742]:
def transform_output_data(df):
    #transform gender column into categorical values
    survived_categorical_cols = pd.get_dummies(df['Survived'], prefix='survived', drop_first=False)
    df = pd.concat([df, survived_categorical_cols], axis = 1)
    df = df.drop(['Survived'], axis=1)
    return df

In [743]:
# y train
train_data_y = transform_output_data(pd.DataFrame(train_data_df['Survived']))

# x train
train_data_x = transform_input_data(train_data_df)
train_data_x = train_data_x.drop(['Survived'], axis=1)

# x test
test_data_x = transform_input_data(test_data_df)

In [744]:
#train_data_x
#train_data_y
#train_data_x['FareBucket'].unique()
#train_data_df['Cabin'].unique()
#train_data_df[(train_data_df['Cabin'].isnull())]
#rain_data_df['Parch'].unique()
#train_data_df['CabinInfo'] = train_data_df.Cabin[0]
#train_data_df['CabinInfo'] = train_data_df.Cabin.astype(str).str[0]
#train_data_df['CabinInfo'].unique()
#test_data_x

In [796]:
#start preparing for tensorflow model
X_train_total = train_data_x.values
Y_train_total = train_data_y.values

records = len(X_train_total)
train_test_split_per = 0.9

# training data sets
X_train = X_train_total[:int(records*train_test_split_per)]
Y_train = Y_train_total[:int(records*train_test_split_per)]

# local test data sets
X_test_0 = X_train_total[int(records*train_test_split_per):]
Y_test_0 = Y_train_total[int(records*train_test_split_per):]

# kaggle test data set that should be predicted
X_test_final = test_data_x.values

#hyper parameters
learning_rate = 0.01
epochs = 150
batch_size = 128
display_step = 1

#hyper parameter selection options
#learning_rates = [0.1, 0.09, 0.01, 0.009, 0.001, 0.0001]
#epoch_list = [10, 50, 100, 150]
#batch_sizes = [128, 256, 512]


#shapes of inputs and weights
n_input_features = X_train.shape[1]
n_classes = Y_train.shape[1]

#n_hidden_layer = int(round((n_input_features + n_classes) / 2)) #an approximation to start with
n_hidden_layer = 15

In [797]:
n_hidden_layer

15

In [798]:
#form weights and biases variables
weights = {
    'hidden_layer': tf.Variable(tf.random_normal([n_input_features, n_hidden_layer])),
    'output_layer': tf.Variable(tf.random_normal([n_hidden_layer, n_classes]))
}
biases = {
    'hidden_layer': tf.Variable(tf.random_normal([n_hidden_layer])),
    'output_layer': tf.Variable(tf.random_normal([n_classes]))
}

#tf model
x = tf.placeholder(tf.float32, [None, n_input_features])
y = tf.placeholder(tf.float32, [None, n_classes])

#lr = tf.placeholder(tf.float32)

#hidden layer with ReLU
hidden_layer = tf.add(tf.matmul(x, weights['hidden_layer']), biases['hidden_layer'])
hidden_layer = tf.nn.relu(hidden_layer)
#output layer
logits = tf.add(tf.matmul(hidden_layer, weights['output_layer']), biases['output_layer'])

predictions = tf.argmax(tf.nn.softmax(logits=logits),1)
#cost
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
#optimizer 
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)

#init
init = tf.global_variables_initializer()

#accuracy and predictions
correct_predictions = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))



In [799]:
import math
def get_batches(batch_size, features, labels):
    batch_count = math.ceil(len(features)/batch_size)
    batches = []
    for i in range(batch_count):
        feature_set, labels_set = [],[]
        if i == batch_count - 1:
            feature_set = features[i*batch_size:][:]
            labels_set = labels[i*batch_size:][:]
        else:
            feature_set = features[i*batch_size:(i+1)*batch_size][:]
            labels_set = labels[i*batch_size:(i+1)*batch_size][:]
        current_batch = [feature_set, labels_set]
        batches.append(current_batch)
    return batches 

In [800]:
#start tf session
train_batches = get_batches(batch_size, X_train, Y_train)
   
with tf.Session() as session:
    session.run(init)
    
    #each training cycle
    for epoch_i in range(epochs):
        
        #for all batches
        for batch_features, batch_labels in train_batches:
            
            indices = np.arange(len(batch_features))
            np.random.shuffle(indices)
            
            cv_split_index = int(0.9*(len(batch_features))) #split into cross validation set and training set for this epoch & batch
            
            train_batch_features = batch_features[indices[:cv_split_index]]
            train_batch_labels = batch_labels[indices[:cv_split_index]]
            
            val_batch_features = batch_features[indices[cv_split_index:]]
            val_batch_labels = batch_labels[indices[cv_split_index:]]
            
            train_feed_dict = {x: train_batch_features, y: train_batch_labels}
            #run the optimizer
            session.run(optimizer, feed_dict=train_feed_dict)
            
        # Display logs per epoch step
        if epoch_i % display_step == 0:
            current_cost = session.run(cost, feed_dict={x: train_batch_features, y: train_batch_labels})
            validation_accuracy = session.run(accuracy,  feed_dict={x: val_batch_features, y: val_batch_labels})
            print("Epoch:", '%04d' % (epoch_i+1), "cost=", "{:.9f}".format(current_cost), "val acc:", "{:.9f}".format(validation_accuracy))
            
    print("Optimization Finished!")
    
    #testing the model
    
    test_accuracy = session.run(accuracy,  feed_dict={x: X_test_0, y: Y_test_0, lr: learn_rate})
    print("Test Accuracy: {}".format(test_accuracy))
    
    #predict for test data - kaggle output data
    output = session.run(predictions, feed_dict={x: X_test_final, lr: learn_rate})
    print(output)

Epoch: 0001 cost= 11.795425415 val acc: nan
Epoch: 0002 cost= 18.160991669 val acc: nan
Epoch: 0003 cost= 7.282359600 val acc: nan
Epoch: 0004 cost= 6.239048004 val acc: nan
Epoch: 0005 cost= 4.836873531 val acc: nan
Epoch: 0006 cost= 1.495831251 val acc: nan
Epoch: 0007 cost= 3.208735228 val acc: nan
Epoch: 0008 cost= 5.939539433 val acc: nan
Epoch: 0009 cost= 4.376293182 val acc: nan
Epoch: 0010 cost= 1.727697611 val acc: nan
Epoch: 0011 cost= 2.343883753 val acc: nan
Epoch: 0012 cost= 1.088119864 val acc: nan
Epoch: 0013 cost= 1.458680391 val acc: nan
Epoch: 0014 cost= 2.924477816 val acc: nan
Epoch: 0015 cost= 1.019582152 val acc: nan
Epoch: 0016 cost= 1.745906830 val acc: nan
Epoch: 0017 cost= 0.733402967 val acc: nan
Epoch: 0018 cost= 0.927239776 val acc: nan
Epoch: 0019 cost= 1.061872244 val acc: nan
Epoch: 0020 cost= 0.875206947 val acc: nan
Epoch: 0021 cost= 0.703107655 val acc: nan
Epoch: 0022 cost= 0.872576952 val acc: nan
Epoch: 0023 cost= 0.999517024 val acc: nan
Epoch: 00

In [793]:
output

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0,

In [794]:
result = pd.concat((test_data_df['PassengerId'], pd.DataFrame(output)), axis=1)
result.to_csv('result_pd.csv', index_label=False, index =False)