In [1]:
import pandas as pd
def clean_data(data_path, train_data=True):
    """Load a Titanic CSV and build the feature frame used by the model.

    Derived columns:
      * ``Family``: the part of ``Name`` before the first ``', '``.
      * ``Deck``: first character of ``Cabin``, or ``'U'`` when ``Cabin`` is
        missing.
      * ``FamilyMembers``: number of rows in this file sharing the same
        ``Family`` value.

    Missing ``Age`` values are replaced by the mean ``Age`` of this same file.
    Any other missing value in the returned columns is replaced by ``''``.

    Args:
        data_path: Path of the CSV file (anything ``pandas.read_csv`` accepts).
        train_data: When True the ``Survived`` label column is included in the
            result; when False it is left out.

    Returns:
        A new, independent DataFrame. Column order is
        ``PassengerId, Family, Pclass, Sex, Age, FamilyMembers, Deck, Embarked,
        Survived`` when ``train_data`` is True and
        ``PassengerId, Family, Pclass, Sex, Age, FamilyMembers, Embarked, Deck``
        otherwise.

    Raises:
        KeyError: If a required column is absent from the CSV.
    """
    raw = pd.read_csv(data_path)

    # The two modes historically return Deck/Embarked in a different order;
    # that order is preserved so existing callers see the same frame.
    if train_data:
        output_columns = ['PassengerId', 'Family', 'Pclass',
                          'Sex', 'Age', 'FamilyMembers',
                          'Deck', 'Embarked', 'Survived']
    else:
        output_columns = ['PassengerId', 'Family', 'Pclass',
                          'Sex', 'Age', 'FamilyMembers', 'Embarked',
                          'Deck']
    # FamilyMembers is computed after the blank-filling step below.
    source_columns = [col for col in output_columns if col != 'FamilyMembers']

    cabin = raw['Cabin']
    derived = raw.assign(
        Family=raw['Name'].str.split(', ').str[0],
        # astype(str) keeps this working when every Cabin is missing and the
        # column was therefore parsed as float; missing cabins become 'U'.
        Deck=cabin.astype(str).str[0].where(cabin.notna(), 'U'),
        Age=raw['Age'].fillna(raw['Age'].mean()),
    )

    # Only blank-fill the columns that are returned, so unrelated numeric
    # columns (e.g. Fare) are not turned into strings along the way.
    features = derived[source_columns].fillna('')

    # Count after filling so rows with a blank Family are counted together
    # instead of being dropped by groupby.
    features['FamilyMembers'] = (
        features.groupby('Family')['Family'].transform('count')
    )

    # Explicit copy: callers add columns to the result afterwards.
    return features[output_columns].copy()


In [2]:
# Clean the labelled data, then hold out the 10% of rows that the seeded
# 90% sample did not pick.
data = clean_data('train.csv')
train_df = data.sample(frac=0.9, random_state=0)
test_df = data.drop(index=train_df.index)

# pop() removes 'Survived' from each frame in place, so train_df / test_df
# hold features only from here on.
train_labels, test_labels = train_df.pop('Survived'), test_df.pop('Survived')

# Aliases used by the training and evaluation cells (same objects, no copies).
train_x, train_y = train_df, train_labels
test_x, test_y = test_df, test_labels

In [3]:
# Preview the training features; 'Survived' is gone because the split cell popped it off.
train_df.head()

Unnamed: 0,PassengerId,Family,Pclass,Sex,Age,FamilyMembers,Deck,Embarked
495,496,Yousseff,3,male,29.699118,1,U,C
648,649,Willey,3,male,29.699118,1,U,S
278,279,Rice,3,male,7.0,5,U,Q
31,32,Spencer,1,female,29.699118,1,B,C
255,256,Touma,3,female,29.0,1,U,C


In [4]:
import titanic_estimator

# Train the classifier exposed by the project-local titanic_estimator module
# on the training split. input_fn is a zero-argument lambda, so train_x and
# train_y are looked up when the lambda is called rather than when it is defined.
# NOTE(review): batch_size=50 and max_steps=4501 are unexplained magic numbers;
# the later cells restore model.ckpt-4501, which matches max_steps here.
titanic_estimator.classifier.train(
    input_fn=lambda: titanic_estimator.input_fn(train_x, train_y, batch_size=50),
    max_steps = 4501
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Aryan\\Source\\Kaggle\\titanic\\titanic_model\\test', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000026A0BDF3128>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Create a `tf.sparse.SparseTens

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x26a09a5ae48>

In [5]:
# NOTE(review): titanic_estimator is already imported by the training cell;
# the repeat import only matters if this cell is run without that one.
import titanic_estimator
# Score the trained classifier on the held-out rows (test_x / test_y).
eval_result = titanic_estimator.classifier.evaluate(
    input_fn=lambda: titanic_estimator.eval_input_fn(test_x, test_y, batch_size=50))
# eval_result is a dict of metric name -> value; the second print reads its
# 'accuracy' entry through str.format(**eval_result).
print(eval_result)
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-02-26-07:27:59
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\Aryan\Source\Kaggle\titanic\titanic_model\test\model.ckpt-4501
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-02-26-07:27:59
INFO:tensorflow:Saving dict for global step 4501: accuracy = 0.8426966, accuracy_baseline = 0.5280899, auc = 0.8779128, auc_precision_recall = 0.90474355, average_loss = 1.3729471, global_step = 4501, label/mean = 0.47191012, loss = 61.096146, precision = 0.8684211, prediction/mean = 0.42663875, recall = 0.78571427
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 4501: C:\Users\Aryan\Source\Kaggle\titanic\titanic_model\test\model.ckpt-4501
{'accuracy': 0.8426966, 'accuracy_baseline': 0.5280899, 'auc': 0.8779128, 'auc_precision_recall': 0.90474355, 'aver

In [6]:
# Read and clean the unlabeled test set; train_data=False makes clean_data
# return the column list without 'Survived'.
test_data_path = 'test.csv'
test = clean_data(test_data_path, train_data=False)

In [7]:
# Request predictions for the cleaned test frame; labels=None because this
# frame has no 'Survived' column.
# NOTE(review): the TensorFlow log lines show up under the next cell, which
# suggests predict() is lazy and nothing runs until the result is iterated --
# confirm against the Estimator.predict documentation.
predictions = titanic_estimator.classifier.predict(
    input_fn=lambda: titanic_estimator.eval_input_fn(test, labels=None, batch_size=1000)
)

In [8]:
# Materialise the predictions; iterating is what triggers the TensorFlow
# session whose log lines appear below this cell.
preds = list(predictions)

# Fail fast on a count mismatch instead of leaving rows without a prediction
# or writing predictions against the wrong rows.
if len(preds) != len(test):
    raise ValueError(
        'Expected {} predictions but got {}'.format(len(test), len(preds))
    )

# One positional column assignment instead of a label-based test.at[i, ...]
# loop, so the result does not depend on test having a 0..n-1 index.
# A truthy first entry of 'class_ids' is recorded as 1 (survived), otherwise 0.
test['Survived'] = [1 if pred['class_ids'][0] else 0 for pred in preds]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\Aryan\Source\Kaggle\titanic\titanic_model\test\model.ckpt-4501
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [9]:
# Build the two-column submission frame from a fresh read of test.csv.
# The predicted labels are attached by index alignment with `test`.
test_to_submit = (
    pd.read_csv('test.csv')
    .assign(Survived=test['Survived'].astype('int'))
    .loc[:, ['PassengerId', 'Survived']]
)

In [10]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
test_to_submit.to_csv('submission_15.csv', index=False)

In [None]:
# Display the submission frame as a final visual check (this cell has no execution count).
test_to_submit