In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pp
%matplotlib inline

In [2]:
import tensorflow as tf

In [3]:
df_train = pd.read_csv('titanic_train.csv')
df_test = pd.read_csv('titanic_test.csv')

In [4]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df_train.loc[df_train['Embarked'].isnull(), 'Embarked'] = 'S'
df_test.loc[df_test['Embarked'].isnull(), 'Embarked'] = 'S'

In [6]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [7]:
df_train.loc[df_train['Cabin'].isnull() , 'Cabin'] = 'N'
df_test.loc[df_test['Cabin'].isnull(), 'Cabin'] = 'N'
#df_train['Cabin'] = df_train['Cabin'].astype(str).str[0].apply(lambda x : x.upper())

In [8]:
def imputeAge(cols):
    Pclass = cols[0]
    Age = cols[1]
    if pd.isnull(Age):
        if Pclass == 1:
            return 38
        elif Pclass == 2:
            return 29
        elif Pclass == 3:
            return 24
    else:
        return Age


In [9]:
df_train['Age'] = df_train[['Pclass','Age']].apply(imputeAge, axis = 1)
df_test['Age'] = df_test[['Pclass','Age']].apply(imputeAge, axis = 1)


In [10]:
df_train['LastName'] = df_train['Name'].apply(lambda x: x.split(',')[0])
df_train.drop(['Name'], inplace=True, axis = 1)
df_test['LastName'] = df_test['Name'].apply(lambda x: x.split(',')[0])
df_test.drop(['Name'], inplace=True, axis = 1)

In [11]:
def imputeTicket(value):
    tokens = value.split(' ')
    if len(tokens) > 1:
        return tokens[1]
    return value

df_train.drop(['Ticket'], inplace=True, axis=1)
df_test.drop(['Ticket'], inplace=True, axis=1)
#df_train['Ticket'] = df_train['Ticket'].apply(imputeTicket)


In [12]:
#df_train['Ticket'].apply(lambda x : not x.isdigit()).sum()
df_train['Parch'].unique()

array([0, 1, 2, 5, 3, 4, 6], dtype=int64)

In [13]:
X = df_train.drop(['PassengerId', 'Survived'], axis = 1)
y = df_train['Survived']


In [14]:
# Tensor flow features columns should be added in the same order as data frame
feat_cols = []

pclass_cat_col = tf.feature_column.categorical_column_with_identity(key='Pclass', num_buckets=4)
feat_cols.append(tf.feature_column.indicator_column(pclass_cat_col))

sex_cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key='Sex', vocabulary_list=["male", "female"])
feat_cols.append(tf.feature_column.indicator_column(sex_cat_col))

numeric_age_column = tf.feature_column.numeric_column("Age")
#feat_cols.append(numeric_age_column)
feat_cols.append(tf.feature_column.bucketized_column(source_column = numeric_age_column,\
                                                                       boundaries = np.arange(10,80,10).tolist()))

feat_cols.append(tf.feature_column.numeric_column('SibSp'))
feat_cols.append(tf.feature_column.numeric_column('Parch'))

#ticket_cat_col=tf.feature_column.categorical_column_with_hash_bucket(  \
#                                                                    key = "Ticket",  \
#                                                                    hash_bucket_size = 10)

#feat_cols.append(tf.feature_column.indicator_column(ticket_cat_col))

feat_cols.append(tf.feature_column.numeric_column('Fare'))

cab_cat_col = tf.feature_column.categorical_column_with_vocabulary_list(  \
                                key='Cabin',    \
                                vocabulary_list= X['Cabin'].unique())
feat_cols.append(tf.feature_column.indicator_column(cab_cat_col))

emb_cat_col = tf.feature_column.categorical_column_with_vocabulary_list(  \
                                key='Embarked',  \
                                vocabulary_list=['S','C', 'Q'])
feat_cols.append(tf.feature_column.indicator_column(emb_cat_col))

lname_cat_col=tf.feature_column.categorical_column_with_hash_bucket(  \
                                                                    key = "LastName",  \
                                                                    hash_bucket_size = 10)
feat_cols.append(tf.feature_column.indicator_column(lname_cat_col))

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.3, random_state=101)

In [17]:
train_func=tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train, batch_size=128,num_epochs=5,shuffle=True)

In [18]:
dnn = tf.estimator.DNNClassifier(hidden_units=[10,20,10],feature_columns=feat_cols, n_classes=2, \
                                 optimizer='Adam', \
                                 batch_norm=True)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\ashish\\AppData\\Local\\Temp\\tmpjk0m2le9', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001F748575DA0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [19]:
dnn.train(input_fn=train_func)

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into C:\Users\ashish\AppData\Local\Temp\tmpjk0m2le9\model.ckpt.
INFO:tensorflow:loss = 91.86604, step = 1
INFO:tensorflow:Saving checkpoints for 25 into C:\Users\ashish\AppData\Local\Temp\tmpjk0m2le9\model.ckpt.
INFO:tensorflow:Loss for final step: 16.760506.


<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1f7485759e8>

In [20]:
test_func = tf.estimator.inputs.pandas_input_fn(x=X_test, batch_size=X_test.shape[0],num_epochs=1, shuffle=False)

In [21]:
predictions = list(dnn.predict(input_fn=test_func))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ashish\AppData\Local\Temp\tmpjk0m2le9\model.ckpt-25
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [22]:
predictions[:1]

[{'logits': array([-0.7344139], dtype=float32),
  'logistic': array([0.3242269], dtype=float32),
  'probabilities': array([0.6757731 , 0.32422686], dtype=float32),
  'class_ids': array([0], dtype=int64),
  'classes': array([b'0'], dtype=object)}]

In [23]:
y_pred = [i['class_ids'][0] for i in predictions]

In [24]:
np.mean(y_test != y_pred)

0.2574626865671642

In [25]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [26]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
accuracy_score(y_test, y_pred)

[[123  31]
 [ 38  76]]
              precision    recall  f1-score   support

           0       0.76      0.80      0.78       154
           1       0.71      0.67      0.69       114

   micro avg       0.74      0.74      0.74       268
   macro avg       0.74      0.73      0.73       268
weighted avg       0.74      0.74      0.74       268



0.7425373134328358

In [27]:
X_cv = df_test.drop(['PassengerId'], axis = 1)

In [28]:
X_cv.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,LastName
0,3,male,34.5,0,0,7.8292,N,Q,Kelly
1,3,female,47.0,1,0,7.0,N,S,Wilkes
2,2,male,62.0,0,0,9.6875,N,Q,Myles
3,3,male,27.0,0,0,8.6625,N,S,Wirz
4,3,female,22.0,1,1,12.2875,N,S,Hirvonen


In [29]:
cv_func = tf.estimator.inputs.pandas_input_fn(x=X_cv,batch_size=X_cv.shape[0],num_epochs=1,shuffle=False)

In [31]:
predictions = list(dnn.predict(input_fn=cv_func))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ashish\AppData\Local\Temp\tmpjk0m2le9\model.ckpt-25
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [32]:
y_cv = [i['class_ids'][0] for i in predictions]

In [33]:
y_cv[:10]

[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]

In [34]:
submission = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_cv})
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [35]:
submission.to_csv('submission_tf.csv', index=False)