In [40]:
# Required imports.
import pandas as pd
import numpy as np
import tensorflow as tf

In [41]:
complete_train_data = pd.read_csv('../input/train.csv')


In [42]:
complete_train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [43]:
complete_train_data['Sex_numeric'] = 1
complete_train_data.loc[complete_train_data['Sex'] == 'female','Sex_numeric'] = 0

In [44]:
REL_COLUMNS = ['Sex_numeric', 'Pclass']

### Check for data sanity before we move on.

In [45]:
complete_train_data[['Sex_numeric']].describe()

Unnamed: 0,Sex_numeric
count,891.0
mean,0.647587
std,0.47799
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [46]:
complete_train_data['Sex_numeric'].isnull().values.any()


False

In [47]:
complete_train_data['Sex_numeric'].unique()

array([1, 0])

In [48]:
complete_train_data[['Pclass']].describe()

Unnamed: 0,Pclass
count,891.0
mean,2.308642
std,0.836071
min,1.0
25%,2.0
50%,3.0
75%,3.0
max,3.0


In [49]:
complete_train_data['Pclass'].isnull().values.any()

False

In [50]:
complete_train_data['Pclass'].unique()

array([3, 1, 2])

In [51]:
complete_train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Sex_numeric'],
      dtype='object')

In [52]:
LABEL = 'Survived'

### Going to tensorflow specifics

In [53]:
NUM_ENTRIES = len(complete_train_data)
NUM_TRAIN_ENTRIES = int(0.8 * NUM_ENTRIES)

train_df = complete_train_data[:NUM_TRAIN_ENTRIES]
valid_df = complete_train_data[NUM_TRAIN_ENTRIES:]

In [54]:
NUM_ENTRIES

891

In [55]:
NUM_TRAIN_ENTRIES

712

In [56]:
len(train_df)

712

In [57]:
len(valid_df)

179

In [58]:
def make_train_input_fn(df, num_epochs):
    return tf.estimator.inputs.pandas_input_fn(
        x=df,
        y=df[LABEL],
        shuffle=True,
        num_epochs=num_epochs)

In [59]:
def make_eval_input_fn(df):
    return tf.estimator.inputs.pandas_input_fn(
        x=df,
        y=df[LABEL],
        shuffle=False)

In [60]:
def make_prediction_input_fn(df):
    return tf.estimator.inputs.pandas_input_fn(
        x=df,
        y=None,
        shuffle=False)    

In [61]:
def make_features():
    input_cols = [tf.feature_column.numeric_column(col) for col in REL_COLUMNS]
    return input_cols
# TEST : Can have categorical columns with vocabulary list as well here.

In [62]:
[x for x in np.arange(0, 7, 1)]

[0, 1, 2, 3, 4, 5, 6]

In [63]:
def get_bucketize_col(col_name):
    if col_name == 'Sex_numeric':
        bucket_col = tf.feature_column.bucketized_column(source_column=tf.feature_column.numeric_column(col_name), 
                                                         boundaries=[0,1])
    elif col_name == 'Pclass':
        bucket_col = tf.feature_column.bucketized_column(source_column=tf.feature_column.numeric_column(col_name), 
                                                         boundaries=[1,2,3])
    elif col_name == 'Age':
        bucket_col = tf.feature_column.bucketized_column(source_column=tf.feature_column.numeric_column(col_name), 
                                                         boundaries=[x for x in np.arange(0, 80, 10)])
    elif col_name == 'Fare':
        boundaries = [x for x in np.arange(0, 50, 10)]
        boundaries.extend([70, 100, 120, 140, 200, 300, 500])
        print(boundaries)
        bucket_col = tf.feature_column.bucketized_column(source_column=tf.feature_column.numeric_column(col_name), 
                                                         boundaries=boundaries)
    elif col_name == 'Parch':
        bucket_col = tf.feature_column.bucketized_column(source_column=tf.feature_column.numeric_column(col_name), 
                                                         boundaries=[x for x in np.arange(0, 7, 1)])
    elif col_name == 'SibSp':
        bucket_col = tf.feature_column.bucketized_column(source_column=tf.feature_column.numeric_column(col_name), 
                                                         boundaries=[x for x in np.arange(0, 9, 1)])
        
        
    return bucket_col

In [64]:
def make_features_bucketize():
    input_cols = [get_bucketize_col(col) for col in REL_COLUMNS]
    return input_cols
# TEST : Can have categorical columns with vocabulary list as well here.

In [65]:
REL_COLUMNS

['Sex_numeric', 'Pclass']

In [66]:
REL_COLUMNS_INCL_PREDICTOR = REL_COLUMNS.copy()
REL_COLUMNS_INCL_PREDICTOR.append('Survived')
print(REL_COLUMNS_INCL_PREDICTOR)

['Sex_numeric', 'Pclass', 'Survived']


In [67]:
def print_accuracy(model, df):
  metrics = model.evaluate(input_fn = make_eval_input_fn(df[REL_COLUMNS_INCL_PREDICTOR]))
  print('Accuracy on dataset = {}'.format(metrics['accuracy']))


In [68]:
def get_accuracy(model, df, relevant_cols):
  metrics = model.evaluate(input_fn = make_eval_input_fn(df[relevant_cols]))
  return metrics['accuracy']

In [69]:
## TEST Do cross validation.
## TEST Generalize for all columns , including ones which require imputation.
complete_train_data[2:].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_numeric
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1


In [70]:
complete_train_data[:3].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_numeric
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0


In [71]:
len(complete_train_data)

891

In [72]:
complete_train_data[1:2]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_numeric
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0


In [73]:
complete_train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_numeric
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


In [74]:
len(pd.concat([complete_train_data[1:2], complete_train_data[3:]]))

889

In [75]:
NUM_CROSS_VALIDATIONS = 10

In [76]:
def get_cross_val_score(complete_train_data, 
                        num_cross_validations, 
                        relevant_cols, 
                        model_type='Linear',
                        impute_with_mean_cols=[],
                        dnn_hidden_units=[]):
    num_validation_entries = int(len(complete_train_data)/num_cross_validations)
    import shutil
    OUTDIR = 'classification_outputs'
    def internal_make_features_linear():
        input_cols = [tf.feature_column.numeric_column(col) for col in relevant_cols if col != LABEL]
        return input_cols

    def internal_make_features_bucketize():
        input_cols = [get_bucketize_col(col) for col in relevant_cols if col != LABEL]
        return input_cols
    
    sum_accuracy = 0
    for i in range(num_cross_validations):
        validation_start_index = i * num_validation_entries
        validation_end_index = validation_start_index + num_validation_entries
        validation_data = complete_train_data[validation_start_index:validation_end_index].copy()
        train_data = \
            pd.concat([complete_train_data[0:validation_start_index], 
                       complete_train_data[validation_end_index:]]).copy()

        #print(train_data[relevant_cols].isnull().values.any())
        #print(train_data[relevant_cols].isnull().any())        
        for col in impute_with_mean_cols:
            train_data[col].fillna((train_data[col].mean()), inplace=True)
            validation_data[col].fillna((train_data[col].mean()), inplace=True)            
            #train_data[col].fillna((32), inplace=True)
        #print(train_data.isnull().values.any())
        #print(train_data.isnull().any())                
        tf.logging.set_verbosity(tf.logging.ERROR)
        shutil.rmtree(OUTDIR, ignore_errors=True)
        if model_type == 'BoostedTrees':
            model = tf.estimator.BoostedTreesClassifier(feature_columns=internal_make_features_bucketize(), 
                                                        model_dir=OUTDIR,
                                                        n_batches_per_layer=1)
        elif model_type == 'Linear':
            model = tf.estimator.LinearClassifier(feature_columns=internal_make_features_linear(), 
                                                  model_dir=OUTDIR)
        elif model_type == 'DNN':
            model = tf.estimator.DNNClassifier(feature_columns=internal_make_features_linear(),
                                               hidden_units=dnn_hidden_units,
                                               model_dir=OUTDIR)
            
        model.train(input_fn=make_train_input_fn(train_data[relevant_cols], num_epochs=100))    

        accuracy = get_accuracy(model, validation_data, relevant_cols)
        sum_accuracy += accuracy


    print('Cross val score is {}', sum_accuracy/NUM_CROSS_VALIDATIONS)

In [77]:
REL_COLUMNS

['Sex_numeric', 'Pclass']

In [39]:
get_cross_val_score(complete_train_data, 
                    NUM_CROSS_VALIDATIONS, 
                    ['Sex_numeric', 'Pclass', 'Survived'],
                    'Linear')

KeyboardInterrupt: 

In [None]:
get_cross_val_score(complete_train_data, 
                    NUM_CROSS_VALIDATIONS, 
                    ['Sex_numeric', 'Pclass', 'Survived'],
                    'BoostedTrees')

In [None]:
bounda = [x for x in np.arange(0, 80, 10)].append(100)
bounda

In [None]:
bounda

### Adding Age
Here , we will need to take care of imputing missing values.

In [None]:
get_cross_val_score(complete_train_data, 
                    NUM_CROSS_VALIDATIONS, 
                    ['Sex_numeric', 'Pclass', 'Survived', 'Age'],
                    'BoostedTrees',
                    ['Age'])

In [None]:
get_cross_val_score(complete_train_data, 
                    NUM_CROSS_VALIDATIONS, 
                    ['Sex_numeric', 'Pclass', 'Survived', 'Age'],
                    'Linear',
                    ['Age'])

In [None]:
import matplotlib.pyplot as plt
complete_train_data['Age'].plot(kind='hist')
plt.show()

In [None]:
complete_train_data.columns

In [None]:
np.arange(0, 80, 10)

In [None]:
complete_train_data['Fare'].describe()

In [None]:
complete_train_data['Fare'].isnull().values.any()


## Adding Fare

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1,1,figsize=(16, 9))

complete_train_data['Fare'].plot(kind='hist', ax=ax)
ax.set_xticks(np.arange(0, 150, 10))
plt.show()

In [None]:
get_cross_val_score(complete_train_data, 
                    NUM_CROSS_VALIDATIONS, 
                    ['Sex_numeric', 'Pclass', 'Survived', 'Age', 'Fare'],
                    'BoostedTrees',
                    ['Age'])

In [None]:
get_cross_val_score(complete_train_data, 
                    NUM_CROSS_VALIDATIONS, 
                    ['Sex_numeric', 'Pclass', 'Survived', 'Age', 'Fare'],
                    'Linear',
                    ['Age'])

## Adding Parch, SibSp

In [None]:
complete_train_data['Parch'].isnull().values.any()


In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1,1,figsize=(16, 9))

complete_train_data['Parch'].plot(kind='hist', ax=ax)
#ax.set_xticks(np.arange(0, 150, 10))
plt.show()

In [None]:
complete_train_data['Parch'].describe()

In [None]:
complete_train_data['SibSp'].isnull().values.any()


In [None]:
complete_train_data['SibSp'].describe()

In [None]:
get_cross_val_score(complete_train_data, 
                    NUM_CROSS_VALIDATIONS, 
                    ['Sex_numeric', 'Pclass', 'Survived', 'Age', 'Fare', 'Parch', 'SibSp'],
                    'BoostedTrees',
                    ['Age'])

In [None]:
get_cross_val_score(complete_train_data, 
                    NUM_CROSS_VALIDATIONS, 
                    ['Sex_numeric', 'Pclass', 'Survived', 'Age', 'Fare', 'Parch', 'SibSp'],
                    'Linear',
                    ['Age'])

## Try DNN's now

In [None]:
get_cross_val_score(complete_train_data, 
                    NUM_CROSS_VALIDATIONS, 
                    ['Sex_numeric', 'Pclass', 'Survived', 'Age', 'Fare', 'Parch', 'SibSp'],
                    'DNN',
                    ['Age'],
                    [10, 10, 1])

### Summary

There does not look to be much of a benefit from tensorflow usage, possibly due to lack of data. Deep learning techniques are much better suited for heavy data problems.

However, for the sake of completion, let us make predictions using the model yieldling the best cross validation score and see how it fares in the training set.

### Making predictions

In [136]:
final_cols = ['Sex_numeric', 'Pclass', 'Survived', 'Age', 'Fare', 'Parch', 'SibSp']
OUTDIR = 'classification_outputs'
def temp_make_features_linear():
    print(final_cols)
    return [tf.feature_column.numeric_column(col) 
            for col in final_cols if col != LABEL]    
import shutil
shutil.rmtree(OUTDIR, ignore_errors=True)

model = tf.estimator.LinearClassifier(feature_columns=temp_make_features_linear(), 
                                      model_dir=OUTDIR)
new_complete_data = complete_train_data.copy()
new_complete_data['Age'].fillna((new_complete_data['Age'].mean()), inplace=True)

model.train(input_fn=make_train_input_fn(new_complete_data[final_cols], num_epochs=100))    


['Sex_numeric', 'Pclass', 'Survived', 'Age', 'Fare', 'Parch', 'SibSp']


<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x10d5bc080>

In [137]:
complete_test_data = pd.read_csv('../input/test.csv')


In [138]:
complete_test_data['Sex_numeric'] = 1
complete_test_data.loc[complete_test_data['Sex'] == 'female','Sex_numeric'] = 0

In [139]:
complete_test_data['Age'].fillna((complete_train_data['Age'].mean()), inplace=True)
complete_test_data['Fare'].fillna((complete_train_data['Fare'].mean()), inplace=True)

In [140]:
complete_test_data.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Sex_numeric'],
      dtype='object')

In [83]:
predictor_cols = ['Sex_numeric', 'Pclass',  'Age', 'Fare', 'Parch', 'SibSp']
complete_test_data[predictor_cols].isnull().any()


Sex_numeric    False
Pclass         False
Age            False
Fare           False
Parch          False
SibSp          False
dtype: bool

In [141]:
predictions = model.predict(
    input_fn=make_prediction_input_fn(complete_test_data[predictor_cols]))

In [None]:
predictions

In [85]:
print(len(predictions))

TypeError: object of type 'generator' has no len()

In [87]:
print(len(complete_test_data))

418


In [134]:
print(len(complete_test_data[predictor_cols]))

418


In [99]:

for i in predictions:
    print(i)

In [None]:
print(l)

In [95]:
complete_test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_numeric
0,892,3,"Kelly, Mr. James",male,34.500000,0,0,330911,7.8292,,Q,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.000000,1,0,363272,7.0000,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.000000,0,0,240276,9.6875,,Q,1
3,895,3,"Wirz, Mr. Albert",male,27.000000,0,0,315154,8.6625,,S,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.000000,1,1,3101298,12.2875,,S,0
5,897,3,"Svensson, Mr. Johan Cervin",male,14.000000,0,0,7538,9.2250,,S,1
6,898,3,"Connolly, Miss. Kate",female,30.000000,0,0,330972,7.6292,,Q,0
7,899,2,"Caldwell, Mr. Albert Francis",male,26.000000,1,1,248738,29.0000,,S,1
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.000000,0,0,2657,7.2292,,C,0
9,901,3,"Davies, Mr. John Samuel",male,21.000000,2,0,A/4 48871,24.1500,,S,1


In [142]:
test_result = pd.DataFrame(columns=['PassengerId', 'Survived'])

In [143]:
test_result['PassengerId'] = complete_test_data['PassengerId']

In [144]:
expected = [0, 1]
count = 0
for pred_dict in predictions:
    passenger_id = complete_test_data['PassengerId'][count]
    class_id = pred_dict['class_ids'][0]
    probability = pred_dict['probabilities'][class_id]


    test_result.loc[count, 'Survived'] = class_id
    count += 1


In [146]:
count

418

In [145]:
test_result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [147]:
test_result.to_csv('tensorflow_out.csv', index=False)