In [285]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import math

In [286]:
sns.set_theme()

In [287]:
train_data = pd.read_csv('train.csv')

In [288]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## Data format
- Passenger id : int uuid
- Survived 0 or 1
- Pclass : 1, 2 or 3
- Sex : 'male' or 'female' **String only**
- Age : has **nan** values, should replace by median age of the rest of the dataset
- SibSp : 0,1,2,3,4,5,8
- Parch : 0,1,2,3,4,5,6
- Ticket : Number or string + number or just string
- Fare : Number
- Cabin : String + Number or **nan**, some have duplicate entries
- Embarked : 'S', 'C', 'Q' or **nan**, **String only**

In [289]:
avg_age = math.floor(train_data['Age'].mean())
train_data['Age'] = train_data['Age'].fillna(avg_age)

In [290]:
train_data['Embarked'] = train_data['Embarked'].fillna('U')

In [291]:
## remove columns that I don't feel like parsing atm

train_data.drop('Cabin', inplace = True, axis = 1)
train_data.drop('Ticket', inplace=True, axis = 1)
train_data.drop('Name', inplace=True, axis=1)
train_data.drop('PassengerId', inplace=True, axis = 1)

## Encode categorical attributes

In [292]:
one_hot_sex = pd.get_dummies(train_data['Sex'])  # haha !
one_hot_Embarked = pd.get_dummies(train_data['Embarked'])
train_data.drop('Sex', inplace=True, axis=1)
train_data.drop('Embarked', inplace=True, axis=1)

In [293]:
train_data = train_data.join(one_hot_sex)
train_data = train_data.join(one_hot_Embarked)

In [294]:
train_data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S,U
0,0,3,22.0,1,0,7.25,0,1,0,0,1,0
1,1,1,38.0,1,0,71.2833,1,0,1,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,1,0
3,1,1,35.0,1,0,53.1,1,0,0,0,1,0
4,0,3,35.0,0,0,8.05,0,1,0,0,1,0


In [295]:
y_train = train_data.pop('Survived')
X_train = train_data


In [296]:
y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [297]:
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S,U
0,3,22.0,1,0,7.25,0,1,0,0,1,0
1,1,38.0,1,0,71.2833,1,0,1,0,0,0
2,3,26.0,0,0,7.925,1,0,0,0,1,0
3,1,35.0,1,0,53.1,1,0,0,0,1,0
4,3,35.0,0,0,8.05,0,1,0,0,1,0


In [298]:
bad_data = X_train[X_train.isna().any(axis=1)]

In [299]:
bad_data.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S,U


In [302]:
X_train.isna().any()

Pclass    False
Age       False
SibSp     False
Parch     False
Fare      False
female    False
male      False
C         False
Q         False
S         False
U         False
dtype: bool

## Convert to Tensorflow model input

In [222]:
X_train = tf.convert_to_tensor(X_train)
y_train = tf.convert_to_tensor(y_train)

In [223]:
X_train.shape

TensorShape([891, 11])

In [205]:
# normalizer = tf.keras.layers.Normalization(axis=-1)
# normalizer.adapt(X_train)

In [224]:
X_train = tf.linalg.normalize(
    X_train, ord='euclidean', axis = -1
)

In [227]:
X_train = X_train[0]

In [229]:
train_ds = tf.data.Dataset.from_tensors((X_train,y_train))

## Create Model

In [235]:
def create_non_seq_model():
    input_layer = tf.keras.layers.Input(shape=[11,])
    hidden1 = tf.keras.layers.Dense(10, activation='relu')(input_layer)
    hidden2 = tf.keras.layers.Dense(10, activation='relu')(hidden1)
    concat = tf.keras.layers.Concatenate()([input_layer, hidden2])
    output = tf.keras.layers.Dense(1, activation='softmax')(concat)
    model = tf.keras.Model(inputs=[input_layer], outputs=[output])
    return model

In [236]:
m = create_non_seq_model()

In [237]:
loss_fn = tf.keras.losses.MeanSquaredError()
m.compile(optimizer='adam', loss=loss_fn)

In [238]:
m.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 11)]         0           []                               
                                                                                                  
 dense_9 (Dense)                (None, 10)           120         ['input_4[0][0]']                
                                                                                                  
 dense_10 (Dense)               (None, 10)           110         ['dense_9[0][0]']                
                                                                                                  
 concatenate_3 (Concatenate)    (None, 21)           0           ['input_4[0][0]',                
                                                                  'dense_10[0][0]']         

## Training

In [239]:
m.fit(train_ds, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x151af12ec10>

## On test data

In [252]:
test_data = pd.read_csv('test.csv')

In [253]:
avg_age = math.floor(train_data['Age'].mean())
test_data['Age'] = test_data['Age'].fillna(avg_age)

In [254]:
test_data['Embarked'] = test_data['Embarked'].fillna('U')


In [255]:
one_hot_sex = pd.get_dummies(test_data['Sex'])  # haha !
one_hot_Embarked = pd.get_dummies(test_data['Embarked'])
test_data.drop('Sex', inplace=True, axis=1)
test_data.drop('Embarked', inplace=True, axis=1)

In [256]:
test_data = test_data.join(one_hot_sex)
test_data = test_data.join(one_hot_Embarked)

In [257]:
test_data.drop('Cabin', inplace = True, axis = 1)
test_data.drop('Ticket', inplace=True, axis = 1)
test_data.drop('Name', inplace=True, axis=1)
test_data.drop('PassengerId',inplace=True, axis=1)

In [258]:
test_data['U'] = 0

In [259]:
test_data.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S,U
0,3,34.5,0,0,7.8292,0,1,0,1,0,0
1,3,47.0,1,0,7.0,1,0,0,0,1,0
2,2,62.0,0,0,9.6875,0,1,0,1,0,0
3,3,27.0,0,0,8.6625,0,1,0,0,1,0
4,3,22.0,1,1,12.2875,1,0,0,0,1,0


In [260]:
X_test = tf.convert_to_tensor(test_data)
X_test =tf.linalg.normalize(
    X_test, ord = 'euclidean', axis = -1
)

In [261]:
X_test = X_test[0]
X_test.shape

TensorShape([418, 11])

In [262]:
test_ds = tf.data.Dataset.from_tensors(X_test)

In [263]:
preds = m.predict(test_ds)

In [264]:
preds

array([[ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
      