In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from pathlib import Path

In [2]:
data_path = Path('data')

train_path = data_path / 'train.csv'
test_path = data_path / 'test.csv'

In [3]:
train_data = pd.read_csv(str(train_path))
test_data = pd.read_csv(str(test_path))

In [4]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
y = train_data['Survived']
train_data = train_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'], axis=1)

In [6]:
train_data.isnull().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [7]:
# Fill missing age values with average of all ages

age_average = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(age_average)

In [8]:
# Fill missing embarked with most common values

most_common = train_data['Embarked'].value_counts().index[0]
train_data['Embarked'] = train_data['Embarked'].fillna(most_common)

In [9]:
train_data.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [10]:
train_data['Sex'] = train_data['Sex'].map({'male':0, 'female':1})

# Normalize numeric columns

col_data = train_data['Fare']
train_data['Fare'] = (col_data - col_data.min())/(col_data.max() - col_data.min())

col_data = train_data['Age']
train_data['Age'] = (col_data - col_data.min())/(col_data.max() - col_data.min())

col_data = train_data['Parch']
train_data['Parch'] = (col_data - col_data.min())/(col_data.max() - col_data.min())

col_data = train_data['SibSp']
train_data['SibSp'] = (col_data - col_data.min())/(col_data.max() - col_data.min())

In [11]:
train_data = pd.get_dummies(train_data, columns=['Embarked'], prefix='emb_')
train_data = pd.get_dummies(train_data, columns=['Pclass'], prefix='pclass_')

In [12]:
train_data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,emb__C,emb__Q,emb__S,pclass__1,pclass__2,pclass__3
0,0,0.271174,0.125,0.0,0.014151,0,0,1,0,0,1
1,1,0.472229,0.125,0.0,0.139136,1,0,0,1,0,0
2,1,0.321438,0.0,0.0,0.015469,0,0,1,0,0,1
3,1,0.434531,0.125,0.0,0.103644,0,0,1,1,0,0
4,0,0.434531,0.0,0.0,0.015713,0,0,1,0,0,1


In [13]:
# Train NN
import torch
from torch import nn


class Model(nn.Module):
    def __init__(self, input_shape):
        super(Model, self).__init__()
        self.linear1 = nn.Linear(input_shape, 7)
        self.linear2 = nn.Linear(7, 3)
        self.linear3 = nn.Linear(3, 1)
        
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        
        x = self.linear2(x)
        x = self.relu(x)
        
        x = self.linear3(x)        
        x = self.sigmoid(x)
        
        return x
    

In [14]:
input_data = torch.Tensor(train_data.to_numpy())
input_y = torch.Tensor(y.to_numpy())
input_y = input_y.unsqueeze(1)

In [15]:
loss_func = nn.BCELoss()
model = Model(input_data.shape[1])

In [16]:
def predict(data_input, training=False):
    if training:
        model.train()
    else:
        model.eval()
    y_pred = model(data_input)
    if not training:
        y_pred = (y_pred >= 0.5).float()
        
    return y_pred

In [17]:
def validate(data_input, truth_y):
    y_pred = predict(data_input, training=False)
    total = data_input.shape[0]
    correct = (y_pred == truth_y).sum().item()
    acc = correct / total
    return acc

In [18]:
optimizer = torch.optim.SGD(model.parameters(), lr=.1)

In [19]:
EPOCHS = 1000000
for epoch in range(1, EPOCHS+1):
    model.train()
    y_pred = predict(input_data, training=True)
    loss = loss_func(y_pred, input_y)
    acc = validate(input_data, input_y)
    if epoch % (EPOCHS // 10) == 0:
        print(f'Epoch: {epoch}, Loss: {loss.item():5f}, Acc: {acc:5f}')
        
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch: 100000, Loss: 0.362100, Acc: 0.846240
Epoch: 200000, Loss: 0.338527, Acc: 0.864198
Epoch: 300000, Loss: 0.339466, Acc: 0.864198
Epoch: 400000, Loss: 0.337498, Acc: 0.865320
Epoch: 500000, Loss: 0.338632, Acc: 0.861953
Epoch: 600000, Loss: 0.335911, Acc: 0.863075
Epoch: 700000, Loss: 0.336057, Acc: 0.863075
Epoch: 800000, Loss: 0.337047, Acc: 0.861953
Epoch: 900000, Loss: 0.336098, Acc: 0.861953
Epoch: 1000000, Loss: 0.335809, Acc: 0.861953


In [29]:
test_data = pd.read_csv(str(test_path))
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [30]:
test_passenger_id = test_data['PassengerId']
test_data = test_data.drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'], axis=1, errors='ignore')
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())

In [31]:
# Preprocess

test_data['Sex'] = test_data['Sex'].map({'male':0, 'female':1})

# Normalize numeric columns

col_data = test_data['Fare']
test_data['Fare'] = (col_data - col_data.min())/(col_data.max() - col_data.min())

col_data = test_data['Age']
test_data['Age'] = (col_data - col_data.min())/(col_data.max() - col_data.min())

col_data = test_data['Parch']
test_data['Parch'] = (col_data - col_data.min())/(col_data.max() - col_data.min())

col_data = test_data['SibSp']
test_data['SibSp'] = (col_data - col_data.min())/(col_data.max() - col_data.min())

# Encode to one-hot
test_data = pd.get_dummies(test_data, columns=['Embarked'], prefix='emb_')
test_data = pd.get_dummies(test_data, columns=['Pclass'], prefix='pclass_')

In [32]:
test_data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,emb__C,emb__Q,emb__S,pclass__1,pclass__2,pclass__3
0,0,0.452723,0.0,0.0,0.015282,0,1,0,0,0,1
1,1,0.617566,0.125,0.0,0.013663,0,0,1,0,0,1
2,0,0.815377,0.0,0.0,0.018909,0,1,0,0,1,0
3,0,0.353818,0.0,0.0,0.016908,0,0,1,0,0,1
4,1,0.287881,0.125,0.111111,0.023984,0,0,1,0,0,1


In [33]:
test_input_x = torch.Tensor(test_data.to_numpy())

In [34]:
test_data.isnull().sum()

Sex          0
Age          0
SibSp        0
Parch        0
Fare         0
emb__C       0
emb__Q       0
emb__S       0
pclass__1    0
pclass__2    0
pclass__3    0
dtype: int64

In [42]:
results = predict(test_input_x).squeeze()

In [43]:
results_df = pd.DataFrame({'PassengerId':test_passenger_id, 'Survived':list(results.numpy().astype(np.int32))})
results_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [44]:
results_df.to_csv('output.csv', index=False)