In [1]:
import pandas as pd
import numpy as np
import scipy
from sklearn import preprocessing
from torch.utils.data import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class TitanicDataset(Dataset):
    def __init__(self, csv_file, train=True):
        self.train = train
        
        df = pd.read_csv(csv_file, header=0)

        def proc_cabin(cabin_str):
            if pd.isna(cabin_str):
                return '?'
            return cabin_str[0]

        df['cabin_proc'] = df['Cabin'].apply(proc_cabin)        

        fare_mean = df['Fare'].median()

        def proc_fare(fare_val):
            if fare_val == 0:
                return fare_mean
            return fare_val

        df['fare_proc'] = df['Fare'].apply(proc_fare)
        
        age_mean = df['Age'].mean()
        df['age_proc'] = df[['Age']].fillna(age_mean)

        df2 = pd.concat([
            df,
            pd.get_dummies(df['Pclass'], prefix='pclass'), 
            pd.get_dummies(df['Sex']),
            pd.get_dummies(df['Embarked'], prefix='embarked'),
            pd.get_dummies(df['cabin_proc'], prefix='cabin'),
        ], axis='columns')

        features = [
            'age_proc', 'SibSp', 'Parch', 'pclass_1', 'pclass_2', 'pclass_3', 
            'female', 'male',
            'embarked_C', 'embarked_Q', 'embarked_S', 'fare_proc',
            'cabin_?', 'cabin_A', 'cabin_B', 'cabin_C', 'cabin_D', 'cabin_E', 'cabin_F', 'cabin_G'
        ]

        df3 = df2[features]
        
        df3_scaled = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(df3))
        tensor = torch.tensor(df3_scaled.values)
        
        self.tensor_inputs = tensor.float()
        self.tensor_ids = torch.tensor(df['PassengerId'].values)

        if train:
            df3_label = pd.get_dummies(df['Survived'], prefix='survived')
            self.tensor_labels = torch.tensor(df3_label.values).long()
        
    def __len__(self):
        return len(self.tensor_inputs)
    
    def __getitem__(self, idx):
        if self.train:
            return self.tensor_ids[idx], self.tensor_inputs[idx], self.tensor_labels[idx]
        else:
            return self.tensor_ids[idx], self.tensor_inputs[idx]

In [3]:
dataset = TitanicDataset('data/train.csv')
submit_dataset = TitanicDataset('data/test.csv', False)

feature_size = len(dataset[0][1])

batch_size = 25
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
submit_loader = torch.utils.data.DataLoader(submit_dataset, batch_size=batch_size, shuffle=False)

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [4]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(feature_size, 50)
        self.fc2 = nn.Linear(50, 2)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

In [5]:
device = torch.device("cuda")

net = Net()
net = net.to(device)
print(net)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters())

for epoch in range(1000):
    running_loss = 0.0
    records = 0
    for data in train_loader:

        _, inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        inputs = inputs[:,:,None]
        inputs = inputs.permute(0,2,1)

        optimizer.zero_grad()
        outputs = net(inputs)

        loss = criterion(outputs[:,0], torch.max(labels, 1)[1])
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        records += len(data)

    if epoch % 10 == 0:
        print("[%d] loss: %.3f" % (epoch + 1, running_loss / records))


Net(
  (fc1): Linear(in_features=20, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=2, bias=True)
)
[1] loss: 0.230
[11] loss: 0.168
[21] loss: 0.164
[31] loss: 0.161
[41] loss: 0.161
[51] loss: 0.159
[61] loss: 0.160
[71] loss: 0.159
[81] loss: 0.159
[91] loss: 0.158
[101] loss: 0.157
[111] loss: 0.157
[121] loss: 0.157
[131] loss: 0.157
[141] loss: 0.156
[151] loss: 0.156
[161] loss: 0.155
[171] loss: 0.154
[181] loss: 0.155
[191] loss: 0.155
[201] loss: 0.155
[211] loss: 0.155
[221] loss: 0.153
[231] loss: 0.153
[241] loss: 0.153
[251] loss: 0.153
[261] loss: 0.152
[271] loss: 0.153
[281] loss: 0.152
[291] loss: 0.152
[301] loss: 0.152
[311] loss: 0.153
[321] loss: 0.152
[331] loss: 0.152
[341] loss: 0.151
[351] loss: 0.151
[361] loss: 0.151
[371] loss: 0.150
[381] loss: 0.151
[391] loss: 0.151
[401] loss: 0.150
[411] loss: 0.150
[421] loss: 0.150
[431] loss: 0.150
[441] loss: 0.151
[451] loss: 0.150
[461] loss: 0.150
[471] loss: 0.150
[481] loss: 0.150
[49

In [6]:
def calculate_accuracy(dataloader):
    correct = 0
    total = 0

    with torch.no_grad():
        for data in dataloader:
            _, inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            inputs = inputs[:,:,None]
            inputs = inputs.permute(0,2,1)

            outputs = net(inputs)

            predicted = torch.max(outputs[:,0], 1)[1]
            expected = torch.max(labels, 1)[1]

            total += predicted.size(0)
            correct += (predicted == expected).sum().item()

    print(correct / total)

calculate_accuracy(train_loader)
calculate_accuracy(test_loader)

0.8623595505617978
0.8324022346368715


In [7]:
def get_predictions(dataloader):
    res = []
    with torch.no_grad():
        for data in dataloader:
            ids, inputs = data
            inputs = inputs.to(device)

            inputs = inputs[:,:,None]
            inputs = inputs.permute(0,2,1)

            outputs = net(inputs)

            predicted = torch.max(outputs[:,0], 1)[1].to('cpu')
            
            res += list(zip(ids.numpy(), predicted.numpy()))
    return res
            
res = get_predictions(submit_loader)
df_res = pd.DataFrame(res, columns=['PassengerId', 'Survived'])
df_res.to_csv(path_or_buf='data/predictions.csv', index=False)