In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

# Apply seaborn's "whitegrid" style to subsequent plots.
sns.set(style="whitegrid")
# Render matplotlib figures inline in the notebook output
# (recent Jupyter/IPython versions do this by default, so the magic is harmless but optional).
%matplotlib inline

In [4]:
# Read the labelled training split and the unlabelled test split in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/titanic/train.csv', '../data/titanic/test.csv')
)
(train.shape, test.shape)

((891, 12), (418, 11))

In [5]:
# Peek at the first two rows to check the column layout.
train.iloc[:2]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


#### Split data

In [6]:
from sklearn.model_selection import train_test_split

label_name = "Survived"

# Separate the feature columns from the target before splitting.
features_raw = train.drop(columns=label_name)
labels_raw = train[label_name]

# Hold out 33% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train_raw, X_valid_raw, y_train_raw, y_valid_raw = train_test_split(
    features_raw,
    labels_raw,
    test_size=0.33,
    random_state=42,
)

In [7]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every raw column. The encoder is fitted on the training split only;
# handle_unknown='ignore' encodes categories not seen during fit as all-zero columns,
# so the validation and test splits can be transformed without errors.
# NOTE(review): this treats *all* columns as categorical, including identifiers
# (PassengerId, Name, Ticket) and numeric fields (Age, Fare), which yields 2080 features
# for 596 training rows. Consider encoding only real categoricals and scaling numerics.
ohe = OneHotEncoder(handle_unknown='ignore')
X_train = ohe.fit_transform(X_train_raw).toarray()
X_valid = ohe.transform(X_valid_raw).toarray()
X_test  = ohe.transform(test).toarray()

# Convert to tensors as float32 right away: the dense arrays are float64, and nn.Linear
# works in float32, so building float64 tensors would double memory and force a full
# copy-cast (`.float()`) on every forward pass. Existing `.float()` calls remain valid.
X_train = torch.tensor(X_train, dtype=torch.float32)
X_valid = torch.tensor(X_valid, dtype=torch.float32)
X_test  = torch.tensor(X_test, dtype=torch.float32)

X_train.shape

torch.Size([596, 2080])

In [9]:
# Labels: 1-D integer Series -> float32 column tensor of shape (n, 1),
# matching the single-output layout produced by the model.
y_train = torch.tensor(y_train_raw.to_numpy(), dtype=torch.float32).reshape(-1, 1)
y_valid = torch.tensor(y_valid_raw.to_numpy(), dtype=torch.float32).reshape(-1, 1)
print(y_train.shape, y_valid.shape)
y_train[:5]

torch.Size([596, 1]) torch.Size([295, 1])


tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.]])

In [10]:
# Confirm the label tensor is a column vector.
y_train.size()

torch.Size([596, 1])

In [11]:
# Model
class LogisticRegression(nn.Module):
    """Binary logistic regression: one linear layer followed by a sigmoid.

    Args:
        input_size: number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        # A single affine map from the feature vector to one logit.
        self.linear = nn.Linear(input_size, 1)
        # Squashes the logit into a probability in (0, 1).
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for each row of ``x``."""
        return self.sigmoid(self.linear(x))
    
# Seed PyTorch's global RNG before building the model: nn.Linear draws its initial
# weights at random, so without a seed every kernel restart trains a different model.
# 42 mirrors the random_state already used for the train/validation split.
torch.manual_seed(42)

# One input per encoded feature column.
input_size = X_train.shape[1]
model = LogisticRegression(input_size)

# The model outputs probabilities (sigmoid applied in forward), so plain BCELoss is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [13]:
# Train with full-batch gradient descent: each epoch is a single optimizer step
# computed over the entire training set.
num_epochs  = 1000

# The float32 cast does not change between epochs, so do it once here instead of
# re-copying the whole feature matrix on every iteration.
X_train_f32 = X_train.float()

for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_f32)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # Report every 50 epochs; the loss shown is the one computed before this epoch's step.
    if (epoch+1) % 50 == 0:
        print(f"Epoch[ {epoch + 1} / {num_epochs} ], Loss: {loss.item()}")
    

Epoch[ 50 / 1000 ], Loss: 0.15321438014507294
Epoch[ 100 / 1000 ], Loss: 0.06772556900978088
Epoch[ 150 / 1000 ], Loss: 0.039705827832221985
Epoch[ 200 / 1000 ], Loss: 0.026668325066566467
Epoch[ 250 / 1000 ], Loss: 0.01940826326608658
Epoch[ 300 / 1000 ], Loss: 0.014892512932419777
Epoch[ 350 / 1000 ], Loss: 0.011864231899380684
Epoch[ 400 / 1000 ], Loss: 0.009719640016555786
Epoch[ 450 / 1000 ], Loss: 0.008136752992868423
Epoch[ 500 / 1000 ], Loss: 0.00692994799464941
Epoch[ 550 / 1000 ], Loss: 0.00598552031442523
Epoch[ 600 / 1000 ], Loss: 0.005230376962572336
Epoch[ 650 / 1000 ], Loss: 0.004615624435245991
Epoch[ 700 / 1000 ], Loss: 0.004107463639229536
Epoch[ 750 / 1000 ], Loss: 0.003681859700009227
Epoch[ 800 / 1000 ], Loss: 0.003321309108287096
Epoch[ 850 / 1000 ], Loss: 0.003012800822034478
Epoch[ 900 / 1000 ], Loss: 0.0027464739978313446
Epoch[ 950 / 1000 ], Loss: 0.0025147476699203253
Epoch[ 1000 / 1000 ], Loss: 0.002311699790880084


In [14]:
# Validation: predict probabilities without tracking gradients,
# threshold them at 0.5, and report the fraction of correct labels.
with torch.no_grad():
    outputs = model(X_valid.float())

y_valid_predict = (outputs >= 0.5).float().squeeze()

(y_valid_predict == y_valid.squeeze()).numpy().mean()

0.7796610169491526

In [15]:
# Test-set prediction: same 0.5 threshold as validation,
# returned as a flat float32 NumPy array of 0./1. labels.
with torch.no_grad():
    outputs = model(X_test.float())

y_predict = (outputs >= 0.5).float().squeeze().numpy()

y_predict[:10]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.], dtype=float32)

In [17]:
# How many test rows were assigned to each predicted class.
prediction_counts = pd.Series(y_predict).value_counts()
prediction_counts

0.0    271
1.0    147
Name: count, dtype: int64

In [18]:
# Load the sample submission file; its Survived column is overwritten later.
submission_template_path = "../data/titanic/gender_submission.csv"
submit = pd.read_csv(submission_template_path)
submit.head(2)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1


In [19]:
# Replace the template's Survived column with the model's predictions as integers.
# NOTE(review): this relies on `submit` rows being in the same order as `test` — confirm.
submit = submit.assign(Survived=y_predict.astype(int))

In [21]:
# Write the submission without the index column, then read it back to eyeball the result.
submission_path = "../submissions/titanic_submission.csv"
submit.to_csv(submission_path, index=False)
pd.read_csv(submission_path).head(2)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
