In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import sklearn.preprocessing as skprep

import os.path as path

# Data Exploration

In [2]:
# Dataset directory under the current user's home folder ("~" is expanded here).
# NOTE(review): the next cell reassigns DATAROOT, so on a full top-to-bottom run this value is not used.
DATAROOT = path.expanduser("~/mldata/titanic")

In [3]:
# Windows location of the dataset. The separator is appended to the drive
# explicitly: joining a bare "D:" with further components yields the
# drive-relative path "D:mldata\titanic" (relative to the current directory on
# D:) rather than the absolute "D:\mldata\titanic".
DATAROOT = path.join("D:" + path.sep, "mldata", "titanic")

In [4]:
# Read the labelled training data, keyed by passenger id.
train_df = pd.read_csv(
    path.join(DATAROOT, "train.csv"),
    index_col="PassengerId",
)
train_df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Identify NA/NULL cells

First order of business: let's see how many NULL or NA values we have.

In [5]:
# Rows that have at least one missing value in any column.
train_df.loc[train_df.isna().any(axis=1)]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


Too many to just eyeball. Let me count the number of NAs in each column.

In [6]:
# Count the missing values per column in one vectorised pass, then print
# one "<column> <count>" line per column.
for colname, num_nas in train_df.isna().sum().items():
    print(colname, num_nas)

Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2


Clearly Cabin and Age are pretty much useless as features, so they have to go. Embarked has only 2 NAs; I can either keep those rows and replace the missing values with some placeholder string, or just delete the rows. I'll go with deleting the rows for now.

In [7]:
# Keep a handle on the untouched frame, then continue with every column except Age and Cabin.
train_0_df = train_df
train_df = train_0_df.loc[
    :,
    ["Survived", "Pclass", "Name", "Sex", "SibSp", "Parch", "Ticket", "Fare", "Embarked"],
]
train_df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C
3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,S
5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.05,S


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Embarked  889 non-null    object 
dtypes: float64(1), int64(4), object(4)
memory usage: 69.6+ KB


In [9]:
# Snapshot the previous stage, then drop the rows whose Embarked value is missing.
train_1_df = train_df
train_df = train_1_df.dropna(subset=["Embarked"])
train_df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C
3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,S
5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.05,S


In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 1 to 891
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Name      889 non-null    object 
 3   Sex       889 non-null    object 
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Ticket    889 non-null    object 
 7   Fare      889 non-null    float64
 8   Embarked  889 non-null    object 
dtypes: float64(1), int64(4), object(4)
memory usage: 69.5+ KB


## Identify High Cardinality Features

Not all high cardinality features are bad. I expect numerical features to have high cardinality. But if categorical features have high cardinality that deserves further digging.

In [11]:
# Number of distinct (non-null) values in each column.
train_df.nunique()

Survived      2
Pclass        3
Name        889
Sex           2
SibSp         7
Parch         7
Ticket      680
Fare        247
Embarked      3
dtype: int64

`Name` and `Ticket` are useless because of their high cardinality. Let's get rid of them.

In [12]:
# Snapshot the previous stage, then discard the two high-cardinality text columns.
train_2_df = train_df
train_df = train_2_df.drop(columns=["Name", "Ticket"])
train_df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,1,0,7.25,S
2,1,1,female,1,0,71.2833,C
3,1,3,female,0,0,7.925,S
4,1,1,female,1,0,53.1,S
5,0,3,male,0,0,8.05,S


## Identify Correlated Features

First let's correlate each input feature against the target `Survived`. Features with a high correlation are good because they have good predictive power. It is also worth knowing which features have little or no correlation; that knowledge can be useful later on.

In [13]:
# Number of passengers for every (Survived, Pclass) combination.
train_df.groupby(["Survived", "Pclass"]).size()

Survived  Pclass
0         1          80
          2          97
          3         372
1         1         134
          2          87
          3         119
dtype: int64

`Pclass` seems to have good predictive power, if somebody was travelling 3rd class they likely didn't survive, if they were travelling first class they likely did. Good feature to keep around.

In [14]:
# Number of passengers for every (Survived, Sex) combination.
train_df.groupby(["Survived", "Sex"]).size()

Survived  Sex   
0         female     81
          male      468
1         female    231
          male      109
dtype: int64

Another good feature to keep.

In [15]:
# Number of passengers for every (Survived, SibSp) combination; combinations with no rows are absent.
train_df.groupby(["Survived", "SibSp"]).size()

Survived  SibSp
0         0        398
          1         97
          2         15
          3         12
          4         15
          5          5
          8          7
1         0        208
          1        112
          2         13
          3          4
          4          3
dtype: int64

No correlation. But still lets keep this feature.

In [16]:
# Number of passengers for every (Survived, Parch) combination; combinations with no rows are absent.
train_df.groupby(["Survived", "Parch"]).size()

Survived  Parch
0         0        445
          1         53
          2         40
          3          2
          4          4
          5          4
          6          1
1         0        231
          1         65
          2         40
          3          3
          5          1
dtype: int64

Lets keep for now.

In [17]:
# Number of passengers for every (Survived, Embarked) combination.
train_df.groupby(["Survived", "Embarked"]).size()

Survived  Embarked
0         C            75
          Q            47
          S           427
1         C            93
          Q            30
          S           217
dtype: int64

Lets keep.

In [18]:
# Mean and (sample) standard deviation of Fare per survival outcome.
# Aggregations are named by string: passing the NumPy callables np.mean/np.std
# to agg is deprecated in recent pandas, and np.std's own default (ddof=0)
# differs from the pandas "std" (ddof=1) that is actually applied today.
train_df.groupby("Survived").agg({"Fare": ["mean", "std"]})

Unnamed: 0_level_0,Fare,Fare
Unnamed: 0_level_1,mean,std
Survived,Unnamed: 1_level_2,Unnamed: 2_level_2
0,22.117887,31.388207
1,48.209498,66.748773


Good correlation: people who paid a lower fare had a lower chance of surviving.

Now let's find features that are correlated with each other. In most cases I'd want to keep only one of two correlated features.

In [19]:
# Mean and (sample) standard deviation of Fare per passenger class.
# Aggregations are named by string: passing the NumPy callables np.mean/np.std
# to agg is deprecated in recent pandas, and np.std's own default (ddof=0)
# differs from the pandas "std" (ddof=1) that is actually applied today.
train_df.groupby("Pclass").agg({"Fare": ["mean", "std"]})

Unnamed: 0_level_0,Fare,Fare
Unnamed: 0_level_1,mean,std
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2
1,84.193516,78.746457
2,20.662183,13.417399
3,13.67555,11.778142


Nothing really special jumps out. Let's keep the dataframe as is. No changes.

In [20]:
train_df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,1,0,7.25,S
2,1,1,female,1,0,71.2833,C
3,1,3,female,0,0,7.925,S
4,1,1,female,1,0,53.1,S
5,0,3,male,0,0,8.05,S


# Feature Engineering

Need to do three things:

  1. Binarize Sex into 0/1.
  2. Convert Embarked into one-hot vector
  3. Normalize Fare

In [21]:
# fit() returns the encoder itself, so construction and fitting can be chained.
sex_enc = skprep.LabelEncoder().fit(train_df["Sex"])
sex_enc.classes_

array(['female', 'male'], dtype=object)

In [22]:
# Map each Sex label to its integer class index.
encoded_sex = sex_enc.transform(train_df["Sex"])
encoded_sex[0:5]

array([1, 0, 0, 0, 1])

In [23]:
# fit() returns the binarizer itself, so construction and fitting can be chained.
embarked_enc = skprep.LabelBinarizer().fit(train_df["Embarked"])
embarked_enc.classes_

array(['C', 'Q', 'S'], dtype='<U1')

In [24]:
# One indicator column per port of embarkation.
encoded_embarked = embarked_enc.transform(train_df["Embarked"])
encoded_embarked[0:5]

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1]])

In [25]:
# Training-set statistics of Fare; the same two numbers are reused to normalize the test set.
fare_mean, fare_std = train_df["Fare"].mean(), train_df["Fare"].std()
print(f"Mean = {fare_mean:.3f}, Std = {fare_std:.3f}")

Mean = 32.097, Std = 49.698


In [26]:
# Standardize Fare: subtract the mean, divide by the standard deviation.
centered_fare = train_df["Fare"] - fare_mean
norm_fare = (centered_fare / fare_std).values
norm_fare[:5]

array([-0.49995832,  0.78850276, -0.48637615,  0.42262322, -0.48386093])

In [27]:
# Sanity check: the mean should be ~0 and the (population) std ~1.
print(norm_fare.mean(), norm_fare.std())

1.3587431392487855e-16 0.9994374120511119


Now lets create the final X and y datasets.

In [28]:
# Target vector: the Survived column as a NumPy array.
y = train_df["Survived"].values
y[0:5]

array([0, 1, 1, 1, 0], dtype=int64)

In [29]:
# Start the feature matrix from the columns that need no encoding.
X = train_df.loc[:, ["Pclass", "SibSp", "Parch"]]
X.iloc[:5]

Unnamed: 0_level_0,Pclass,SibSp,Parch
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3,1,0
2,1,1,0
3,3,0,0
4,1,1,0
5,3,0,0


Before I stack `encoded_sex` to `X`, I need to convert it to a column vector.

In [30]:
# Turn the 1-D label array into an (n, 1) column so it can be stacked.
encoded_sex = encoded_sex.reshape(-1, 1)
# Build from train_df rather than from the previous X: appending to X itself
# would add another Sex column every time this cell is re-run.
X = np.concatenate((train_df[["Pclass", "SibSp", "Parch"]], encoded_sex), axis=1)
X[:5]

array([[3, 1, 0, 1],
       [1, 1, 0, 0],
       [3, 0, 0, 0],
       [1, 1, 0, 0],
       [3, 0, 0, 1]], dtype=int64)

In [31]:
train_df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,1,0,7.25,S
2,1,1,female,1,0,71.2833,C
3,1,3,female,0,0,7.925,S
4,1,1,female,1,0,53.1,S
5,0,3,male,0,0,8.05,S


`encoded_embarked` is of the right dims so lets stack it.

In [32]:
# Stack the Embarked indicators onto the first four columns only
# (Pclass, SibSp, Parch, Sex) so that re-running this cell replaces the
# Embarked columns instead of appending a second copy.
X = np.concatenate((X[:, :4], encoded_embarked), axis=1)
X[:5]

array([[3, 1, 0, 1, 0, 0, 1],
       [1, 1, 0, 0, 1, 0, 0],
       [3, 0, 0, 0, 0, 0, 1],
       [1, 1, 0, 0, 0, 0, 1],
       [3, 0, 0, 1, 0, 0, 1]], dtype=int64)

`norm_fare` also needs to be converted to a column vector.

In [33]:
# Turn the 1-D fare array into an (n, 1) column so it can be stacked.
norm_fare = norm_fare.reshape(-1, 1)
# Stack onto the first seven columns only (Pclass, SibSp, Parch, Sex and the
# three Embarked indicators) so that re-running this cell replaces the fare
# column instead of appending a second copy.
X = np.concatenate((X[:, :7], norm_fare), axis=1)
X[:5]

array([[ 3.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        , -0.49995832],
       [ 1.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.78850276],
       [ 3.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        , -0.48637615],
       [ 1.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.42262322],
       [ 3.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        , -0.48386093]])

In [34]:
X.shape

(889, 8)

# Create Datasets

In [35]:
from torch.utils.data import TensorDataset, random_split
import torch as t

In [36]:
# Pair features and targets as float32 tensors (BCELoss needs float targets).
dataset = TensorDataset(
    t.from_numpy(X).float(),
    t.from_numpy(y).float(),
)
dataset[:3]

(tensor([[ 3.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000, -0.5000],
         [ 1.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.7885],
         [ 3.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000, -0.4864]]),
 tensor([0., 1., 1.]))

In [37]:
# Fixed seed so the 90/10 train/validation split is the same on every run;
# without it each kernel restart validates on different passengers and the
# runs are not comparable.
SPLIT_SEED = 42
train_size = int(len(dataset) * 0.9)
val_size = len(dataset) - train_size
trainset, valset = random_split(
    dataset,
    (train_size, val_size),
    generator=t.Generator().manual_seed(SPLIT_SEED),
)
print(len(trainset), len(valset))

800 89


# Train Model

In [38]:
from haikunator import Haikunator
from dataclasses import dataclass

from torch.utils.data import DataLoader
import torch.nn.functional as F

from sklearn.metrics import accuracy_score

import torchutils as tu
from torchutils.ml_loggers.csv_logger import CsvMLExperiment
from torchutils.ml_loggers.stdout_logger import StdoutMLExperiment
from torchutils.visualizers.csv_metrics_visualizer import compare, analyze

In [39]:
h = Haikunator()

In [40]:
class BaselineClassifier(t.nn.Module):
    """Logistic-regression baseline: a single linear layer followed by a sigmoid.

    Args:
        n_features: Number of columns in each input row. Defaults to 8, the
            width of the feature matrix built in this notebook.
    """

    def __init__(self, n_features: int = 8):
        super().__init__()
        self.fc1 = t.nn.Linear(n_features, 1)

    def forward(self, batch_x):
        """Map a (batch, n_features) tensor to a (batch,) tensor of sigmoid outputs."""
        x1 = t.sigmoid(self.fc1(batch_x))
        # Drop the trailing singleton dimension so the output lines up with 1-D targets.
        batch_y_hat = t.squeeze(x1, dim=1)
        return batch_y_hat

### Wire-Check

In [41]:
# Take the first five training examples and a throwaway, untrained model to
# check that tensor shapes and dtypes fit together before real training.
batch_X, batch_y = trainset[:5]
tmp_model = BaselineClassifier()

In [42]:
batch_y_hat = tmp_model(batch_X)

In [43]:
print(batch_y_hat.dtype, batch_y_hat.shape)
batch_y_hat

torch.float32 torch.Size([5])


tensor([0.4426, 0.3527, 0.4412, 0.5589, 0.3522], grad_fn=<SqueezeBackward1>)

In [44]:
print(batch_y.dtype, batch_y.shape)
batch_y

torch.float32 torch.Size([5])


tensor([0., 0., 1., 1., 0.])

In [45]:
# Functional form of binary cross-entropy (mean-reduced), same as the BCELoss module.
F.binary_cross_entropy(batch_y_hat, batch_y)

tensor(0.5707, grad_fn=<BinaryCrossEntropyBackward>)

### Define Utility Functions

In [46]:
def build_accuracy(cutoff: float):
    """Create an accuracy metric that thresholds model outputs at `cutoff`.

    Args:
        cutoff: Outputs strictly greater than this value are counted as class 1.

    Returns:
        A function `accuracy(y_true, y_hat)` returning the fraction of
        predictions that match `y_true`, as computed by sklearn's
        `accuracy_score`.
    """
    def accuracy(y_true, y_hat):
        # accuracy_score works on host arrays, so detach from the autograd
        # graph and copy to the CPU first; both calls are no-ops for plain
        # CPU tensors.
        y_pred = (y_hat > cutoff).to(t.float32).detach().cpu()
        if isinstance(y_true, t.Tensor):
            y_true = y_true.detach().cpu()
        return accuracy_score(y_true, y_pred)

    return accuracy

In [47]:
# Registry mapping a hyperparameter `model_type` string to the model class
# that build_trainer instantiates (with no arguments).
model_factory = {
    "baseline": BaselineClassifier
}

In [48]:
@dataclass
class MyHyperparams(tu.Hyperparams):
    """Hyperparameters for one training run, read by build_trainer."""
    batch_size: int  # mini-batch size of the training DataLoader
    n_epochs: int  # forwarded to tu.TrainerArgs(n_epochs=...)
    lr: float  # learning rate of the Adam optimizer
    model_type: str  # key into model_factory selecting the model class

In [49]:
def build_trainer(hparams, trainset, valset):
    """Assemble everything needed for one training run.

    Args:
        hparams: Object exposing `model_type`, `lr`, `batch_size` and `n_epochs`.
        trainset: Dataset used for optimisation; shuffled every epoch.
        valset: Dataset used for validation; evaluated as a single batch.

    Returns:
        A `tu.TrainerArgs` bundling a freshly generated run name, a new model,
        its Adam optimizer, the BCE loss, both data loaders and the epoch count.
    """
    run_name = h.haikunate()
    print(f"Starting run {run_name}")
    model = model_factory[hparams.model_type]()
    optim = t.optim.Adam(model.parameters(), lr=hparams.lr)
    loss_fn = t.nn.BCELoss()
    traindl = DataLoader(trainset, batch_size=hparams.batch_size, shuffle=True)
    # Size the single validation batch from the data rather than a hard-coded
    # 89, so a different split or dataset is still evaluated in one batch.
    # max(..., 1) keeps DataLoader's batch_size valid for an empty set.
    valdl = DataLoader(valset, batch_size=max(len(valset), 1))
    return tu.TrainerArgs(
        run_name=run_name,
        model=model,
        optim=optim,
        loss_fn=loss_fn,
        trainloader=traindl,
        valloader=valdl,
        n_epochs=hparams.n_epochs,
    )

In [50]:
accuracy = build_accuracy(0.5)

### Train Baseline Model

In [52]:
# Root directory for experiment logs. "~" is expanded here because a literal
# "~" path component is not resolved by the file system itself.
# NOTE(review): it is not visible here whether CsvMLExperiment expands "~" on
# its own; expanding an already-expanded path is a no-op, so this is safe
# either way.
EXPROOT = path.expanduser(path.join("~", "temp", "experiments"))
exp = CsvMLExperiment("titanic-exp-1", EXPROOT, stdout=False)
trainer = tu.Trainer(exp, trainset, valset, [accuracy])
# Log metrics at the most frequent setting used in this notebook.
trainer.metrics_log_frequency = 1

In [53]:
# Train the logistic-regression baseline and show its final metrics.
hparams = MyHyperparams(
    model_type="baseline",
    batch_size=16,
    n_epochs=20,
    lr=0.005,
)
trainer.train(hparams, build_trainer)
trainer.final_metrics

Starting run shiny-mode-2553


RuntimeError: All input tensors must be on the same device. Received cpu and cuda:0

In [None]:
# Derive the experiment directory from EXPROOT instead of a machine-specific
# absolute path, so this cell follows wherever the experiment was written.
# NOTE(review): run names are generated randomly by build_trainer
# (h.haikunate()), so run_name must be set to the name printed by the
# training cell of the run you want to inspect.
analyze(
    exproot=path.join(path.expanduser(EXPROOT), "titanic-exp-1"),
    run_name="soft-smoke-9636",
)

## More Sophisticated Models

In [None]:
class Perceptron(t.nn.Module):
    """One-hidden-layer classifier: Linear -> ReLU -> Linear -> sigmoid.

    Args:
        n_features: Number of columns in each input row. Defaults to 8, the
            width of the feature matrix built in this notebook.
        n_hidden: Number of hidden units. Defaults to 4.
    """

    def __init__(self, n_features: int = 8, n_hidden: int = 4):
        super().__init__()
        self.fc1 = t.nn.Linear(n_features, n_hidden)
        self.fc2 = t.nn.Linear(n_hidden, 1)

    def forward(self, batch_x):
        """Map a (batch, n_features) tensor to a (batch,) tensor of sigmoid outputs."""
        x1 = t.relu(self.fc1(batch_x))
        x2 = t.sigmoid(self.fc2(x1))
        # Drop the trailing singleton dimension so the output lines up with 1-D targets.
        batch_y_hat = t.squeeze(x2, dim=1)
        return batch_y_hat

In [None]:
model_factory["perceptron"] = Perceptron

In [None]:
# Reuse the EXPROOT constant defined for the baseline run instead of repeating
# the directory as a string literal, so all runs land under one configurable root.
exp = CsvMLExperiment("titanic-exp-1", EXPROOT, stdout=False)
trainer = tu.Trainer(exp, trainset, valset, [accuracy])
trainer.metrics_log_frequency = 1

In [None]:
# Train the small one-hidden-layer model and show its final metrics.
hparams = MyHyperparams(
    model_type="perceptron",
    batch_size=16,
    n_epochs=20,
    lr=0.005,
)
trainer.train(hparams, build_trainer)
trainer.final_metrics

In [None]:
# Derive the experiment directory from EXPROOT instead of a machine-specific
# absolute path.
# NOTE(review): run names are generated randomly by build_trainer
# (h.haikunate()), so run_name must be set to the name printed by the
# training cell of the run you want to inspect.
analyze(
    exproot=path.join(path.expanduser(EXPROOT), "titanic-exp-1"),
    run_name="fancy-surf-3501",
)

In [None]:
class WidePerceptron(t.nn.Module):
    """One-hidden-layer classifier with 100 hidden units: 8 -> 100 -> 1."""

    def __init__(self):
        super().__init__()
        # Layers are created input-to-output, which also fixes the order in
        # which their random initialisation is drawn.
        self.fc1 = t.nn.Linear(in_features=8, out_features=100)
        self.fc2 = t.nn.Linear(in_features=100, out_features=1)

    def forward(self, batch_x):
        """Map a (batch, 8) tensor to a (batch,) tensor of sigmoid outputs."""
        hidden = self.fc1(batch_x).relu()
        probs = self.fc2(hidden).sigmoid()
        # Remove the trailing singleton dimension left by the final layer.
        return probs.squeeze(dim=1)
    
model_factory["wide-perceptron"] = WidePerceptron

In [None]:
# Reuse the EXPROOT constant defined for the baseline run instead of repeating
# the directory as a string literal, so all runs land under one configurable root.
exp = CsvMLExperiment("titanic-exp-1", EXPROOT, stdout=False)
trainer = tu.Trainer(exp, trainset, valset, [accuracy])
trainer.metrics_log_frequency = 1

In [None]:
# Train the wide one-hidden-layer model and show its final metrics.
hparams = MyHyperparams(
    model_type="wide-perceptron",
    batch_size=16,
    n_epochs=20,
    lr=0.005,
)
trainer.train(hparams, build_trainer)
trainer.final_metrics

In [None]:
# Derive the experiment directory from EXPROOT instead of a machine-specific
# absolute path.
# NOTE(review): run names are generated randomly by build_trainer
# (h.haikunate()), so run_name must be set to the name printed by the
# training cell of the run you want to inspect.
analyze(
    exproot=path.join(path.expanduser(EXPROOT), "titanic-exp-1"),
    run_name="bold-frost-1573",
)

In [None]:
# Compare the runs logged under the experiment directory, located via EXPROOT
# rather than a machine-specific absolute path.
compare(path.join(path.expanduser(EXPROOT), "titanic-exp-1"))

In [None]:
class MultiLayerPerceptron(t.nn.Module):
    """Two-hidden-layer classifier: 8 -> 100 -> 50 -> 1 with ReLU activations."""

    def __init__(self):
        super().__init__()
        # Layers are created input-to-output, which also fixes the order in
        # which their random initialisation is drawn.
        self.fc1 = t.nn.Linear(in_features=8, out_features=100)
        self.fc2 = t.nn.Linear(in_features=100, out_features=50)
        self.fc3 = t.nn.Linear(in_features=50, out_features=1)

    def forward(self, batch_x):
        """Map a (batch, 8) tensor to a (batch,) tensor of sigmoid outputs."""
        hidden = self.fc1(batch_x).relu()
        hidden = self.fc2(hidden).relu()
        probs = self.fc3(hidden).sigmoid()
        # Remove the trailing singleton dimension left by the final layer.
        return probs.squeeze(dim=1)
    
model_factory["mlp"] = MultiLayerPerceptron

In [None]:
# Reuse the EXPROOT constant defined for the baseline run instead of repeating
# the directory as a string literal, so all runs land under one configurable root.
exp = CsvMLExperiment("titanic-exp-1", EXPROOT, stdout=False)
trainer = tu.Trainer(exp, trainset, valset, [accuracy])
trainer.metrics_log_frequency = 1

In [None]:
# Train the two-hidden-layer model and show its final metrics.
hparams = MyHyperparams(
    model_type="mlp",
    batch_size=16,
    n_epochs=20,
    lr=0.005,
)
trainer.train(hparams, build_trainer)
trainer.final_metrics

In [None]:
# Derive the experiment directory from EXPROOT instead of a machine-specific
# absolute path.
# NOTE(review): run names are generated randomly by build_trainer
# (h.haikunate()), so run_name must be set to the name printed by the
# training cell of the run you want to inspect.
analyze(
    exproot=path.join(path.expanduser(EXPROOT), "titanic-exp-1"),
    run_name="autumn-dust-9186",
)

# Evaluate

In [None]:
# Read the unlabelled test data, keyed by passenger id.
test_df = pd.read_csv(
    path.join(DATAROOT, "test.csv"),
    index_col="PassengerId",
)
test_df.head()

In [None]:
# Drop the unused columns by producing a new frame. The previous in-place
# pop() inside a list comprehension built a throwaway list purely for its
# side effect and raised KeyError when the cell was run a second time;
# errors="ignore" keeps the cell re-runnable.
test_df = test_df.drop(columns=["Name", "Age", "Cabin"], errors="ignore")
test_df.head()

In [None]:
# Encode Sex with the encoder that was fitted on the training data.
encoded_sex = sex_enc.transform(test_df["Sex"])
encoded_sex[0:5]

In [None]:
# Binarize Embarked with the binarizer that was fitted on the training data.
encoded_embarked = embarked_enc.transform(test_df["Embarked"])
encoded_embarked[0:5]

In [None]:
# Normalize with the training-set statistics. Missing fares are first replaced
# by the training mean (which normalizes to 0); otherwise a NaN would flow
# into the feature matrix and the model would output NaN for that passenger.
# NOTE(review): the training Fare column had no NAs, but the test file is not
# checked anywhere above - confirm whether it contains missing fares.
norm_fare = ((test_df.Fare.fillna(fare_mean) - fare_mean) / fare_std).values
norm_fare[:5]

In [None]:
# Start the test feature matrix from the columns that need no encoding.
test_X = test_df.loc[:, ["Pclass", "SibSp", "Parch"]]
test_X.iloc[:5]

In [None]:
# Column vectors for the two 1-D features.
encoded_sex = encoded_sex.reshape(-1, 1)
norm_fare = norm_fare.reshape(-1, 1)

# Assemble the matrix in one call, in the same column order as the training X
# (Pclass, SibSp, Parch, Sex, Embarked indicators, Fare). Building from
# test_df rather than appending to test_X keeps the cell re-runnable without
# duplicating columns.
test_X = np.concatenate(
    (
        test_df[["Pclass", "SibSp", "Parch"]],
        encoded_sex,
        encoded_embarked,
        norm_fare,
    ),
    axis=1,
)

test_X[:5]

In [None]:
# float32 tensor view of the test features, matching the dtype used for training.
testset = t.from_numpy(test_X).float()

## Train Full Dataset
Using the best hparams identified in the training phase, let's train using all the data.

In [None]:
# Final fit on the complete labelled dataset, logging to stdout only.
# NOTE(review): the experiment is labelled "perceptron_submission" but the
# model trained below is "mlp" - confirm which one is intended.
exp = StdoutMLExperiment("perceptron_submission")
# NOTE(review): valset was split out of `dataset`, so every validation example
# is also a training example here; validation metrics from this run do not
# measure generalization.
trainer = tu.Trainer(exp, dataset, valset, [accuracy])
# A very large value; presumably meant to effectively silence per-step metric logging - TODO confirm.
trainer.metrics_log_frequency = 100000
hparams = MyHyperparams(batch_size=16, n_epochs=20, lr=0.005, model_type="mlp")
trainer.train(hparams, build_trainer)
trainer.final_metrics

In [None]:
# Serialize the whole trained model object to the current working directory.
# NOTE(review): this saves the module object itself rather than its
# state_dict, so loading the file presumably needs the model class definition
# to be available - confirm this is intended.
t.save(trainer.model, "./mlp.pkl")

## Generate Submission

In [None]:
# Run inference in eval mode and without building an autograd graph.
final_model = trainer.model
final_model.eval()
# Feed the inputs on whatever device the trained model lives on.
model_device = next(final_model.parameters()).device
with t.no_grad():
    test_y_hat = final_model(testset.to(model_device))
# Threshold at 0.5 and convert to a host NumPy array, so the DataFrame column
# holds plain integers rather than torch objects.
test_y_pred = (test_y_hat > 0.5).to(t.int64).cpu().numpy()
test_df["Survived"] = test_y_pred

In [None]:
# Write the submission (PassengerId index + Survived) next to the input data,
# using DATAROOT instead of a machine-specific absolute path.
test_df[["Survived"]].to_csv(path.join(DATAROOT, "mlp_submission.csv"))

## Results

  * Baseline submission: 0.76555
  * Perceptron submission: 0.75837
  * Wide Perceptron submission: 0.77511
  * MLP: 0.76315