In [1]:
import torch
from torch.utils.data import Dataset
import wandb
import pandas as pd

In [2]:
#!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mableal[0m ([33mwandb-smle[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
## Mod data to fit model
##### Drop "Names"
##### Drop "Cabin"
##### Drop "PassengerId"
##### Drop "Ticket"
##### Drop "Embarked"
##### Numericalize "Sex"
##### BUcketize "Age" and fillna
##### Generate train/valid/test splits

from sklearn.model_selection import train_test_split


train_csv = pd.read_csv("./titanic/train.csv", index_col='PassengerId')
test_csv = pd.read_csv("./titanic/test.csv", index_col='PassengerId')
train_csv = train_csv.drop('Name', axis=1)
train_csv = train_csv.drop('Cabin', axis=1)
train_csv = train_csv.drop('Ticket', axis=1)
train_csv = train_csv.drop('Embarked', axis=1)
train_csv['Sex'] = train_csv['Sex'].astype('category')
train_csv['Sex'] = train_csv['Sex'].cat.codes
bins= [0,18,40,60,100]
labels = [0,1,2,3]
train_csv['Age'] = train_csv['Age'].fillna(train_csv['Age'].mean())
train_csv['Age'] = pd.cut(train_csv['Age'], bins=bins, labels=labels, right=False)

train, test = train_test_split(train_csv, test_size=0.2)
train, valid = train_test_split(train, test_size=0.2)

train.describe()
### Now we have train, validation, and test splits

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare
count,569.0,569.0,569.0,569.0,569.0,569.0
mean,0.386643,2.332162,0.643234,0.56239,0.390158,32.81668
std,0.487409,0.830982,0.479466,1.176378,0.821568,51.304312
min,0.0,1.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,0.0,0.0,7.8958
50%,0.0,3.0,1.0,0.0,0.0,14.4583
75%,1.0,3.0,1.0,1.0,0.0,31.275
max,1.0,3.0,1.0,8.0,5.0,512.3292


In [4]:
#Initialize this run
config = { 'batchSize': 64, 'num_epochs': 100, 'lr': 0.01}

run = wandb.init(entity="wandb-smle",
        project="aleal-kaggle-titanic", save_code=True,
                 group="debug", force=True, config=config)

[34m[1mwandb[0m: Currently logged in as: [33mableal[0m ([33mwandb-smle[0m). Use [1m`wandb login --relogin`[0m to force relogin
fatal: ambiguous argument 'HEAD': unknown revision or path not in the working tree.
Use '--' to separate paths from revisions, like this:
'git <command> [<revision>...] -- [<file>...]'


In [5]:
#save source data to W&B
test_table = wandb.Table(dataframe=test_csv)
train_table = wandb.Table(dataframe=train_csv)
data_art = wandb.Artifact(name="titatinc_artifacts", type="dataset")
data_art.add_file("./titanic/train.csv")
data_art.add_file("./titanic/test.csv")
data_art.add(train_table, "train_table")
data_art.add(test_table, "test_table")

run.log_artifact(data_art)

<Artifact titatinc_artifacts>

In [6]:
import os
import tensorflow as tf
import numpy

class CustomTitanicDataset(Dataset):
    def __init__(self, df, transform=None, target_transform=None):
        self.data: pd.DataFrame
        self.data = df
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        row = self.data.iloc[idx]
            
        datapoint = torch.tensor(row.iloc[1:7].values, dtype=torch.float32)
        label = torch.tensor(row.iloc[0], dtype=torch.float32)
        
        if self.transform:
            dp = self.transform(datapoint)
        if self.target_transform:
            label = self.target_transform(label)

        return datapoint, label

import torch.nn as nn
import torch.utils.data

device=torch.device("cpu")

class binaryModel(nn.Module):
    def __init__(self):
        super(binaryModel, self).__init__()
        self.hidden = nn.Linear(6, 100)
        self.relu = nn.ReLU()
        self.output = nn.Linear(100, 1)
        self.sigmoid = nn.Sigmoid()
 
    def forward(self, x):
        x = self.hidden(x)
        x = self.relu(x)
        x = self.output(x)
        x = self.sigmoid(x)
        return x

model = binaryModel()

model.to(device)
run.watch(model)


In [7]:
## Trainer
import torch.utils.data
from torchvision import transforms


train_ds = CustomTitanicDataset(train)
train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=run.config.batchSize, shuffle=True)

valid_ds = CustomTitanicDataset(valid)
valid_dataloader = torch.utils.data.DataLoader(valid_ds, batch_size=run.config.batchSize, shuffle=True)

test_ds = CustomTitanicDataset(test)
test_dataloader = torch.utils.data.DataLoader(test_ds)


In [8]:
## Training and Validation Loop

loss_fn = nn.BCELoss()
loss_fn = loss_fn.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=run.config.lr)

for epoch in range(run.config.num_epochs):
    train_total_correct = 0
    train_total_samples = 0
    train_running_loss = 0

    valid_total_correct = 0
    valid_total_samples = 0
    valid_running_loss = 0
    
    model.train()
    for idx, (data, label) in enumerate(train_dataloader):
        data.to(device)
        label.to(device)
        
        output = model(data)
        loss = loss_fn(output, label.unsqueeze(1))
        optimizer.zero_grad()
        _, predicted = torch.max(output, 1)
        loss.backward()
        optimizer.step()
        
        train_total_correct += (predicted == label).sum().item()
        train_total_samples += label.size(0)
        train_running_loss += loss.item()

    model.eval()
    with torch.no_grad():
        for idx, (data, label) in enumerate(valid_dataloader):
            data.to(device)
            label.to(device)
            
            output = model(data)
            loss = loss_fn(output, label.unsqueeze(1))
            _, predicted = torch.max(output, 1)
            
            valid_total_correct += (predicted == label).sum().item()
            valid_total_samples += label.size(0)
            valid_running_loss += loss.item()

    train_accuracy = train_total_correct / train_total_samples
    train_loss = train_running_loss / len(train_ds)
    valid_accuracy = valid_total_correct / valid_total_samples
    valid_loss = valid_running_loss / len(valid_ds)
    run.log({"train_acc": train_accuracy, "train_loss": train_loss}, step=epoch)
    run.log({"valid_acc": valid_accuracy, "valid_loss": valid_loss}, step=epoch)


In [9]:
test_output_table = wandb.Table(columns=["Id", "In_Pclass", "In_Sex", "In_SibSp", "In_Parch", 
                                         "In_Fare", "Prediction", "Ground_Truth", "Pred_Probability"])

total_correct = 0
total_tests = 0

for idx, (data, label) in enumerate(test_dataloader):
    data.to(device)
    label.to(device)

    output = model(data)
    prediction = output.squeeze(dim=0).item()
    total_tests += 1

    num_pred = 0
    prob = 0

    if (prediction > 0.5):
        num_pred = 1
        prob = prediction
    else:
        prob = 1-prediction

    curr_row = test.iloc[idx]
    test_output_table.add_data(idx, curr_row['Pclass'], curr_row['Sex'], curr_row['SibSp']
                              , curr_row['Parch'], curr_row['Fare'], num_pred, curr_row['Survived'], prob)

    if (num_pred == curr_row['Survived']):
        total_correct += 1
    
run.log({"test_acc": total_correct / total_tests})

    

In [10]:
run.log({"predictions_table": test_output_table})
run.finish()

VBox(children=(Label(value='0.076 MB of 0.076 MB uploaded (0.020 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
test_acc,▁
train_acc,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_acc,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
test_acc,0.79888
train_acc,0.61336
train_loss,0.00665
valid_acc,0.63636
valid_loss,0.00915
