# Kaggle Titanic Dataset

In [76]:
from __future__ import print_function
import pandas as pd
import numpy as np
import torch
import os

## Data import

In [77]:
# Locate the CSV under <current working directory>/data and load it into a
# pandas DataFrame. The path depends on the directory the kernel was started in.
data_dir = os.path.join(os.getcwd(), 'data')
train_csv_fp = os.path.join(data_dir, 'train.csv')
raw_data = pd.read_csv(train_csv_fp)

## Data Analysis

In [78]:
# Number of rows whose 'Cabin' entry is missing.
raw_data['Cabin'].isnull().sum()

687

In [79]:
# Preview the last five rows of the raw frame.
raw_data.tail(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [80]:
# Drop the columns that are not used as features, then fill missing 'Age'
# values with the mean age.
# The frame is reassigned instead of mutated in place, and errors='ignore'
# skips columns that are already gone, so re-running this cell no longer
# raises KeyError.
raw_data = raw_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
                         errors='ignore')
raw_data = raw_data.fillna({'Age': raw_data['Age'].mean()})

In [81]:
raw_data.describe() # Age count equals the row count after mean imputation, i.e. no NaN left in Age

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.9104
50%,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [82]:
# Inspect the column dtypes to see which columns still need numeric encoding.
raw_data.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

In [83]:
# Binary encoding: map the two 'Sex' labels onto 0 / 1.
cleanup_arr = {'Sex': dict(male=0, female=1)}
raw_data.replace(to_replace=cleanup_arr, inplace=True)

# One-hot encoding: expand 'Embarked' into one indicator column per value,
# each named 'Embarked_<value>'.
raw_data = pd.get_dummies(data=raw_data, prefix=['Embarked'], columns=['Embarked'])

######### END OF DATA PREPROCESSING ##########

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.000000,1,0,7.2500,0,0,1
1,1,1,1,38.000000,1,0,71.2833,1,0,0
2,1,3,1,26.000000,0,0,7.9250,0,0,1
3,1,1,1,35.000000,1,0,53.1000,0,0,1
4,0,3,0,35.000000,0,0,8.0500,0,0,1
5,0,3,0,29.699118,0,0,8.4583,0,1,0
6,0,1,0,54.000000,0,0,51.8625,0,0,1
7,0,3,0,2.000000,3,1,21.0750,0,0,1
8,1,3,1,27.000000,0,2,11.1333,0,0,1
9,1,2,1,14.000000,1,0,30.0708,1,0,0


## PyTorch set up

In [88]:
from sklearn.model_selection import train_test_split

def split_stratified_into_train_val_test(df_input, stratify_colname='y',
                                         frac_train=0.6, frac_val=0.15, frac_test=0.25,
                                         random_state=None):
    '''
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0 (checked with a small tolerance for floating-point rounding).
    random_state : int, None, or RandomStateInstance
        Value to be passed to both train_test_split() calls.

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits. Each one keeps all columns of
        df_input, including the stratification column.

    Raises
    ------
    ValueError
        If the fractions do not sum to 1.0, or if stratify_colname is not a
        column of df_input.
    '''

    # Decimal fractions are generally not exactly representable as binary
    # floats (0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999), so compare the
    # sum against 1.0 with a tolerance instead of using exact equality.
    tolerance = 1e-9
    if abs((frac_train + frac_val + frac_test) - 1.0) > tolerance:
        raise ValueError('fractions %f, %f, %f do not add up to 1.0' % \
                         (frac_train, frac_val, frac_test))

    if stratify_colname not in df_input.columns:
        raise ValueError('%s is not a column in the dataframe' % (stratify_colname))

    X = df_input # Contains all columns.
    y = df_input[[stratify_colname]] # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(X,
                                                          y,
                                                          stratify=y,
                                                          test_size=(1.0 - frac_train),
                                                          random_state=random_state)

    # Split the temp dataframe into val and test dataframes. The test share is
    # re-expressed relative to the temp set, which holds only val + test rows.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(df_temp,
                                                      y_temp,
                                                      stratify=y_temp,
                                                      test_size=relative_frac_test,
                                                      random_state=random_state)

    # Sanity check: every input row ended up in exactly one of the splits.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test

In [91]:
# Split the preprocessed data into train / validation / test sets (70/15/15),
# stratified on the 'Survived' label.
# A fixed seed makes the split reproducible across kernel restarts; without
# it every run produces different subsets.
SPLIT_SEED = 42
df_train, df_val, df_test = split_stratified_into_train_val_test(
    raw_data,
    stratify_colname='Survived',
    frac_train=0.7, frac_val=0.15, frac_test=0.15,
    random_state=SPLIT_SEED)

# Report the number of rows in each split.
print(len(df_train), len(df_val), len(df_test))


623 134 134


In [95]:
# Pick the compute device once: CUDA when available, otherwise CPU.
# Binding `device` unconditionally keeps it defined on CPU-only machines,
# where it was previously never assigned.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Smoke test of the GPU path; skipped when no CUDA device is present.
x = torch.randn(1)
if device.type == "cuda":
    y = torch.ones_like(x, device=device)  # directly create a tensor on GPU
    x = x.to(device)                       # or just use strings ``.to("cuda")``
    z = x + y
    print(z)
    print(z.to("cpu", torch.double))       # ``.to`` can also change dtype together!

tensor([1.7963], device='cuda:0')
tensor([1.7963], dtype=torch.float64)


In [101]:
# Separate every split into features (all columns after the first) and
# label (the first column, kept as a one-column dataframe).
(df_train_x, df_train_y), (df_val_x, df_val_y), (df_test_x, df_test_y) = (
    (subset.iloc[:, 1:], subset.iloc[:, :1])
    for subset in (df_train, df_val, df_test)
)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
129,3,0,45.000000,0,0,6.9750,0,0,1
875,3,1,15.000000,0,0,7.2250,1,0,0
764,3,0,16.000000,0,0,7.7750,0,0,1
364,3,0,29.699118,1,0,15.5000,0,1,0
104,3,0,37.000000,2,0,7.9250,0,0,1
198,3,1,29.699118,0,0,7.7500,0,1,0
862,1,1,48.000000,0,0,25.9292,0,0,1
709,3,0,29.699118,1,1,15.2458,1,0,0
348,3,0,3.000000,1,1,15.9000,0,0,1
886,2,0,27.000000,0,0,13.0000,0,0,1


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small fully connected network for flat feature vectors.

    Three linear layers (n_features -> 64 -> 128 -> n_classes) with ReLU after
    the first two. The last layer returns raw scores; no softmax is applied.

    Parameters
    ----------
    n_features : int
        Number of input values per sample. Defaults to 9.
    n_classes : int
        Number of output scores per sample. Defaults to 2.
    """

    def __init__(self, n_features=9, n_classes=2):
        super(Net, self).__init__()
        # Affine operations: y = Wx + b
        self.fc1 = nn.Linear(n_features, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, n_classes)

    def forward(self, x):
        """Return the raw output scores for a batch of samples.

        The previous version called self.conv1 / self.conv2, which are never
        created in __init__, so every forward pass raised AttributeError. The
        convolution and pooling steps are removed; only the linear layers run.
        """
        # Collapse all non-batch dimensions into one feature axis.
        x = x.reshape(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the product of all dimensions of x except the first (batch) one."""
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

# NOTES
# torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, 
#                 padding=0, dilation=1, groups=1, bias=True, 
#                 padding_mode='zeros')