In [2]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader

### Improved Neural Network
The following model is the base class for all the other models I created.
It provides the following methods:
##### compile
Provides essential parameters for training the model.
###### Parameters:
- loss_function
- optimizer
- batch_size
- device - device on which model is trained and makes predictions (using cuda for faster computations is highly recommended)

##### train_on_data
Trains the model. After the training is completed, it prints a "Done." message.
###### Parameters:
- train_dataset - PyTorch's Dataset for model training
- test_dataset - PyTorch's Dataset for model validation
- n_epochs - number of epochs to train for
- print_step - the validation loss is printed once every print_step epochs

##### predict
Makes prediction for given data.
###### Parameters:
- test_dataset - tensor of input data for which the model should make predictions

In [3]:
class ImprovedNeuralNetwork(nn.Module):
    """Base network bundling training, validation and prediction helpers.

    Subclasses define ``forward``. ``compile`` must be called before
    ``train_on_data`` or ``predict`` because it stores the loss function,
    optimizer, batch size and device those methods read from ``self``.
    """

    def __train_loop(self, dataloader):
        """Run one optimisation step for every batch yielded by ``dataloader``."""
        self.train()
        for X, y in dataloader:
            X, y = X.to(self.device), y.to(self.device)
            predictions = self(X)
            # The targets get an extra axis at position 1 before being
            # compared with the predictions.
            loss = self.loss_function(
                predictions,
                torch.unsqueeze(y, 1)
            )
            # Start from clean gradients so nothing left over from an earlier
            # backward pass leaks into this update.
            self.optimizer.zero_grad()
            # Backpropagation
            loss.backward()
            self.optimizer.step()

    def __test_loop(self, dataloader):
        """Return the loss averaged over the batches of ``dataloader``."""
        self.eval()
        test_loss = 0
        with torch.no_grad():
            for X, y in dataloader:
                X, y = X.to(self.device), y.to(self.device)
                predictions = self(X)
                test_loss += self.loss_function(
                    predictions,
                    torch.unsqueeze(y, 1)
                )
        # Dividing test_loss by number of batches
        test_loss /= len(dataloader)
        return test_loss

    def compile(self, loss_function, optimizer, batch_size, device = 'cpu'):
        """Store the training configuration and move the model to ``device``.

        Parameters:
        - loss_function - callable invoked as ``loss_function(predictions, targets)``
        - optimizer - optimizer whose ``step`` / ``zero_grad`` are called per batch
        - batch_size - batch size used for the DataLoaders built in ``train_on_data``
        - device - device used for training and prediction; defaults to ``'cpu'``
          so that callers which do not pass a device keep working

        NOTE(review): this method shares its name with ``torch.nn.Module.compile``
        available in recent PyTorch releases and therefore hides it - confirm
        that this is intended.
        """
        self.loss_function = loss_function
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.device = device
        self.to(device)

    def train_on_data(self, train_dataset, test_dataset, n_epochs, print_step = 1):
        """Train for ``n_epochs`` epochs, printing the validation loss.

        Parameters:
        - train_dataset - dataset used for the optimisation steps
        - test_dataset - dataset used to compute the printed loss
        - n_epochs - number of passes over ``train_dataset``
        - print_step - the loss is printed every ``print_step`` epochs

        The model is moved to ``self.device`` for training and back to the CPU
        once training has finished.
        """
        train_dataloader = DataLoader(
            train_dataset,
            batch_size = self.batch_size,
            shuffle = True
        )
        # Validation order is irrelevant to the optimisation; keeping it fixed
        # makes the reported loss repeatable for a given model state.
        test_dataloader = DataLoader(
            test_dataset,
            batch_size = self.batch_size,
            shuffle = False
        )

        self.to(self.device)
        for i in range(n_epochs):
            self.__train_loop(train_dataloader)
            test_loss = self.__test_loop(test_dataloader)
            if (i + 1) % print_step == 0:
                print("Epoch", i, ", loss:", test_loss)
        print("Done.")
        self.to('cpu')

    def predict(self, test_dataset):
        """Return the model output for ``test_dataset`` without tracking gradients.

        ``test_dataset`` is passed straight to ``forward``, so it has to be a
        tensor. It is moved to ``self.device`` first, because the model is moved
        there as well; the returned predictions live on ``self.device``. The
        model itself is moved back to the CPU afterwards.
        """
        self.to(self.device)
        # Inference must not run with training-time layer behaviour.
        self.eval()
        with torch.no_grad():
            predictions = self(test_dataset.to(self.device))
        self.to('cpu')
        return predictions

### Simplified Neural Network
The following model simplifies the process of creating a neural network. To create the model, one has to provide the following parameters:
- input_shape - the input's shape
- layer_sizes - list with the number of neurons in each layer
The class then creates a series of linear layers connected by PyTorch's GELU activation function.
I used GELU instead of ReLU because it has been shown to yield better results without increasing the computation time significantly (it also has a nice probabilistic interpretation).

In [4]:
class SimplifiedNeuralNetwork(ImprovedNeuralNetwork):
    """Stack of linear layers with a GELU activation between consecutive layers."""

    def __init__(self, input_shape, layer_sizes):
        """Build the stack.

        Parameters:
        - input_shape - number of input features of the first linear layer
        - layer_sizes - output width of every linear layer, in order
        """
        super().__init__()
        self.flatten = nn.Flatten()
        # The first layer maps the input onto the first requested width ...
        layers = [nn.Linear(input_shape, layer_sizes[0])]
        # ... and every later layer is preceded by an activation.
        for fan_in, fan_out in zip(layer_sizes, layer_sizes[1:]):
            layers.append(nn.GELU())
            layers.append(nn.Linear(fan_in, fan_out))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` and pass it through the layer stack."""
        flattened = self.flatten(x)
        return self.linear_relu_stack(flattened)

### BinaryRegressor
This is basically SimplifiedNeuralNetwork, but after passing the data through the neural network, it additionally passes the result through PyTorch's sigmoid layer, so that each number it produces is in the range [0, 1].

In [6]:
class BinaryRegressor(SimplifiedNeuralNetwork):
    """SimplifiedNeuralNetwork whose output is passed through a sigmoid layer."""

    def __init__(self, input_shape, layer_sizes):
        """Create the layer stack and the final sigmoid layer."""
        super().__init__(input_shape, layer_sizes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the parent network's output."""
        return self.sigmoid(super().forward(x))

### Usage example

Below I present an example of how to use the BinaryRegressor class (the presentation encompasses all methods that were added by me in other classes as well).
For the presentation I will use data from Titanic Competition that was held on kaggle.com.

In [5]:
import random
import pandas as pd
import numpy as np

In [6]:
# Load both CSV files and use the passenger id as the row index.
train_data = pd.read_csv('train.csv').set_index('PassengerId')
test_data = pd.read_csv('test.csv').set_index('PassengerId')

train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Number of rows in the training frame.
len(train_data)

891

In [8]:
# Maps a family name to [number of survivors, number of members]; it is filled
# by feature_engineer(..., calculate_family_stats=True) and read on every call.
family_dict = {}


def _extract_title(tokens):
    """Return the first token ending in '.', else the second token, else NaN."""
    if not isinstance(tokens, list):
        return np.nan
    for token in tokens:
        if token.endswith('.'):
            return token
    return tokens[1] if len(tokens) > 1 else np.nan


def feature_engineer(dataframe, calculate_family_stats = False):
    """Add the engineered feature columns to ``dataframe`` in place.

    Added columns: ``LogFare``, ``ScaledAge``, ``SecondName`` (first
    whitespace-separated token of ``Name``), ``Position`` (the title token),
    ``FamilySurvivalRate`` and ``UnknownFamily``.

    Parameters:
    - dataframe - frame with ``Name``, ``Fare`` and ``Age`` columns, plus
      ``Survived`` when ``calculate_family_stats`` is True
    - calculate_family_stats - when True, the entries of ``family_dict`` for
      the families in ``dataframe`` are recomputed from its ``Survived`` column

    Returns nothing; ``dataframe`` and the module-level ``family_dict`` are
    modified.

    NOTE: with ``calculate_family_stats`` set, every row's own ``Survived``
    value is part of the rate written back to that row.
    """
    dataframe['LogFare'] = np.log(dataframe['Fare'] + 1)
    dataframe['ScaledAge'] = dataframe['Age'] / 100

    name_tokens = dataframe['Name'].str.split()
    dataframe['SecondName'] = name_tokens.str[0]
    # Take the FIRST token ending in a period: picking the last one lets a
    # middle initial such as "L." replace the actual title.
    dataframe['Position'] = name_tokens.map(_extract_title)

    #Setting average survival rate for a family
    if calculate_family_stats:
        for family in dataframe['SecondName']:
            family_dict[family] = [0, 0]
        for family, survived in zip(dataframe['SecondName'], dataframe['Survived']):
            stats = family_dict[family]
            stats[0] += survived
            stats[1] += 1

    # Families without statistics get the neutral rate 0.5 and are flagged.
    families = list(dataframe['SecondName'])
    known = np.array([family in family_dict for family in families], dtype = bool)
    rates = [
        family_dict[family][0] / family_dict[family][1] if is_known else 0.5
        for family, is_known in zip(families, known)
    ]
    dataframe['FamilySurvivalRate'] = np.asarray(rates, dtype = float)
    dataframe['UnknownFamily'] = (~known).astype('int64')

In [9]:
# The training frame also (re)computes the family statistics; the test frame
# only reads them.
feature_engineer(train_data, calculate_family_stats = True)
feature_engineer(test_data, calculate_family_stats = False)
test_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,LogFare,ScaledAge,SecondName,Position,FamilySurvivalRate,UnknownFamily
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,2.178064,0.345,"Kelly,",Mr.,0.75,0
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,2.079442,0.47,"Wilkes,",Mrs.,0.5,1
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,2.369075,0.62,"Myles,",Mr.,0.5,1
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,2.268252,0.27,"Wirz,",Mr.,0.5,1
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,2.586824,0.22,"Hirvonen,",Mrs.,1.0,0


In [10]:
# Drop rows that have no label.
train_data.dropna(subset = ['Survived'], axis = 'index', inplace = True)
y = train_data['Survived']
X_full = train_data.drop(columns = ['Survived'])
X_test = test_data
# Names of the feature columns that contain at least one missing value.
has_missing = X_full.isnull().any()
missing_data = has_missing[has_missing].index.tolist()
print(missing_data)
# Count cabin labels that occur more than once versus only once.
cabin_is_repeated = X_full['Cabin'].value_counts() > 1
print(cabin_is_repeated.value_counts())

['Age', 'Cabin', 'Embarked', 'ScaledAge']
count
False    101
True      46
Name: count, dtype: int64


In [11]:
def data_split(dataframe):
    """Split ``dataframe`` into its features and its ``Survived`` column.

    Returns a ``(X, y)`` tuple: ``X`` is ``dataframe`` without the
    ``Survived`` column and ``y`` is that column.
    """
    target = dataframe['Survived']
    features = dataframe.drop(columns = ['Survived'])
    return features, target

In [12]:
number_of_families = train_data.SecondName.nunique()
print(number_of_families)
family_list = list(train_data.SecondName.unique())
print(family_list[0:5])

# Passengers whose family name occurs exactly once. Their own label is the only
# source of their family rate, so they are reset to the "unknown family" values.
# The family sizes are computed once instead of re-scanning the column per row.
family_sizes = train_data.SecondName.map(train_data.SecondName.value_counts())
is_single = family_sizes == 1
one_person_families = train_data.index[is_single.to_numpy()].tolist()
train_data.loc[is_single, 'UnknownFamily'] = 1
train_data.loc[is_single, 'FamilySurvivalRate'] = 0.5

# A dedicated, seeded generator makes the split repeatable after a kernel
# restart without touching the global random state.
split_rng = random.Random(42)
# Never request more validation rows than there are single-person families.
valid_size = min(train_data.shape[0] // 5, len(one_person_families))
random_subset = split_rng.sample(range(len(one_person_families)), valid_size)
valid_indexes = [one_person_families[x] for x in random_subset]
valid = train_data.loc[valid_indexes, :]
X_full = train_data.drop(valid.index)
print(valid.shape, X_full.shape)

661
['Braund,', 'Cumings,', 'Heikkinen,', 'Futrelle,', 'Allen,']
(178, 17) (713, 17)


In [13]:
numerical_features = ['Pclass', 'Parch', 'LogFare', 'FamilySurvivalRate', 'UnknownFamily']
# Currently unused candidates: 'Sex', 'Embarked', 'Position'
categorical_features = []
features = [*numerical_features, *categorical_features]
# The labelled frames keep the target column next to the features.
labelled_columns = features + ['Survived']

X_full = X_full[labelled_columns]
X_test = X_test[features]
valid = valid[labelled_columns]
print(X_full.shape)
X_train, y_train = data_split(X_full)
X_valid, y_valid = data_split(valid)
print(X_train.shape)

(713, 6)
(713, 5)


In [14]:
# The number of feature columns is the width of the network input.
input_shape = len(X_train.columns)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(713, 5) (178, 5) (713,) (178,)


In [15]:
# Show the first five training rows.
X_train.iloc[:5]

Unnamed: 0_level_0,Pclass,Parch,LogFare,FamilySurvivalRate,UnknownFamily
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3,0,2.110213,0.0,0
2,1,0,4.280593,0.5,1
3,3,0,2.188856,0.5,1
4,1,0,3.990834,0.5,0
5,3,0,2.202765,0.5,0


In [85]:
class BinaryClassifier(SimplifiedNeuralNetwork):
    """SimplifiedNeuralNetwork followed by a sigmoid layer."""

    def __init__(self, input_shape, layer_sizes):
        """Build the parent layer stack and add the sigmoid layer."""
        super().__init__(input_shape, layer_sizes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply the parent network, then the sigmoid layer."""
        raw_output = super().forward(x)
        probabilities = self.sigmoid(raw_output)
        return probabilities

In [90]:
# compile() needs a device; use the GPU when PyTorch reports one, else the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = BinaryClassifier(input_shape, [512, 256, 128, 64, 1])
model.compile(
    loss_function = nn.BCELoss(), 
    optimizer = torch.optim.SGD(model.parameters(), lr = 0.3),
    batch_size = 64,
    device = device
)

In [91]:
from torch.utils.data import TensorDataset

# Wrap the feature and label frames as float tensors, one dataset per split.
train_features = torch.tensor(X_train.to_numpy(), dtype = torch.float32)
train_labels = torch.tensor(y_train.to_numpy(), dtype = torch.float32)
train_dataset = TensorDataset(train_features, train_labels)

valid_features = torch.tensor(X_valid.to_numpy(), dtype = torch.float32)
valid_labels = torch.tensor(y_valid.to_numpy(), dtype = torch.float32)
valid_dataset = TensorDataset(valid_features, valid_labels)

In [111]:
# Train for eight epochs; the validation loss is printed after each one.
model.train_on_data(
    train_dataset = train_dataset,
    test_dataset = valid_dataset,
    n_epochs = 8
)

Epoch 0 , loss: tensor(0.5686)
Epoch 1 , loss: tensor(0.6005)
Epoch 2 , loss: tensor(0.7812)
Epoch 3 , loss: tensor(0.6413)
Epoch 4 , loss: tensor(0.5527)
Epoch 5 , loss: tensor(0.5567)
Epoch 6 , loss: tensor(0.6149)
Epoch 7 , loss: tensor(0.6859)
Done.


In [105]:
# NOTE(review): a NaN anywhere in X_test would produce a NaN probability,
# which fails the ">= 0.5" test and is silently labelled 0 - confirm that
# the test features contain no missing values.
model.eval()
# Call the module itself rather than .forward(), and skip gradient tracking
# since nothing is being trained here.
with torch.no_grad():
    probabilities = model(torch.tensor(X_test.values, dtype = torch.float))
numpy_probabilities = probabilities.squeeze(1).numpy()
# Threshold at 0.5; the result is a plain list of 0/1 Python ints.
predictions = (numpy_probabilities >= 0.5).astype(int).tolist()

In [107]:
# One row per test passenger: its id and the predicted label.
submission_columns = {
    'PassengerId': test_data.index,
    'Survived': predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index = False)